clone3: allow spawning processes into cgroups
This adds support for creating a process in a different cgroup than its parent. Callers can limit and account processes and threads right from the moment they are spawned: - A service manager can directly spawn new services into dedicated cgroups. - A process can be directly created in a frozen cgroup and will be frozen as well. - The initial accounting jitter experienced by process supervisors and daemons is eliminated with this. - Threaded applications or even thread implementations can choose to create a specific cgroup layout where each thread is spawned directly into a dedicated cgroup. This feature is limited to the unified hierarchy. Callers need to pass a directory file descriptor for the target cgroup. The caller can choose to pass an O_PATH file descriptor. All usual migration restrictions apply, i.e. there can be no processes in inner nodes. In general, creating a process directly in a target cgroup adheres to all migration restrictions. One of the biggest advantages of this feature is that CLONE_INTO_GROUP does not need to grab the write side of the cgroup cgroup_threadgroup_rwsem. This global lock makes moving tasks/threads around super expensive. With clone3() this lock is avoided. Cc: Tejun Heo <tj@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Li Zefan <lizefan@huawei.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: cgroups@vger.kernel.org Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com> Signed-off-by: Tejun Heo <tj@kernel.org>
This commit is contained in:

committed by
Tejun Heo

parent
f3553220d4
commit
ef2c41cf38
@@ -2180,7 +2180,7 @@ static __latent_entropy struct task_struct *copy_process(
|
||||
* between here and cgroup_post_fork() if an organisation operation is in
|
||||
* progress.
|
||||
*/
|
||||
retval = cgroup_can_fork(p);
|
||||
retval = cgroup_can_fork(p, args);
|
||||
if (retval)
|
||||
goto bad_fork_put_pidfd;
|
||||
|
||||
@@ -2287,7 +2287,7 @@ static __latent_entropy struct task_struct *copy_process(
|
||||
write_unlock_irq(&tasklist_lock);
|
||||
|
||||
proc_fork_connector(p);
|
||||
cgroup_post_fork(p);
|
||||
cgroup_post_fork(p, args);
|
||||
perf_event_fork(p);
|
||||
|
||||
trace_task_newtask(p, clone_flags);
|
||||
@@ -2298,7 +2298,7 @@ static __latent_entropy struct task_struct *copy_process(
|
||||
bad_fork_cancel_cgroup:
|
||||
spin_unlock(¤t->sighand->siglock);
|
||||
write_unlock_irq(&tasklist_lock);
|
||||
cgroup_cancel_fork(p);
|
||||
cgroup_cancel_fork(p, args);
|
||||
bad_fork_put_pidfd:
|
||||
if (clone_flags & CLONE_PIDFD) {
|
||||
fput(pidfile);
|
||||
@@ -2627,6 +2627,9 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
|
||||
!valid_signal(args.exit_signal)))
|
||||
return -EINVAL;
|
||||
|
||||
if ((args.flags & CLONE_INTO_CGROUP) && args.cgroup < 0)
|
||||
return -EINVAL;
|
||||
|
||||
*kargs = (struct kernel_clone_args){
|
||||
.flags = args.flags,
|
||||
.pidfd = u64_to_user_ptr(args.pidfd),
|
||||
@@ -2637,6 +2640,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
|
||||
.stack_size = args.stack_size,
|
||||
.tls = args.tls,
|
||||
.set_tid_size = args.set_tid_size,
|
||||
.cgroup = args.cgroup,
|
||||
};
|
||||
|
||||
if (args.set_tid &&
|
||||
@@ -2680,7 +2684,8 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
|
||||
static bool clone3_args_valid(struct kernel_clone_args *kargs)
|
||||
{
|
||||
/* Verify that no unknown flags are passed along. */
|
||||
if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND))
|
||||
if (kargs->flags &
|
||||
~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
|
||||
return false;
|
||||
|
||||
/*
|
||||
|
Reference in New Issue
Block a user