clone3: allow spawning processes into cgroups
This adds support for creating a process in a different cgroup than its parent. Callers can limit and account processes and threads right from the moment they are spawned: - A service manager can directly spawn new services into dedicated cgroups. - A process can be directly created in a frozen cgroup and will be frozen as well. - The initial accounting jitter experienced by process supervisors and daemons is eliminated with this. - Threaded applications or even thread implementations can choose to create a specific cgroup layout where each thread is spawned directly into a dedicated cgroup. This feature is limited to the unified hierarchy. Callers need to pass a directory file descriptor for the target cgroup. The caller can choose to pass an O_PATH file descriptor. All usual migration restrictions apply, i.e. there can be no processes in inner nodes. In general, creating a process directly in a target cgroup adheres to all migration restrictions. One of the biggest advantages of this feature is that CLONE_INTO_GROUP does not need to grab the write side of the cgroup cgroup_threadgroup_rwsem. This global lock makes moving tasks/threads around super expensive. With clone3() this lock is avoided. Cc: Tejun Heo <tj@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Li Zefan <lizefan@huawei.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: cgroups@vger.kernel.org Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com> Signed-off-by: Tejun Heo <tj@kernel.org>
This commit is contained in:

committed by
Tejun Heo

parent
f3553220d4
commit
ef2c41cf38
@@ -35,6 +35,7 @@
|
||||
|
||||
/* Flags for the clone3() syscall. */
|
||||
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
|
||||
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
|
||||
|
||||
/*
|
||||
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3
|
||||
@@ -81,6 +82,8 @@
|
||||
* @set_tid_size: This defines the size of the array referenced
|
||||
* in @set_tid. This cannot be larger than the
|
||||
* kernel's limit of nested PID namespaces.
|
||||
* @cgroup: If CLONE_INTO_CGROUP is specified set this to
|
||||
* a file descriptor for the cgroup.
|
||||
*
|
||||
* The structure is versioned by size and thus extensible.
|
||||
* New struct members must go at the end of the struct and
|
||||
@@ -97,11 +100,13 @@ struct clone_args {
|
||||
__aligned_u64 tls;
|
||||
__aligned_u64 set_tid;
|
||||
__aligned_u64 set_tid_size;
|
||||
__aligned_u64 cgroup;
|
||||
};
|
||||
#endif
|
||||
|
||||
#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
|
||||
#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
|
||||
#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
|
||||
|
||||
/*
|
||||
* Scheduling policies
|
||||
|
Reference in New Issue
Block a user