clone3: allow spawning processes into cgroups
This adds support for creating a process in a different cgroup than its parent. Callers can limit and account processes and threads right from the moment they are spawned: - A service manager can directly spawn new services into dedicated cgroups. - A process can be directly created in a frozen cgroup and will be frozen as well. - The initial accounting jitter experienced by process supervisors and daemons is eliminated with this. - Threaded applications or even thread implementations can choose to create a specific cgroup layout where each thread is spawned directly into a dedicated cgroup. This feature is limited to the unified hierarchy. Callers need to pass a directory file descriptor for the target cgroup. The caller can choose to pass an O_PATH file descriptor. All usual migration restrictions apply, i.e. there can be no processes in inner nodes. In general, creating a process directly in a target cgroup adheres to all migration restrictions. One of the biggest advantages of this feature is that CLONE_INTO_CGROUP does not need to grab the write side of the cgroup_threadgroup_rwsem. This global lock makes moving tasks/threads around super expensive. With clone3() this lock is avoided. Cc: Tejun Heo <tj@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Li Zefan <lizefan@huawei.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: cgroups@vger.kernel.org Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com> Signed-off-by: Tejun Heo <tj@kernel.org>
This commit is contained in:

committed by
Tejun Heo

parent
f3553220d4
commit
ef2c41cf38
@@ -5881,8 +5881,7 @@ out:
|
||||
* @child: pointer to task_struct of forking parent process.
|
||||
*
|
||||
* A task is associated with the init_css_set until cgroup_post_fork()
|
||||
* attaches it to the parent's css_set. Empty cg_list indicates that
|
||||
* @child isn't holding reference to its css_set.
|
||||
* attaches it to the target css_set.
|
||||
*/
|
||||
void cgroup_fork(struct task_struct *child)
|
||||
{
|
||||
@@ -5908,24 +5907,154 @@ static struct cgroup *cgroup_get_from_file(struct file *f)
|
||||
return cgrp;
|
||||
}
|
||||
|
||||
/**
|
||||
* cgroup_css_set_fork - find or create a css_set for a child process
|
||||
* @kargs: the arguments passed to create the child process
|
||||
*
|
||||
* This function finds or creates a new css_set which the child
|
||||
* process will be attached to in cgroup_post_fork(). By default,
|
||||
* the child process will be given the same css_set as its parent.
|
||||
*
|
||||
* If CLONE_INTO_CGROUP is specified this function will try to find an
|
||||
* existing css_set which includes the requested cgroup and if not create
|
||||
* a new css_set that the child will be attached to later. If this function
|
||||
* succeeds it will hold cgroup_threadgroup_rwsem on return. If
|
||||
* CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
|
||||
* before grabbing cgroup_threadgroup_rwsem and will hold a reference
|
||||
* to the target cgroup.
|
||||
*/
|
||||
static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
|
||||
__acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
|
||||
{
|
||||
int ret;
|
||||
struct cgroup *dst_cgrp = NULL;
|
||||
struct css_set *cset;
|
||||
struct super_block *sb;
|
||||
struct file *f;
|
||||
|
||||
if (kargs->flags & CLONE_INTO_CGROUP)
|
||||
mutex_lock(&cgroup_mutex);
|
||||
|
||||
cgroup_threadgroup_change_begin(current);
|
||||
|
||||
spin_lock_irq(&css_set_lock);
|
||||
cset = task_css_set(current);
|
||||
get_css_set(cset);
|
||||
spin_unlock_irq(&css_set_lock);
|
||||
|
||||
if (!(kargs->flags & CLONE_INTO_CGROUP)) {
|
||||
kargs->cset = cset;
|
||||
return 0;
|
||||
}
|
||||
|
||||
f = fget_raw(kargs->cgroup);
|
||||
if (!f) {
|
||||
ret = -EBADF;
|
||||
goto err;
|
||||
}
|
||||
sb = f->f_path.dentry->d_sb;
|
||||
|
||||
dst_cgrp = cgroup_get_from_file(f);
|
||||
if (IS_ERR(dst_cgrp)) {
|
||||
ret = PTR_ERR(dst_cgrp);
|
||||
dst_cgrp = NULL;
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (cgroup_is_dead(dst_cgrp)) {
|
||||
ret = -ENODEV;
|
||||
goto err;
|
||||
}
|
||||
|
||||
/*
|
||||
* Verify that the target cgroup is writable for us. This is
|
||||
* usually done by the vfs layer but since we're not going through
|
||||
* the vfs layer here we need to do it "manually".
|
||||
*/
|
||||
ret = cgroup_may_write(dst_cgrp, sb);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
|
||||
!(kargs->flags & CLONE_THREAD));
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
kargs->cset = find_css_set(cset, dst_cgrp);
|
||||
if (!kargs->cset) {
|
||||
ret = -ENOMEM;
|
||||
goto err;
|
||||
}
|
||||
|
||||
put_css_set(cset);
|
||||
fput(f);
|
||||
kargs->cgrp = dst_cgrp;
|
||||
return ret;
|
||||
|
||||
err:
|
||||
cgroup_threadgroup_change_end(current);
|
||||
mutex_unlock(&cgroup_mutex);
|
||||
if (f)
|
||||
fput(f);
|
||||
if (dst_cgrp)
|
||||
cgroup_put(dst_cgrp);
|
||||
put_css_set(cset);
|
||||
if (kargs->cset)
|
||||
put_css_set(kargs->cset);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* cgroup_css_set_put_fork - drop references we took during fork
|
||||
* @kargs: the arguments passed to create the child process
|
||||
*
|
||||
* Drop references to the prepared css_set and target cgroup if
|
||||
* CLONE_INTO_CGROUP was requested.
|
||||
*/
|
||||
static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
|
||||
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
|
||||
{
|
||||
cgroup_threadgroup_change_end(current);
|
||||
|
||||
if (kargs->flags & CLONE_INTO_CGROUP) {
|
||||
struct cgroup *cgrp = kargs->cgrp;
|
||||
struct css_set *cset = kargs->cset;
|
||||
|
||||
mutex_unlock(&cgroup_mutex);
|
||||
|
||||
if (cset) {
|
||||
put_css_set(cset);
|
||||
kargs->cset = NULL;
|
||||
}
|
||||
|
||||
if (cgrp) {
|
||||
cgroup_put(cgrp);
|
||||
kargs->cgrp = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* cgroup_can_fork - called on a new task before the process is exposed
|
||||
* @child: the child process
|
||||
*
|
||||
* This prepares a new css_set for the child process which the child will
|
||||
* be attached to in cgroup_post_fork().
|
||||
* This calls the subsystem can_fork() callbacks. If a subsystem's can_fork()
|
||||
* callback returns an error, the fork aborts with that error code. This
|
||||
* allows for a cgroup subsystem to conditionally allow or deny new forks.
|
||||
*/
|
||||
int cgroup_can_fork(struct task_struct *child)
|
||||
__acquires(&cgroup_threadgroup_rwsem) __releases(&cgroup_threadgroup_rwsem)
|
||||
int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
|
||||
{
|
||||
struct cgroup_subsys *ss;
|
||||
int i, j, ret;
|
||||
|
||||
cgroup_threadgroup_change_begin(current);
|
||||
ret = cgroup_css_set_fork(kargs);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
do_each_subsys_mask(ss, i, have_canfork_callback) {
|
||||
ret = ss->can_fork(child);
|
||||
ret = ss->can_fork(child, kargs->cset);
|
||||
if (ret)
|
||||
goto out_revert;
|
||||
} while_each_subsys_mask();
|
||||
@@ -5937,32 +6066,34 @@ out_revert:
|
||||
if (j >= i)
|
||||
break;
|
||||
if (ss->cancel_fork)
|
||||
ss->cancel_fork(child);
|
||||
ss->cancel_fork(child, kargs->cset);
|
||||
}
|
||||
|
||||
cgroup_threadgroup_change_end(current);
|
||||
cgroup_css_set_put_fork(kargs);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
|
||||
* @child: the child process
|
||||
*
|
||||
* This calls the cancel_fork() callbacks if a fork failed *after*
|
||||
* cgroup_can_fork() succeeded.
|
||||
*/
|
||||
void cgroup_cancel_fork(struct task_struct *child)
|
||||
__releases(&cgroup_threadgroup_rwsem)
|
||||
* cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
|
||||
* @child: the child process
|
||||
* @kargs: the arguments passed to create the child process
|
||||
*
|
||||
* This calls the cancel_fork() callbacks if a fork failed *after*
|
||||
* cgroup_can_fork() succeeded and cleans up references we took to
|
||||
* prepare a new css_set for the child process in cgroup_can_fork().
|
||||
*/
|
||||
void cgroup_cancel_fork(struct task_struct *child,
|
||||
struct kernel_clone_args *kargs)
|
||||
{
|
||||
struct cgroup_subsys *ss;
|
||||
int i;
|
||||
|
||||
for_each_subsys(ss, i)
|
||||
if (ss->cancel_fork)
|
||||
ss->cancel_fork(child);
|
||||
ss->cancel_fork(child, kargs->cset);
|
||||
|
||||
cgroup_threadgroup_change_end(current);
|
||||
cgroup_css_set_put_fork(kargs);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -5972,22 +6103,27 @@ void cgroup_cancel_fork(struct task_struct *child)
|
||||
* Attach the child process to its css_set calling the subsystem fork()
|
||||
* callbacks.
|
||||
*/
|
||||
void cgroup_post_fork(struct task_struct *child)
|
||||
__releases(&cgroup_threadgroup_rwsem)
|
||||
void cgroup_post_fork(struct task_struct *child,
|
||||
struct kernel_clone_args *kargs)
|
||||
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
|
||||
{
|
||||
struct cgroup_subsys *ss;
|
||||
struct css_set *cset;
|
||||
int i;
|
||||
|
||||
cset = kargs->cset;
|
||||
kargs->cset = NULL;
|
||||
|
||||
spin_lock_irq(&css_set_lock);
|
||||
|
||||
/* init tasks are special, only link regular threads */
|
||||
if (likely(child->pid)) {
|
||||
WARN_ON_ONCE(!list_empty(&child->cg_list));
|
||||
cset = task_css_set(current); /* current is @child's parent */
|
||||
get_css_set(cset);
|
||||
cset->nr_tasks++;
|
||||
css_set_move_task(child, NULL, cset, false);
|
||||
} else {
|
||||
put_css_set(cset);
|
||||
cset = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -6020,7 +6156,16 @@ void cgroup_post_fork(struct task_struct *child)
|
||||
ss->fork(child);
|
||||
} while_each_subsys_mask();
|
||||
|
||||
cgroup_threadgroup_change_end(current);
|
||||
/* Make the new cset the root_cset of the new cgroup namespace. */
|
||||
if (kargs->flags & CLONE_NEWCGROUP) {
|
||||
struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
|
||||
|
||||
get_css_set(cset);
|
||||
child->nsproxy->cgroup_ns->root_cset = cset;
|
||||
put_css_set(rcset);
|
||||
}
|
||||
|
||||
cgroup_css_set_put_fork(kargs);
|
||||
}
|
||||
|
||||
/**
|
||||
|
Reference in New Issue
Block a user