clone3: allow spawning processes into cgroups

This adds support for creating a process in a different cgroup than its
parent. Callers can limit and account processes and threads right from
the moment they are spawned:
- A service manager can directly spawn new services into dedicated
  cgroups.
- A process can be directly created in a frozen cgroup and will be
  frozen as well.
- The initial accounting jitter experienced by process supervisors and
  daemons is eliminated with this.
- Threaded applications or even thread implementations can choose to
  create a specific cgroup layout where each thread is spawned
  directly into a dedicated cgroup.

This feature is limited to the unified hierarchy. Callers need to pass
a directory file descriptor for the target cgroup. The caller can
choose to pass an O_PATH file descriptor. All usual migration
restrictions apply, i.e. there can be no processes in inner nodes. In
general, creating a process directly in a target cgroup adheres to all
migration restrictions.

One of the biggest advantages of this feature is that CLONE_INTO_GROUP does
not need to grab the write side of the cgroup cgroup_threadgroup_rwsem.
This global lock makes moving tasks/threads around super expensive. With
clone3() this lock is avoided.

Cc: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: cgroups@vger.kernel.org
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
This commit is contained in:
Christian Brauner
2020-02-05 14:26:22 +01:00
committed by Tejun Heo
parent f3553220d4
commit ef2c41cf38
7 changed files with 214 additions and 39 deletions

View File

@@ -5881,8 +5881,7 @@ out:
* @child: pointer to task_struct of forking parent process.
*
* A task is associated with the init_css_set until cgroup_post_fork()
* attaches it to the parent's css_set. Empty cg_list indicates that
* @child isn't holding reference to its css_set.
* attaches it to the target css_set.
*/
void cgroup_fork(struct task_struct *child)
{
@@ -5908,24 +5907,154 @@ static struct cgroup *cgroup_get_from_file(struct file *f)
return cgrp;
}
/**
* cgroup_css_set_fork - find or create a css_set for a child process
* @kargs: the arguments passed to create the child process
*
* This functions finds or creates a new css_set which the child
* process will be attached to in cgroup_post_fork(). By default,
* the child process will be given the same css_set as its parent.
*
* If CLONE_INTO_CGROUP is specified this function will try to find an
* existing css_set which includes the requested cgroup and if not create
* a new css_set that the child will be attached to later. If this function
* succeeds it will hold cgroup_threadgroup_rwsem on return. If
* CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
* before grabbing cgroup_threadgroup_rwsem and will hold a reference
* to the target cgroup.
*/
static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
__acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
{
int ret;
struct cgroup *dst_cgrp = NULL;
struct css_set *cset;
struct super_block *sb;
struct file *f;
if (kargs->flags & CLONE_INTO_CGROUP)
mutex_lock(&cgroup_mutex);
cgroup_threadgroup_change_begin(current);
spin_lock_irq(&css_set_lock);
cset = task_css_set(current);
get_css_set(cset);
spin_unlock_irq(&css_set_lock);
if (!(kargs->flags & CLONE_INTO_CGROUP)) {
kargs->cset = cset;
return 0;
}
f = fget_raw(kargs->cgroup);
if (!f) {
ret = -EBADF;
goto err;
}
sb = f->f_path.dentry->d_sb;
dst_cgrp = cgroup_get_from_file(f);
if (IS_ERR(dst_cgrp)) {
ret = PTR_ERR(dst_cgrp);
dst_cgrp = NULL;
goto err;
}
if (cgroup_is_dead(dst_cgrp)) {
ret = -ENODEV;
goto err;
}
/*
* Verify that we the target cgroup is writable for us. This is
* usually done by the vfs layer but since we're not going through
* the vfs layer here we need to do it "manually".
*/
ret = cgroup_may_write(dst_cgrp, sb);
if (ret)
goto err;
ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
!(kargs->flags & CLONE_THREAD));
if (ret)
goto err;
kargs->cset = find_css_set(cset, dst_cgrp);
if (!kargs->cset) {
ret = -ENOMEM;
goto err;
}
put_css_set(cset);
fput(f);
kargs->cgrp = dst_cgrp;
return ret;
err:
cgroup_threadgroup_change_end(current);
mutex_unlock(&cgroup_mutex);
if (f)
fput(f);
if (dst_cgrp)
cgroup_put(dst_cgrp);
put_css_set(cset);
if (kargs->cset)
put_css_set(kargs->cset);
return ret;
}
/**
* cgroup_css_set_put_fork - drop references we took during fork
* @kargs: the arguments passed to create the child process
*
* Drop references to the prepared css_set and target cgroup if
* CLONE_INTO_CGROUP was requested.
*/
static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
cgroup_threadgroup_change_end(current);
if (kargs->flags & CLONE_INTO_CGROUP) {
struct cgroup *cgrp = kargs->cgrp;
struct css_set *cset = kargs->cset;
mutex_unlock(&cgroup_mutex);
if (cset) {
put_css_set(cset);
kargs->cset = NULL;
}
if (cgrp) {
cgroup_put(cgrp);
kargs->cgrp = NULL;
}
}
}
/**
* cgroup_can_fork - called on a new task before the process is exposed
* @child: the child process
*
* This prepares a new css_set for the child process which the child will
* be attached to in cgroup_post_fork().
* This calls the subsystem can_fork() callbacks. If the cgroup_can_fork()
* callback returns an error, the fork aborts with that error code. This
* allows for a cgroup subsystem to conditionally allow or deny new forks.
*/
int cgroup_can_fork(struct task_struct *child)
__acquires(&cgroup_threadgroup_rwsem) __releases(&cgroup_threadgroup_rwsem)
int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
{
struct cgroup_subsys *ss;
int i, j, ret;
cgroup_threadgroup_change_begin(current);
ret = cgroup_css_set_fork(kargs);
if (ret)
return ret;
do_each_subsys_mask(ss, i, have_canfork_callback) {
ret = ss->can_fork(child);
ret = ss->can_fork(child, kargs->cset);
if (ret)
goto out_revert;
} while_each_subsys_mask();
@@ -5937,32 +6066,34 @@ out_revert:
if (j >= i)
break;
if (ss->cancel_fork)
ss->cancel_fork(child);
ss->cancel_fork(child, kargs->cset);
}
cgroup_threadgroup_change_end(current);
cgroup_css_set_put_fork(kargs);
return ret;
}
/**
* cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
* @child: the child process
*
* This calls the cancel_fork() callbacks if a fork failed *after*
* cgroup_can_fork() succeded.
*/
void cgroup_cancel_fork(struct task_struct *child)
__releases(&cgroup_threadgroup_rwsem)
* cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
* @child: the child process
* @kargs: the arguments passed to create the child process
*
* This calls the cancel_fork() callbacks if a fork failed *after*
* cgroup_can_fork() succeded and cleans up references we took to
* prepare a new css_set for the child process in cgroup_can_fork().
*/
void cgroup_cancel_fork(struct task_struct *child,
struct kernel_clone_args *kargs)
{
struct cgroup_subsys *ss;
int i;
for_each_subsys(ss, i)
if (ss->cancel_fork)
ss->cancel_fork(child);
ss->cancel_fork(child, kargs->cset);
cgroup_threadgroup_change_end(current);
cgroup_css_set_put_fork(kargs);
}
/**
@@ -5972,22 +6103,27 @@ void cgroup_cancel_fork(struct task_struct *child)
* Attach the child process to its css_set calling the subsystem fork()
* callbacks.
*/
void cgroup_post_fork(struct task_struct *child)
__releases(&cgroup_threadgroup_rwsem)
void cgroup_post_fork(struct task_struct *child,
struct kernel_clone_args *kargs)
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
struct cgroup_subsys *ss;
struct css_set *cset;
int i;
cset = kargs->cset;
kargs->cset = NULL;
spin_lock_irq(&css_set_lock);
/* init tasks are special, only link regular threads */
if (likely(child->pid)) {
WARN_ON_ONCE(!list_empty(&child->cg_list));
cset = task_css_set(current); /* current is @child's parent */
get_css_set(cset);
cset->nr_tasks++;
css_set_move_task(child, NULL, cset, false);
} else {
put_css_set(cset);
cset = NULL;
}
/*
@@ -6020,7 +6156,16 @@ void cgroup_post_fork(struct task_struct *child)
ss->fork(child);
} while_each_subsys_mask();
cgroup_threadgroup_change_end(current);
/* Make the new cset the root_cset of the new cgroup namespace. */
if (kargs->flags & CLONE_NEWCGROUP) {
struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
get_css_set(cset);
child->nsproxy->cgroup_ns->root_cset = cset;
put_css_set(rcset);
}
cgroup_css_set_put_fork(kargs);
}
/**