Merge branch 'for-4.6-ns' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup namespace support from Tejun Heo:
 "These are changes to implement namespace support for cgroup which has
  been pending for quite some time now.  It is very straight-forward and
  only affects what part of cgroup hierarchies are visible.

  After unsharing, mounting a cgroup fs will be scoped to the cgroups
  the task belonged to at the time of unsharing and the cgroup paths
  exposed to userland would be adjusted accordingly"

* 'for-4.6-ns' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup: fix and restructure error handling in copy_cgroup_ns()
  cgroup: fix alloc_cgroup_ns() error handling in copy_cgroup_ns()
  Add FS_USERNS_FLAG to cgroup fs
  cgroup: Add documentation for cgroup namespaces
  cgroup: mount cgroupns-root when inside non-init cgroupns
  kernfs: define kernfs_node_dentry
  cgroup: cgroup namespace setns support
  cgroup: introduce cgroup namespaces
  sched: new clone flag CLONE_NEWCGROUP for cgroup namespace
  kernfs: Add API to generate relative kernfs path
This commit is contained in:
Linus Torvalds
2016-03-21 10:05:13 -07:00
13 changed files with 689 additions and 48 deletions

View File

@@ -59,6 +59,9 @@
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/proc_ns.h>
#include <net/sock.h>
/*
@@ -215,6 +218,15 @@ static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_free_callback __read_mostly;
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
.count = { .counter = 2, },
.user_ns = &init_user_ns,
.ns.ops = &cgroupns_operations,
.ns.inum = PROC_CGROUP_INIT_INO,
.root_cset = &init_css_set,
};
/* Ditto for the can_fork callback. */
static u16 have_canfork_callback __read_mostly;
@@ -2002,6 +2014,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
{
bool is_v2 = fs_type == &cgroup2_fs_type;
struct super_block *pinned_sb = NULL;
struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
struct cgroup_subsys *ss;
struct cgroup_root *root;
struct cgroup_sb_opts opts;
@@ -2010,6 +2023,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int i;
bool new_sb;
get_cgroup_ns(ns);
/* Check if the caller has permission to mount. */
if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
put_cgroup_ns(ns);
return ERR_PTR(-EPERM);
}
/*
* The first time anyone tries to mount a cgroup, enable the list
* linking each css_set to its tasks and fix up all existing tasks.
@@ -2020,6 +2041,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
if (is_v2) {
if (data) {
pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
put_cgroup_ns(ns);
return ERR_PTR(-EINVAL);
}
cgrp_dfl_visible = true;
@@ -2125,6 +2147,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
goto out_unlock;
}
/*
* We know this subsystem has not yet been bound. Users in a non-init
* user namespace may only mount hierarchies with no bound subsystems,
* i.e. 'none,name=user1'
*/
if (!opts.none && !capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
goto out_unlock;
}
root = kzalloc(sizeof(*root), GFP_KERNEL);
if (!root) {
ret = -ENOMEM;
@@ -2143,12 +2175,37 @@ out_free:
kfree(opts.release_agent);
kfree(opts.name);
if (ret)
if (ret) {
put_cgroup_ns(ns);
return ERR_PTR(ret);
}
out_mount:
dentry = kernfs_mount(fs_type, flags, root->kf_root,
is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
&new_sb);
/*
* In non-init cgroup namespace, instead of root cgroup's
* dentry, we return the dentry corresponding to the
* cgroupns->root_cgrp.
*/
if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
struct dentry *nsdentry;
struct cgroup *cgrp;
mutex_lock(&cgroup_mutex);
spin_lock_bh(&css_set_lock);
cgrp = cset_cgroup_from_root(ns->root_cset, root);
spin_unlock_bh(&css_set_lock);
mutex_unlock(&cgroup_mutex);
nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
dput(dentry);
dentry = nsdentry;
}
if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);
@@ -2161,6 +2218,7 @@ out_mount:
deactivate_super(pinned_sb);
}
put_cgroup_ns(ns);
return dentry;
}
@@ -2189,14 +2247,45 @@ static struct file_system_type cgroup_fs_type = {
.name = "cgroup",
.mount = cgroup_mount,
.kill_sb = cgroup_kill_sb,
.fs_flags = FS_USERNS_MOUNT,
};
static struct file_system_type cgroup2_fs_type = {
.name = "cgroup2",
.mount = cgroup_mount,
.kill_sb = cgroup_kill_sb,
.fs_flags = FS_USERNS_MOUNT,
};
static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns)
{
struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
int ret;
ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
if (ret < 0 || ret >= buflen)
return NULL;
return buf;
}
char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns)
{
char *ret;
mutex_lock(&cgroup_mutex);
spin_lock_bh(&css_set_lock);
ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
spin_unlock_bh(&css_set_lock);
mutex_unlock(&cgroup_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(cgroup_path_ns);
/**
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
* @task: target task
@@ -2224,7 +2313,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
if (root) {
cgrp = task_cgroup_from_root(task, root);
path = cgroup_path(cgrp, buf, buflen);
path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
} else {
/* if no hierarchy exists, everyone is in "/" */
if (strlcpy(buf, "/", buflen) < buflen)
@@ -5450,6 +5539,8 @@ int __init cgroup_init(void)
BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
get_user_ns(init_cgroup_ns.user_ns);
mutex_lock(&cgroup_mutex);
/*
@@ -5601,7 +5692,8 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
* " (deleted)" is appended to the cgroup path.
*/
if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
path = cgroup_path(cgrp, buf, PATH_MAX);
path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
current->nsproxy->cgroup_ns);
if (!path) {
retval = -ENAMETOOLONG;
goto out_unlock;
@@ -5886,7 +5978,9 @@ static void cgroup_release_agent(struct work_struct *work)
if (!pathbuf || !agentbuf)
goto out;
path = cgroup_path(cgrp, pathbuf, PATH_MAX);
spin_lock_bh(&css_set_lock);
path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
spin_unlock_bh(&css_set_lock);
if (!path)
goto out;
@@ -6098,6 +6192,133 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
#endif /* CONFIG_SOCK_CGROUP_DATA */
/* cgroup namespaces */
static struct cgroup_namespace *alloc_cgroup_ns(void)
{
struct cgroup_namespace *new_ns;
int ret;
new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
if (!new_ns)
return ERR_PTR(-ENOMEM);
ret = ns_alloc_inum(&new_ns->ns);
if (ret) {
kfree(new_ns);
return ERR_PTR(ret);
}
atomic_set(&new_ns->count, 1);
new_ns->ns.ops = &cgroupns_operations;
return new_ns;
}
void free_cgroup_ns(struct cgroup_namespace *ns)
{
put_css_set(ns->root_cset);
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
kfree(ns);
}
EXPORT_SYMBOL(free_cgroup_ns);
struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
struct user_namespace *user_ns,
struct cgroup_namespace *old_ns)
{
struct cgroup_namespace *new_ns;
struct css_set *cset;
BUG_ON(!old_ns);
if (!(flags & CLONE_NEWCGROUP)) {
get_cgroup_ns(old_ns);
return old_ns;
}
/* Allow only sysadmin to create cgroup namespace. */
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
mutex_lock(&cgroup_mutex);
spin_lock_bh(&css_set_lock);
cset = task_css_set(current);
get_css_set(cset);
spin_unlock_bh(&css_set_lock);
mutex_unlock(&cgroup_mutex);
new_ns = alloc_cgroup_ns();
if (IS_ERR(new_ns)) {
put_css_set(cset);
return new_ns;
}
new_ns->user_ns = get_user_ns(user_ns);
new_ns->root_cset = cset;
return new_ns;
}
static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
{
return container_of(ns, struct cgroup_namespace, ns);
}
static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
{
struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
!ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
/* Don't need to do anything if we are attaching to our own cgroupns. */
if (cgroup_ns == nsproxy->cgroup_ns)
return 0;
get_cgroup_ns(cgroup_ns);
put_cgroup_ns(nsproxy->cgroup_ns);
nsproxy->cgroup_ns = cgroup_ns;
return 0;
}
static struct ns_common *cgroupns_get(struct task_struct *task)
{
struct cgroup_namespace *ns = NULL;
struct nsproxy *nsproxy;
task_lock(task);
nsproxy = task->nsproxy;
if (nsproxy) {
ns = nsproxy->cgroup_ns;
get_cgroup_ns(ns);
}
task_unlock(task);
return ns ? &ns->ns : NULL;
}
static void cgroupns_put(struct ns_common *ns)
{
put_cgroup_ns(to_cg_ns(ns));
}
const struct proc_ns_operations cgroupns_operations = {
.name = "cgroup",
.type = CLONE_NEWCGROUP,
.get = cgroupns_get,
.put = cgroupns_put,
.install = cgroupns_install,
};
static __init int cgroup_namespaces_init(void)
{
return 0;
}
subsys_initcall(cgroup_namespaces_init);
#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *
debug_css_alloc(struct cgroup_subsys_state *parent_css)

View File

@@ -2714,10 +2714,10 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
goto out;
retval = -ENAMETOOLONG;
rcu_read_lock();
css = task_css(tsk, cpuset_cgrp_id);
p = cgroup_path(css->cgroup, buf, PATH_MAX);
rcu_read_unlock();
css = task_get_css(tsk, cpuset_cgrp_id);
p = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
current->nsproxy->cgroup_ns);
css_put(css);
if (!p)
goto out_free;
seq_puts(m, p);

View File

@@ -1892,7 +1892,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
CLONE_NEWUSER|CLONE_NEWPID))
CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP))
return -EINVAL;
/*
* Not implemented, but pretend it works if there is nothing

View File

@@ -25,6 +25,7 @@
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/syscalls.h>
#include <linux/cgroup.h>
static struct kmem_cache *nsproxy_cachep;
@@ -39,6 +40,9 @@ struct nsproxy init_nsproxy = {
#ifdef CONFIG_NET
.net_ns = &init_net,
#endif
#ifdef CONFIG_CGROUPS
.cgroup_ns = &init_cgroup_ns,
#endif
};
static inline struct nsproxy *create_nsproxy(void)
@@ -92,6 +96,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
goto out_pid;
}
new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns,
tsk->nsproxy->cgroup_ns);
if (IS_ERR(new_nsp->cgroup_ns)) {
err = PTR_ERR(new_nsp->cgroup_ns);
goto out_cgroup;
}
new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
if (IS_ERR(new_nsp->net_ns)) {
err = PTR_ERR(new_nsp->net_ns);
@@ -101,6 +112,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
return new_nsp;
out_net:
put_cgroup_ns(new_nsp->cgroup_ns);
out_cgroup:
if (new_nsp->pid_ns_for_children)
put_pid_ns(new_nsp->pid_ns_for_children);
out_pid:
@@ -128,7 +141,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
struct nsproxy *new_ns;
if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWPID | CLONE_NEWNET)))) {
CLONE_NEWPID | CLONE_NEWNET |
CLONE_NEWCGROUP)))) {
get_nsproxy(old_ns);
return 0;
}
@@ -165,6 +179,7 @@ void free_nsproxy(struct nsproxy *ns)
put_ipc_ns(ns->ipc_ns);
if (ns->pid_ns_for_children)
put_pid_ns(ns->pid_ns_for_children);
put_cgroup_ns(ns->cgroup_ns);
put_net(ns->net_ns);
kmem_cache_free(nsproxy_cachep, ns);
}
@@ -180,7 +195,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
int err = 0;
if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWNET | CLONE_NEWPID)))
CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP)))
return 0;
user_ns = new_cred ? new_cred->user_ns : current_user_ns();