Merge branch 'for-4.6-ns' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup namespace support from Tejun Heo:

 "These are changes to implement namespace support for cgroup, which has been pending for quite some time now. It is very straightforward and only affects which parts of the cgroup hierarchies are visible.

  After unsharing, mounting a cgroup fs will be scoped to the cgroups the task belonged to at the time of unsharing, and the cgroup paths exposed to userland will be adjusted accordingly."

* 'for-4.6-ns' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup: fix and restructure error handling in copy_cgroup_ns()
  cgroup: fix alloc_cgroup_ns() error handling in copy_cgroup_ns()
  Add FS_USERNS_FLAG to cgroup fs
  cgroup: Add documentation for cgroup namespaces
  cgroup: mount cgroupns-root when inside non-init cgroupns
  kernfs: define kernfs_node_dentry
  cgroup: cgroup namespace setns support
  cgroup: introduce cgroup namespaces
  sched: new clone flag CLONE_NEWCGROUP for cgroup namespace
  kernfs: Add API to generate relative kernfs path
This commit is contained in:
229
kernel/cgroup.c
229
kernel/cgroup.c
@@ -59,6 +59,9 @@
|
||||
#include <linux/delay.h>
|
||||
#include <linux/atomic.h>
|
||||
#include <linux/cpuset.h>
|
||||
#include <linux/proc_ns.h>
|
||||
#include <linux/nsproxy.h>
|
||||
#include <linux/proc_ns.h>
|
||||
#include <net/sock.h>
|
||||
|
||||
/*
|
||||
@@ -215,6 +218,15 @@ static u16 have_fork_callback __read_mostly;
|
||||
static u16 have_exit_callback __read_mostly;
|
||||
static u16 have_free_callback __read_mostly;
|
||||
|
||||
/*
 * cgroup namespace for init task
 *
 * Statically initialised namespace that init_nsproxy.cgroup_ns points at.
 * It is rooted at init_css_set and owned by init_user_ns; cgroup_init()
 * takes a reference on that user namespace.
 */
|
||||
struct cgroup_namespace init_cgroup_ns = {
|
||||
/* NOTE(review): starts at 2 rather than 1, presumably so the static object is never freed - confirm */
.count = { .counter = 2, },
|
||||
.user_ns = &init_user_ns,
|
||||
.ns.ops = &cgroupns_operations,
|
||||
/* fixed inode number instead of one obtained from ns_alloc_inum() */
.ns.inum = PROC_CGROUP_INIT_INO,
|
||||
.root_cset = &init_css_set,
|
||||
};
|
||||
|
||||
/* Ditto for the can_fork callback. */
|
||||
static u16 have_canfork_callback __read_mostly;
|
||||
|
||||
@@ -2002,6 +2014,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
|
||||
{
|
||||
bool is_v2 = fs_type == &cgroup2_fs_type;
|
||||
struct super_block *pinned_sb = NULL;
|
||||
struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
|
||||
struct cgroup_subsys *ss;
|
||||
struct cgroup_root *root;
|
||||
struct cgroup_sb_opts opts;
|
||||
@@ -2010,6 +2023,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
|
||||
int i;
|
||||
bool new_sb;
|
||||
|
||||
get_cgroup_ns(ns);
|
||||
|
||||
/* Check if the caller has permission to mount. */
|
||||
if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
|
||||
put_cgroup_ns(ns);
|
||||
return ERR_PTR(-EPERM);
|
||||
}
|
||||
|
||||
/*
|
||||
* The first time anyone tries to mount a cgroup, enable the list
|
||||
* linking each css_set to its tasks and fix up all existing tasks.
|
||||
@@ -2020,6 +2041,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
|
||||
if (is_v2) {
|
||||
if (data) {
|
||||
pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
|
||||
put_cgroup_ns(ns);
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
cgrp_dfl_visible = true;
|
||||
@@ -2125,6 +2147,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
/*
|
||||
* We know this subsystem has not yet been bound. Users in a non-init
|
||||
* user namespace may only mount hierarchies with no bound subsystems,
|
||||
* i.e. 'none,name=user1'
|
||||
*/
|
||||
if (!opts.none && !capable(CAP_SYS_ADMIN)) {
|
||||
ret = -EPERM;
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
root = kzalloc(sizeof(*root), GFP_KERNEL);
|
||||
if (!root) {
|
||||
ret = -ENOMEM;
|
||||
@@ -2143,12 +2175,37 @@ out_free:
|
||||
kfree(opts.release_agent);
|
||||
kfree(opts.name);
|
||||
|
||||
if (ret)
|
||||
if (ret) {
|
||||
put_cgroup_ns(ns);
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
out_mount:
|
||||
dentry = kernfs_mount(fs_type, flags, root->kf_root,
|
||||
is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
|
||||
&new_sb);
|
||||
|
||||
/*
|
||||
* In non-init cgroup namespace, instead of root cgroup's
|
||||
* dentry, we return the dentry corresponding to the
|
||||
* cgroupns->root_cgrp.
|
||||
*/
|
||||
if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
|
||||
struct dentry *nsdentry;
|
||||
struct cgroup *cgrp;
|
||||
|
||||
mutex_lock(&cgroup_mutex);
|
||||
spin_lock_bh(&css_set_lock);
|
||||
|
||||
cgrp = cset_cgroup_from_root(ns->root_cset, root);
|
||||
|
||||
spin_unlock_bh(&css_set_lock);
|
||||
mutex_unlock(&cgroup_mutex);
|
||||
|
||||
nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
|
||||
dput(dentry);
|
||||
dentry = nsdentry;
|
||||
}
|
||||
|
||||
if (IS_ERR(dentry) || !new_sb)
|
||||
cgroup_put(&root->cgrp);
|
||||
|
||||
@@ -2161,6 +2218,7 @@ out_mount:
|
||||
deactivate_super(pinned_sb);
|
||||
}
|
||||
|
||||
put_cgroup_ns(ns);
|
||||
return dentry;
|
||||
}
|
||||
|
||||
@@ -2189,14 +2247,45 @@ static struct file_system_type cgroup_fs_type = {
|
||||
.name = "cgroup",
|
||||
.mount = cgroup_mount,
|
||||
.kill_sb = cgroup_kill_sb,
|
||||
.fs_flags = FS_USERNS_MOUNT,
|
||||
};
|
||||
|
||||
/*
 * File system type registered under the name "cgroup2".  It uses the same
 * mount and kill_sb callbacks as cgroup_fs_type; cgroup_mount() tells the
 * two apart by comparing fs_type against &cgroup2_fs_type.
 */
static struct file_system_type cgroup2_fs_type = {
|
||||
.name = "cgroup2",
|
||||
.mount = cgroup_mount,
|
||||
.kill_sb = cgroup_kill_sb,
|
||||
/* NOTE(review): presumably permits mounting from a non-init user namespace - confirm against the VFS docs */
.fs_flags = FS_USERNS_MOUNT,
|
||||
};
|
||||
|
||||
/*
 * cgroup_path_ns_locked - build the path of @cgrp as seen from @ns
 * @cgrp: cgroup whose path is wanted
 * @buf: output buffer
 * @buflen: size of @buf
 * @ns: cgroup namespace the path is made relative to
 *
 * Looks up the cgroup that @ns->root_cset occupies in @cgrp's hierarchy and
 * asks kernfs for the path from that cgroup's node to @cgrp's node.
 *
 * Callers visible in this file hold css_set_lock around the call.
 * NOTE(review): the exact locking contract implied by the "_locked" suffix
 * is not spelled out here - confirm.
 *
 * Returns @buf on success, or NULL when kernfs_path_from_node() returns a
 * negative value or a length that does not fit in @buflen.
 */
static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
|
||||
struct cgroup_namespace *ns)
|
||||
{
|
||||
/* the namespace's root cgroup within the hierarchy @cgrp belongs to */
struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
|
||||
int ret;
|
||||
|
||||
ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
|
||||
/* error, or the result would not fit in the buffer */
if (ret < 0 || ret >= buflen)
|
||||
return NULL;
|
||||
return buf;
|
||||
}
|
||||
|
||||
/*
 * cgroup_path_ns - locking wrapper around cgroup_path_ns_locked()
 * @cgrp: cgroup whose path is wanted
 * @buf: output buffer
 * @buflen: size of @buf
 * @ns: cgroup namespace the path is made relative to
 *
 * Takes cgroup_mutex and then css_set_lock, calls cgroup_path_ns_locked()
 * and releases both locks in the reverse order.
 *
 * Returns whatever cgroup_path_ns_locked() returned: @buf on success,
 * NULL on failure.  Exported to GPL modules.
 */
char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
|
||||
struct cgroup_namespace *ns)
|
||||
{
|
||||
char *ret;
|
||||
|
||||
mutex_lock(&cgroup_mutex);
|
||||
spin_lock_bh(&css_set_lock);
|
||||
|
||||
ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
|
||||
|
||||
spin_unlock_bh(&css_set_lock);
|
||||
mutex_unlock(&cgroup_mutex);
|
||||
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(cgroup_path_ns);
|
||||
|
||||
/**
|
||||
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
|
||||
* @task: target task
|
||||
@@ -2224,7 +2313,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
|
||||
|
||||
if (root) {
|
||||
cgrp = task_cgroup_from_root(task, root);
|
||||
path = cgroup_path(cgrp, buf, buflen);
|
||||
path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
|
||||
} else {
|
||||
/* if no hierarchy exists, everyone is in "/" */
|
||||
if (strlcpy(buf, "/", buflen) < buflen)
|
||||
@@ -5450,6 +5539,8 @@ int __init cgroup_init(void)
|
||||
BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
|
||||
BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
|
||||
|
||||
get_user_ns(init_cgroup_ns.user_ns);
|
||||
|
||||
mutex_lock(&cgroup_mutex);
|
||||
|
||||
/*
|
||||
@@ -5601,7 +5692,8 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
|
||||
* " (deleted)" is appended to the cgroup path.
|
||||
*/
|
||||
if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
|
||||
path = cgroup_path(cgrp, buf, PATH_MAX);
|
||||
path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
|
||||
current->nsproxy->cgroup_ns);
|
||||
if (!path) {
|
||||
retval = -ENAMETOOLONG;
|
||||
goto out_unlock;
|
||||
@@ -5886,7 +5978,9 @@ static void cgroup_release_agent(struct work_struct *work)
|
||||
if (!pathbuf || !agentbuf)
|
||||
goto out;
|
||||
|
||||
path = cgroup_path(cgrp, pathbuf, PATH_MAX);
|
||||
spin_lock_bh(&css_set_lock);
|
||||
path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
|
||||
spin_unlock_bh(&css_set_lock);
|
||||
if (!path)
|
||||
goto out;
|
||||
|
||||
@@ -6098,6 +6192,133 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
|
||||
|
||||
#endif /* CONFIG_SOCK_CGROUP_DATA */
|
||||
|
||||
/* cgroup namespaces */
|
||||
|
||||
/*
 * alloc_cgroup_ns - allocate and minimally initialise a cgroup namespace
 *
 * Allocates a zeroed struct cgroup_namespace, assigns it an inode number
 * with ns_alloc_inum(), sets the reference count to 1 and installs
 * cgroupns_operations.  user_ns and root_cset are left zeroed for the
 * caller to fill in (copy_cgroup_ns() does so).
 *
 * Returns the new namespace, ERR_PTR(-ENOMEM) if the allocation fails, or
 * ERR_PTR() of the ns_alloc_inum() error.
 */
static struct cgroup_namespace *alloc_cgroup_ns(void)
|
||||
{
|
||||
struct cgroup_namespace *new_ns;
|
||||
int ret;
|
||||
|
||||
new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
|
||||
if (!new_ns)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
ret = ns_alloc_inum(&new_ns->ns);
|
||||
if (ret) {
|
||||
/* nothing else has been set up yet, so only the allocation is undone */
kfree(new_ns);
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
atomic_set(&new_ns->count, 1);
|
||||
new_ns->ns.ops = &cgroupns_operations;
|
||||
return new_ns;
|
||||
}
|
||||
|
||||
/*
 * free_cgroup_ns - release everything owned by a cgroup namespace
 * @ns: namespace to destroy
 *
 * Drops the references on @ns->root_cset and @ns->user_ns, releases the
 * inode number and frees the structure itself.
 * NOTE(review): presumably invoked by put_cgroup_ns() once the reference
 * count reaches zero - put_cgroup_ns() is not visible here, confirm.
 */
void free_cgroup_ns(struct cgroup_namespace *ns)
|
||||
{
|
||||
put_css_set(ns->root_cset);
|
||||
put_user_ns(ns->user_ns);
|
||||
ns_free_inum(&ns->ns);
|
||||
kfree(ns);
|
||||
}
|
||||
EXPORT_SYMBOL(free_cgroup_ns);
|
||||
|
||||
/*
 * copy_cgroup_ns - share or create a cgroup namespace
 * @flags: clone/unshare flags; only CLONE_NEWCGROUP is examined
 * @user_ns: user namespace that will own a newly created namespace
 * @old_ns: namespace to share when no new one is requested; must not be NULL
 *
 * Without CLONE_NEWCGROUP, takes a reference on @old_ns and returns it.
 * With CLONE_NEWCGROUP, creates a namespace whose root_cset is the css_set
 * of the current task at the time of the call.
 *
 * Returns the namespace to use, ERR_PTR(-EPERM) if the caller lacks
 * CAP_SYS_ADMIN in @user_ns, or the ERR_PTR() from alloc_cgroup_ns().
 */
struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
|
||||
struct user_namespace *user_ns,
|
||||
struct cgroup_namespace *old_ns)
|
||||
{
|
||||
struct cgroup_namespace *new_ns;
|
||||
struct css_set *cset;
|
||||
|
||||
BUG_ON(!old_ns);
|
||||
|
||||
if (!(flags & CLONE_NEWCGROUP)) {
|
||||
get_cgroup_ns(old_ns);
|
||||
return old_ns;
|
||||
}
|
||||
|
||||
/* Allow only sysadmin to create cgroup namespace. */
|
||||
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
|
||||
return ERR_PTR(-EPERM);
|
||||
|
||||
/* pin the current task's css_set while both locks are held */
mutex_lock(&cgroup_mutex);
|
||||
spin_lock_bh(&css_set_lock);
|
||||
|
||||
cset = task_css_set(current);
|
||||
get_css_set(cset);
|
||||
|
||||
spin_unlock_bh(&css_set_lock);
|
||||
mutex_unlock(&cgroup_mutex);
|
||||
|
||||
new_ns = alloc_cgroup_ns();
|
||||
if (IS_ERR(new_ns)) {
|
||||
/* give back the css_set reference taken above */
put_css_set(cset);
|
||||
return new_ns;
|
||||
}
|
||||
|
||||
/* the new namespace now owns the user_ns and css_set references */
new_ns->user_ns = get_user_ns(user_ns);
|
||||
new_ns->root_cset = cset;
|
||||
|
||||
return new_ns;
|
||||
}
|
||||
|
||||
/* Convert a pointer to the embedded ns_common into its containing cgroup_namespace. */
static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
|
||||
{
|
||||
return container_of(ns, struct cgroup_namespace, ns);
|
||||
}
|
||||
|
||||
/*
 * cgroupns_install - ->install callback of cgroupns_operations
 * @nsproxy: nsproxy whose cgroup namespace is to be replaced
 * @ns: ns_common embedded in the target cgroup namespace
 *
 * Requires CAP_SYS_ADMIN both in the caller's current user namespace and in
 * the user namespace that owns the target cgroup namespace.  On success
 * @nsproxy holds a reference on the target namespace and the reference on
 * the previous one has been dropped.
 *
 * Returns 0 on success (including when the target is already installed),
 * -EPERM if either capability check fails.
 */
static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
|
||||
{
|
||||
struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
|
||||
|
||||
if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
|
||||
!ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
|
||||
return -EPERM;
|
||||
|
||||
/* Don't need to do anything if we are attaching to our own cgroupns. */
|
||||
if (cgroup_ns == nsproxy->cgroup_ns)
|
||||
return 0;
|
||||
|
||||
/* take the new reference before dropping the old one */
get_cgroup_ns(cgroup_ns);
|
||||
put_cgroup_ns(nsproxy->cgroup_ns);
|
||||
nsproxy->cgroup_ns = cgroup_ns;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * cgroupns_get - ->get callback of cgroupns_operations
 * @task: task whose cgroup namespace is wanted
 *
 * Reads @task->nsproxy under task_lock() and, if it is non-NULL, takes a
 * reference on its cgroup namespace.
 *
 * Returns the ns_common embedded in that namespace, or NULL when @task has
 * no nsproxy.  A non-NULL result carries a reference; cgroupns_put() drops
 * one.
 */
static struct ns_common *cgroupns_get(struct task_struct *task)
|
||||
{
|
||||
struct cgroup_namespace *ns = NULL;
|
||||
struct nsproxy *nsproxy;
|
||||
|
||||
task_lock(task);
|
||||
nsproxy = task->nsproxy;
|
||||
if (nsproxy) {
|
||||
ns = nsproxy->cgroup_ns;
|
||||
get_cgroup_ns(ns);
|
||||
}
|
||||
task_unlock(task);
|
||||
|
||||
return ns ? &ns->ns : NULL;
|
||||
}
|
||||
|
||||
/* ->put callback of cgroupns_operations: drop one reference on the cgroup namespace containing @ns. */
static void cgroupns_put(struct ns_common *ns)
|
||||
{
|
||||
put_cgroup_ns(to_cg_ns(ns));
|
||||
}
|
||||
|
||||
/*
 * Namespace operations table for cgroup namespaces.  Both init_cgroup_ns
 * and every namespace created by alloc_cgroup_ns() point their ns.ops here.
 */
const struct proc_ns_operations cgroupns_operations = {
|
||||
.name = "cgroup",
|
||||
.type = CLONE_NEWCGROUP,
|
||||
.get = cgroupns_get,
|
||||
.put = cgroupns_put,
|
||||
.install = cgroupns_install,
|
||||
};
|
||||
|
||||
/*
 * Init hook registered with subsys_initcall().  It currently does no work
 * and always returns 0.
 */
static __init int cgroup_namespaces_init(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
subsys_initcall(cgroup_namespaces_init);
|
||||
|
||||
#ifdef CONFIG_CGROUP_DEBUG
|
||||
static struct cgroup_subsys_state *
|
||||
debug_css_alloc(struct cgroup_subsys_state *parent_css)
|
||||
|
@@ -2714,10 +2714,10 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
|
||||
goto out;
|
||||
|
||||
retval = -ENAMETOOLONG;
|
||||
rcu_read_lock();
|
||||
css = task_css(tsk, cpuset_cgrp_id);
|
||||
p = cgroup_path(css->cgroup, buf, PATH_MAX);
|
||||
rcu_read_unlock();
|
||||
css = task_get_css(tsk, cpuset_cgrp_id);
|
||||
p = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
|
||||
current->nsproxy->cgroup_ns);
|
||||
css_put(css);
|
||||
if (!p)
|
||||
goto out_free;
|
||||
seq_puts(m, p);
|
||||
|
@@ -1892,7 +1892,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
|
||||
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
|
||||
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
|
||||
CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
|
||||
CLONE_NEWUSER|CLONE_NEWPID))
|
||||
CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP))
|
||||
return -EINVAL;
|
||||
/*
|
||||
* Not implemented, but pretend it works if there is nothing
|
||||
|
@@ -25,6 +25,7 @@
|
||||
#include <linux/proc_ns.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/cgroup.h>
|
||||
|
||||
static struct kmem_cache *nsproxy_cachep;
|
||||
|
||||
@@ -39,6 +40,9 @@ struct nsproxy init_nsproxy = {
|
||||
#ifdef CONFIG_NET
|
||||
.net_ns = &init_net,
|
||||
#endif
|
||||
#ifdef CONFIG_CGROUPS
|
||||
.cgroup_ns = &init_cgroup_ns,
|
||||
#endif
|
||||
};
|
||||
|
||||
static inline struct nsproxy *create_nsproxy(void)
|
||||
@@ -92,6 +96,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
|
||||
goto out_pid;
|
||||
}
|
||||
|
||||
new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns,
|
||||
tsk->nsproxy->cgroup_ns);
|
||||
if (IS_ERR(new_nsp->cgroup_ns)) {
|
||||
err = PTR_ERR(new_nsp->cgroup_ns);
|
||||
goto out_cgroup;
|
||||
}
|
||||
|
||||
new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
|
||||
if (IS_ERR(new_nsp->net_ns)) {
|
||||
err = PTR_ERR(new_nsp->net_ns);
|
||||
@@ -101,6 +112,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
|
||||
return new_nsp;
|
||||
|
||||
out_net:
|
||||
put_cgroup_ns(new_nsp->cgroup_ns);
|
||||
out_cgroup:
|
||||
if (new_nsp->pid_ns_for_children)
|
||||
put_pid_ns(new_nsp->pid_ns_for_children);
|
||||
out_pid:
|
||||
@@ -128,7 +141,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
|
||||
struct nsproxy *new_ns;
|
||||
|
||||
if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
|
||||
CLONE_NEWPID | CLONE_NEWNET)))) {
|
||||
CLONE_NEWPID | CLONE_NEWNET |
|
||||
CLONE_NEWCGROUP)))) {
|
||||
get_nsproxy(old_ns);
|
||||
return 0;
|
||||
}
|
||||
@@ -165,6 +179,7 @@ void free_nsproxy(struct nsproxy *ns)
|
||||
put_ipc_ns(ns->ipc_ns);
|
||||
if (ns->pid_ns_for_children)
|
||||
put_pid_ns(ns->pid_ns_for_children);
|
||||
put_cgroup_ns(ns->cgroup_ns);
|
||||
put_net(ns->net_ns);
|
||||
kmem_cache_free(nsproxy_cachep, ns);
|
||||
}
|
||||
@@ -180,7 +195,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
|
||||
int err = 0;
|
||||
|
||||
if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
|
||||
CLONE_NEWNET | CLONE_NEWPID)))
|
||||
CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP)))
|
||||
return 0;
|
||||
|
||||
user_ns = new_cred ? new_cred->user_ns : current_user_ns();
|
||||
|
Reference in New Issue
Block a user