Merge branch 'work.mount' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

Pull vfs mount infrastructure updates from Al Viro:
 "The rest of core infrastructure; no new syscalls in that pile, but the
  old parts are switched to new infrastructure. At that point
  conversions of individual filesystems can happen independently; some
  are done here (afs, cgroup, procfs, etc.), there's also a large series
  outside of that pile dealing with NFS (quite a bit of option-parsing
  stuff is getting used there - it's one of the most convoluted
  filesystems in terms of mount-related logic), but NFS bits are the
  next cycle fodder.

  It got seriously simplified since the last cycle; documentation is
  probably the weakest bit at the moment - I considered dropping the
  commit introducing Documentation/filesystems/mount_api.txt (cutting
  the size increase by quarter ;-), but decided that it would be better
  to fix it up after -rc1 instead.

  That pile allows followup work to be done in independent branches, which
  should make life much easier for the next cycle. fs/super.c size
  increase is unpleasant; there's a followup series that allows it to
  shrink considerably, but I decided to leave that until the next
  cycle"

* 'work.mount' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (41 commits)
  afs: Use fs_context to pass parameters over automount
  afs: Add fs_context support
  vfs: Add some logging to the core users of the fs_context log
  vfs: Implement logging through fs_context
  vfs: Provide documentation for new mount API
  vfs: Remove kern_mount_data()
  hugetlbfs: Convert to fs_context
  cpuset: Use fs_context
  kernfs, sysfs, cgroup, intel_rdt: Support fs_context
  cgroup: store a reference to cgroup_ns into cgroup_fs_context
  cgroup1_get_tree(): separate "get cgroup_root to use" into a separate helper
  cgroup_do_mount(): massage calling conventions
  cgroup: stash cgroup_root reference into cgroup_fs_context
  cgroup2: switch to option-by-option parsing
  cgroup1: switch to option-by-option parsing
  cgroup: take options parsing into ->parse_monolithic()
  cgroup: fold cgroup1_mount() into cgroup1_get_tree()
  cgroup: start switching to fs_context
  ipc: Convert mqueue fs to fs_context
  proc: Add fs_context support to procfs
  ...
This commit is contained in:
Linus Torvalds
2019-03-12 14:08:19 -07:00
45 changed files with 4377 additions and 1352 deletions

View File

@@ -7,6 +7,7 @@
#include <linux/workqueue.h>
#include <linux/list.h>
#include <linux/refcount.h>
#include <linux/fs_context.h>
#define TRACE_CGROUP_PATH_LEN 1024
extern spinlock_t trace_cgroup_path_lock;
@@ -36,6 +37,31 @@ extern void __init enable_debug_cgroup(void);
} \
} while (0)
/*
* The cgroup filesystem superblock creation/mount context.
*
* cgroup_fc2context() recovers this structure from a struct fs_context by
* applying container_of() to the kernfs_fs_context pointer stored in
* fc->fs_private, so fc->fs_private must point at the @kfc member.
*/
struct cgroup_fs_context {
/* Embedded kernfs context; fc->fs_private points here */
struct kernfs_fs_context kfc;
/* Hierarchy root chosen for this mount */
struct cgroup_root *root;
/* cgroup namespace the mount is being performed in */
struct cgroup_namespace *ns;
unsigned int flags; /* CGRP_ROOT_* flags */
/* cgroup1 bits */
bool cpuset_clone_children;
bool none; /* User explicitly requested empty subsystem */
bool all_ss; /* Seen 'all' option */
u16 subsys_mask; /* Selected subsystems */
char *name; /* Hierarchy name */
char *release_agent; /* Path for release notifications */
};
/*
 * Translate a generic fs_context into the cgroup context that embeds the
 * kernfs_fs_context referenced by fc->fs_private.
 */
static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc)
{
	struct kernfs_fs_context *kernfs_ctx = fc->fs_private;

	return container_of(kernfs_ctx, struct cgroup_fs_context, kfc);
}
/*
* A cgroup can be associated with multiple css_sets as different tasks may
* belong to different cgroups on different hierarchies. In the other
@@ -117,16 +143,6 @@ struct cgroup_mgctx {
#define DEFINE_CGROUP_MGCTX(name) \
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
struct cgroup_sb_opts {
u16 subsys_mask;
unsigned int flags;
char *release_agent;
bool cpuset_clone_children;
char *name;
/* User explicitly requested empty subsystem */
bool none;
};
extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
@@ -197,12 +213,10 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns);
void cgroup_free_root(struct cgroup_root *root);
void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
void init_cgroup_root(struct cgroup_fs_context *ctx);
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
struct cgroup_root *root, unsigned long magic,
struct cgroup_namespace *ns);
int cgroup_do_get_tree(struct fs_context *fc);
int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
@@ -246,14 +260,15 @@ extern const struct proc_ns_operations cgroupns_operations;
*/
extern struct cftype cgroup1_base_files[];
extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
extern const struct fs_parameter_description cgroup1_fs_parameters;
int proc_cgroupstats_show(struct seq_file *m, void *v);
bool cgroup1_ssid_disabled(int ssid);
void cgroup1_pidlist_destroy_all(struct cgroup *cgrp);
void cgroup1_release_agent(struct work_struct *work);
void cgroup1_check_for_release(struct cgroup *cgrp);
struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
void *data, unsigned long magic,
struct cgroup_namespace *ns);
int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param);
int cgroup1_get_tree(struct fs_context *fc);
int cgroup1_reconfigure(struct fs_context *ctx);
#endif /* __CGROUP_INTERNAL_H */

View File

@@ -13,9 +13,12 @@
#include <linux/delayacct.h>
#include <linux/pid_namespace.h>
#include <linux/cgroupstats.h>
#include <linux/fs_parser.h>
#include <trace/events/cgroup.h>
#define cg_invalf(fc, fmt, ...) invalf(fc, fmt, ## __VA_ARGS__)
/*
* pidlists linger the following amount before being destroyed. The goal
* is avoiding frequent destruction in the middle of consecutive read calls
@@ -906,172 +909,195 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo
return 0;
}
static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
/*
 * Option identifiers for cgroup1 mount parameters. Each value is paired
 * with an option name in cgroup1_param_specs[] and is what
 * cgroup1_parse_param() switches on after calling fs_parse().
 */
enum cgroup1_param {
Opt_all,
Opt_clone_children,
Opt_cpuset_v2_mode,
Opt_name,
Opt_none,
Opt_noprefix,
Opt_release_agent,
Opt_xattr,
};
/*
 * Named cgroup1 mount options: "name" and "release_agent" carry a string
 * value, the rest are value-less flags. Subsystem names are not listed
 * here; cgroup1_parse_param() compares unrecognised keys against each
 * subsystem's legacy_name instead.
 */
static const struct fs_parameter_spec cgroup1_param_specs[] = {
fsparam_flag ("all", Opt_all),
fsparam_flag ("clone_children", Opt_clone_children),
fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode),
fsparam_string("name", Opt_name),
fsparam_flag ("none", Opt_none),
fsparam_flag ("noprefix", Opt_noprefix),
fsparam_string("release_agent", Opt_release_agent),
fsparam_flag ("xattr", Opt_xattr),
{}
};
/*
 * Parameter description for the "cgroup" (v1) filesystem type. Passed to
 * fs_parse() by cgroup1_parse_param() and referenced from cgroup_fs_type.
 */
const struct fs_parameter_description cgroup1_fs_parameters = {
.name = "cgroup1",
.specs = cgroup1_param_specs,
};
int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
char *token, *o = data;
bool all_ss = false, one_ss = false;
u16 mask = U16_MAX;
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
struct cgroup_subsys *ss;
struct fs_parse_result result;
int opt, i;
opt = fs_parse(fc, &cgroup1_fs_parameters, param, &result);
if (opt == -ENOPARAM) {
if (strcmp(param->key, "source") == 0) {
fc->source = param->string;
param->string = NULL;
return 0;
}
for_each_subsys(ss, i) {
if (strcmp(param->key, ss->legacy_name))
continue;
ctx->subsys_mask |= (1 << i);
return 0;
}
return cg_invalf(fc, "cgroup1: Unknown subsys name '%s'", param->key);
}
if (opt < 0)
return opt;
switch (opt) {
case Opt_none:
/* Explicitly have no subsystems */
ctx->none = true;
break;
case Opt_all:
ctx->all_ss = true;
break;
case Opt_noprefix:
ctx->flags |= CGRP_ROOT_NOPREFIX;
break;
case Opt_clone_children:
ctx->cpuset_clone_children = true;
break;
case Opt_cpuset_v2_mode:
ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;
break;
case Opt_xattr:
ctx->flags |= CGRP_ROOT_XATTR;
break;
case Opt_release_agent:
/* Specifying two release agents is forbidden */
if (ctx->release_agent)
return cg_invalf(fc, "cgroup1: release_agent respecified");
ctx->release_agent = param->string;
param->string = NULL;
break;
case Opt_name:
/* blocked by boot param? */
if (cgroup_no_v1_named)
return -ENOENT;
/* Can't specify an empty name */
if (!param->size)
return cg_invalf(fc, "cgroup1: Empty name");
if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1)
return cg_invalf(fc, "cgroup1: Name too long");
/* Must match [\w.-]+ */
for (i = 0; i < param->size; i++) {
char c = param->string[i];
if (isalnum(c))
continue;
if ((c == '.') || (c == '-') || (c == '_'))
continue;
return cg_invalf(fc, "cgroup1: Invalid name");
}
/* Specifying two names is forbidden */
if (ctx->name)
return cg_invalf(fc, "cgroup1: name respecified");
ctx->name = param->string;
param->string = NULL;
break;
}
return 0;
}
static int check_cgroupfs_options(struct fs_context *fc)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
u16 mask = U16_MAX;
u16 enabled = 0;
struct cgroup_subsys *ss;
int nr_opts = 0;
int i;
#ifdef CONFIG_CPUSETS
mask = ~((u16)1 << cpuset_cgrp_id);
#endif
for_each_subsys(ss, i)
if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
enabled |= 1 << i;
memset(opts, 0, sizeof(*opts));
while ((token = strsep(&o, ",")) != NULL) {
nr_opts++;
if (!*token)
return -EINVAL;
if (!strcmp(token, "none")) {
/* Explicitly have no subsystems */
opts->none = true;
continue;
}
if (!strcmp(token, "all")) {
/* Mutually exclusive option 'all' + subsystem name */
if (one_ss)
return -EINVAL;
all_ss = true;
continue;
}
if (!strcmp(token, "noprefix")) {
opts->flags |= CGRP_ROOT_NOPREFIX;
continue;
}
if (!strcmp(token, "clone_children")) {
opts->cpuset_clone_children = true;
continue;
}
if (!strcmp(token, "cpuset_v2_mode")) {
opts->flags |= CGRP_ROOT_CPUSET_V2_MODE;
continue;
}
if (!strcmp(token, "xattr")) {
opts->flags |= CGRP_ROOT_XATTR;
continue;
}
if (!strncmp(token, "release_agent=", 14)) {
/* Specifying two release agents is forbidden */
if (opts->release_agent)
return -EINVAL;
opts->release_agent =
kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
if (!opts->release_agent)
return -ENOMEM;
continue;
}
if (!strncmp(token, "name=", 5)) {
const char *name = token + 5;
/* blocked by boot param? */
if (cgroup_no_v1_named)
return -ENOENT;
/* Can't specify an empty name */
if (!strlen(name))
return -EINVAL;
/* Must match [\w.-]+ */
for (i = 0; i < strlen(name); i++) {
char c = name[i];
if (isalnum(c))
continue;
if ((c == '.') || (c == '-') || (c == '_'))
continue;
return -EINVAL;
}
/* Specifying two names is forbidden */
if (opts->name)
return -EINVAL;
opts->name = kstrndup(name,
MAX_CGROUP_ROOT_NAMELEN - 1,
GFP_KERNEL);
if (!opts->name)
return -ENOMEM;
continue;
}
for_each_subsys(ss, i) {
if (strcmp(token, ss->legacy_name))
continue;
if (!cgroup_ssid_enabled(i))
continue;
if (cgroup1_ssid_disabled(i))
continue;
/* Mutually exclusive option 'all' + subsystem name */
if (all_ss)
return -EINVAL;
opts->subsys_mask |= (1 << i);
one_ss = true;
break;
}
if (i == CGROUP_SUBSYS_COUNT)
return -ENOENT;
}
ctx->subsys_mask &= enabled;
/*
* If the 'all' option was specified select all the subsystems,
* otherwise if 'none', 'name=' and a subsystem name options were
* not specified, let's default to 'all'
* In absence of 'none', 'name=' or subsystem name options,
* let's default to 'all'.
*/
if (all_ss || (!one_ss && !opts->none && !opts->name))
for_each_subsys(ss, i)
if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
opts->subsys_mask |= (1 << i);
if (!ctx->subsys_mask && !ctx->none && !ctx->name)
ctx->all_ss = true;
if (ctx->all_ss) {
/* Mutually exclusive option 'all' + subsystem name */
if (ctx->subsys_mask)
return cg_invalf(fc, "cgroup1: subsys name conflicts with all");
/* 'all' => select all the subsystems */
ctx->subsys_mask = enabled;
}
/*
* We either have to specify by name or by subsystems. (So all
* empty hierarchies must have a name).
*/
if (!opts->subsys_mask && !opts->name)
return -EINVAL;
if (!ctx->subsys_mask && !ctx->name)
return cg_invalf(fc, "cgroup1: Need name or subsystem set");
/*
* Option noprefix was introduced just for backward compatibility
* with the old cpuset, so we allow noprefix only if mounting just
* the cpuset subsystem.
*/
if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
return -EINVAL;
if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask))
return cg_invalf(fc, "cgroup1: noprefix used incorrectly");
/* Can't specify "none" and some subsystems */
if (opts->subsys_mask && opts->none)
return -EINVAL;
if (ctx->subsys_mask && ctx->none)
return cg_invalf(fc, "cgroup1: none used incorrectly");
return 0;
}
static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
int cgroup1_reconfigure(struct fs_context *fc)
{
int ret = 0;
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb);
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_sb_opts opts;
int ret = 0;
u16 added_mask, removed_mask;
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
/* See what subsystems are wanted */
ret = parse_cgroupfs_options(data, &opts);
ret = check_cgroupfs_options(fc);
if (ret)
goto out_unlock;
if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent)
pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
task_tgid_nr(current), current->comm);
added_mask = opts.subsys_mask & ~root->subsys_mask;
removed_mask = root->subsys_mask & ~opts.subsys_mask;
added_mask = ctx->subsys_mask & ~root->subsys_mask;
removed_mask = root->subsys_mask & ~ctx->subsys_mask;
/* Don't allow flags or name to change at remount */
if ((opts.flags ^ root->flags) ||
(opts.name && strcmp(opts.name, root->name))) {
pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
opts.flags, opts.name ?: "", root->flags, root->name);
if ((ctx->flags ^ root->flags) ||
(ctx->name && strcmp(ctx->name, root->name))) {
cg_invalf(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"",
ctx->flags, ctx->name ?: "", root->flags, root->name);
ret = -EINVAL;
goto out_unlock;
}
@@ -1088,17 +1114,15 @@ static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
if (opts.release_agent) {
if (ctx->release_agent) {
spin_lock(&release_agent_path_lock);
strcpy(root->release_agent_path, opts.release_agent);
strcpy(root->release_agent_path, ctx->release_agent);
spin_unlock(&release_agent_path_lock);
}
trace_cgroup_remount(root);
out_unlock:
kfree(opts.release_agent);
kfree(opts.name);
mutex_unlock(&cgroup_mutex);
return ret;
}
@@ -1106,28 +1130,30 @@ static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
.rename = cgroup1_rename,
.show_options = cgroup1_show_options,
.remount_fs = cgroup1_remount,
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
.show_path = cgroup_show_path,
};
struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
void *data, unsigned long magic,
struct cgroup_namespace *ns)
/*
* The guts of cgroup1 mount - find or create cgroup_root to use.
* Called with cgroup_mutex held; returns 0 on success, -E... on
* error and positive - in case when the candidate is busy dying.
* On success it stashes a reference to cgroup_root into given
* cgroup_fs_context; that reference is *NOT* counting towards the
* cgroup_root refcount.
*/
static int cgroup1_root_to_use(struct fs_context *fc)
{
struct cgroup_sb_opts opts;
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
struct cgroup_root *root;
struct cgroup_subsys *ss;
struct dentry *dentry;
int i, ret;
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
/* First find the desired set of subsystems */
ret = parse_cgroupfs_options(data, &opts);
ret = check_cgroupfs_options(fc);
if (ret)
goto out_unlock;
return ret;
/*
* Destruction of cgroup root is asynchronous, so subsystems may
@@ -1137,16 +1163,12 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
* starting. Testing ref liveliness is good enough.
*/
for_each_subsys(ss, i) {
if (!(opts.subsys_mask & (1 << i)) ||
if (!(ctx->subsys_mask & (1 << i)) ||
ss->root == &cgrp_dfl_root)
continue;
if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
mutex_unlock(&cgroup_mutex);
msleep(10);
ret = restart_syscall();
goto out_free;
}
if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt))
return 1; /* restart */
cgroup_put(&ss->root->cgrp);
}
@@ -1161,8 +1183,8 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
* name matches but subsys_mask doesn't, we should fail.
* Remember whether name matched.
*/
if (opts.name) {
if (strcmp(opts.name, root->name))
if (ctx->name) {
if (strcmp(ctx->name, root->name))
continue;
name_match = true;
}
@@ -1171,19 +1193,18 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
* If we asked for subsystems (or explicitly for no
* subsystems) then they must match.
*/
if ((opts.subsys_mask || opts.none) &&
(opts.subsys_mask != root->subsys_mask)) {
if ((ctx->subsys_mask || ctx->none) &&
(ctx->subsys_mask != root->subsys_mask)) {
if (!name_match)
continue;
ret = -EBUSY;
goto out_unlock;
return -EBUSY;
}
if (root->flags ^ opts.flags)
if (root->flags ^ ctx->flags)
pr_warn("new mount options do not match the existing superblock, will be ignored\n");
ret = 0;
goto out_unlock;
ctx->root = root;
return 0;
}
/*
@@ -1191,55 +1212,58 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
* specification is allowed for already existing hierarchies but we
* can't create new one without subsys specification.
*/
if (!opts.subsys_mask && !opts.none) {
ret = -EINVAL;
goto out_unlock;
}
if (!ctx->subsys_mask && !ctx->none)
return cg_invalf(fc, "cgroup1: No subsys list or none specified");
/* Hierarchies may only be created in the initial cgroup namespace. */
if (ns != &init_cgroup_ns) {
ret = -EPERM;
goto out_unlock;
}
if (ctx->ns != &init_cgroup_ns)
return -EPERM;
root = kzalloc(sizeof(*root), GFP_KERNEL);
if (!root) {
ret = -ENOMEM;
goto out_unlock;
}
if (!root)
return -ENOMEM;
init_cgroup_root(root, &opts);
ctx->root = root;
init_cgroup_root(ctx);
ret = cgroup_setup_root(root, opts.subsys_mask);
ret = cgroup_setup_root(root, ctx->subsys_mask);
if (ret)
cgroup_free_root(root);
return ret;
}
int cgroup1_get_tree(struct fs_context *fc)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
int ret;
/* Check if the caller has permission to mount. */
if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
ret = cgroup1_root_to_use(fc);
if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt))
ret = 1; /* restart */
out_unlock:
if (!ret && !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
mutex_unlock(&cgroup_mutex);
msleep(10);
ret = restart_syscall();
goto out_free;
}
mutex_unlock(&cgroup_mutex);
out_free:
kfree(opts.release_agent);
kfree(opts.name);
if (ret)
return ERR_PTR(ret);
if (!ret)
ret = cgroup_do_get_tree(fc);
dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
CGROUP_SUPER_MAGIC, ns);
if (!IS_ERR(dentry) && percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
struct super_block *sb = dentry->d_sb;
dput(dentry);
if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
struct super_block *sb = fc->root->d_sb;
dput(fc->root);
deactivate_locked_super(sb);
msleep(10);
dentry = ERR_PTR(restart_syscall());
ret = 1;
}
return dentry;
if (unlikely(ret > 0)) {
msleep(10);
return restart_syscall();
}
return ret;
}
static int __init cgroup1_wq_init(void)

View File

@@ -54,6 +54,7 @@
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/file.h>
#include <linux/fs_parser.h>
#include <linux/sched/cputime.h>
#include <linux/psi.h>
#include <net/sock.h>
@@ -1772,26 +1773,37 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
return len;
}
static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
/*
 * Option identifiers for cgroup2 mount parameters, as returned by
 * fs_parse() in cgroup2_parse_param().
 */
enum cgroup2_param {
Opt_nsdelegate,
/* Trailing count entry; not referenced by the code visible here */
nr__cgroup2_params
};
/* Named cgroup2 mount options; "nsdelegate" is a value-less flag. */
static const struct fs_parameter_spec cgroup2_param_specs[] = {
fsparam_flag ("nsdelegate", Opt_nsdelegate),
{}
};
/*
 * Parameter description for the "cgroup2" filesystem type. Passed to
 * fs_parse() by cgroup2_parse_param() and referenced from cgroup2_fs_type.
 */
static const struct fs_parameter_description cgroup2_fs_parameters = {
.name = "cgroup2",
.specs = cgroup2_param_specs,
};
static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
char *token;
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
struct fs_parse_result result;
int opt;
*root_flags = 0;
opt = fs_parse(fc, &cgroup2_fs_parameters, param, &result);
if (opt < 0)
return opt;
if (!data || *data == '\0')
switch (opt) {
case Opt_nsdelegate:
ctx->flags |= CGRP_ROOT_NS_DELEGATE;
return 0;
while ((token = strsep(&data, ",")) != NULL) {
if (!strcmp(token, "nsdelegate")) {
*root_flags |= CGRP_ROOT_NS_DELEGATE;
continue;
}
pr_err("cgroup2: unknown option \"%s\"\n", token);
return -EINVAL;
}
return 0;
return -EINVAL;
}
static void apply_cgroup_root_flags(unsigned int root_flags)
@@ -1811,16 +1823,11 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
return 0;
}
static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
static int cgroup_reconfigure(struct fs_context *fc)
{
unsigned int root_flags;
int ret;
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
ret = parse_cgroup_root_flags(data, &root_flags);
if (ret)
return ret;
apply_cgroup_root_flags(root_flags);
apply_cgroup_root_flags(ctx->flags);
return 0;
}
@@ -1908,8 +1915,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
void init_cgroup_root(struct cgroup_fs_context *ctx)
{
struct cgroup_root *root = ctx->root;
struct cgroup *cgrp = &root->cgrp;
INIT_LIST_HEAD(&root->root_list);
@@ -1918,12 +1926,12 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
init_cgroup_housekeeping(cgrp);
idr_init(&root->cgroup_idr);
root->flags = opts->flags;
if (opts->release_agent)
strscpy(root->release_agent_path, opts->release_agent, PATH_MAX);
if (opts->name)
strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
if (opts->cpuset_clone_children)
root->flags = ctx->flags;
if (ctx->release_agent)
strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
if (ctx->name)
strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
if (ctx->cpuset_clone_children)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
@@ -2028,60 +2036,104 @@ out:
return ret;
}
struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
struct cgroup_root *root, unsigned long magic,
struct cgroup_namespace *ns)
/*
 * Common tail of the cgroup1/cgroup2 get_tree paths.
 *
 * Points the embedded kernfs context at ctx->root->kf_root, selects the
 * superblock magic from fc->fs_type and calls kernfs_get_tree(). If that
 * succeeds and the context's namespace is not init_cgroup_ns, fc->root is
 * replaced with the dentry of the namespace's root cgroup. When no new
 * superblock was created, one reference on ctx->root->cgrp is dropped.
 *
 * Returns the kernfs_get_tree() result, or PTR_ERR() of the namespace
 * dentry lookup if that lookup fails.
 */
int cgroup_do_get_tree(struct fs_context *fc)
{
struct dentry *dentry;
bool new_sb = false;
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
int ret;
dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
ctx->kfc.root = ctx->root->kf_root;
if (fc->fs_type == &cgroup2_fs_type)
ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
else
ctx->kfc.magic = CGROUP_SUPER_MAGIC;
ret = kernfs_get_tree(fc);
/*
* In non-init cgroup namespace, instead of root cgroup's dentry,
* we return the dentry corresponding to the cgroupns->root_cgrp.
*/
if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
if (!ret && ctx->ns != &init_cgroup_ns) {
struct dentry *nsdentry;
struct super_block *sb = dentry->d_sb;
struct super_block *sb = fc->root->d_sb;
struct cgroup *cgrp;
mutex_lock(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
cgrp = cset_cgroup_from_root(ns->root_cset, root);
cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
nsdentry = kernfs_node_dentry(cgrp->kn, sb);
dput(dentry);
if (IS_ERR(nsdentry))
dput(fc->root);
/*
 * NOTE(review): when kernfs_node_dentry() fails, fc->root is left
 * holding an ERR_PTR value rather than NULL or a valid dentry.
 * Confirm that no later consumer (e.g. fs_context teardown) treats
 * fc->root as a dentry on this failure path.
 */
fc->root = nsdentry;
if (IS_ERR(nsdentry)) {
ret = PTR_ERR(nsdentry);
deactivate_locked_super(sb);
dentry = nsdentry;
}
}
if (!new_sb)
cgroup_put(&root->cgrp);
/* An existing superblock was reused: drop the reference on the root cgroup */
if (!ctx->kfc.new_sb_created)
cgroup_put(&ctx->root->cgrp);
return dentry;
return ret;
}
static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
void *data)
/*
* Destroy a cgroup filesystem context.
*/
static void cgroup_fs_context_free(struct fs_context *fc)
{
struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
struct dentry *dentry;
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
kfree(ctx->name);
kfree(ctx->release_agent);
put_cgroup_ns(ctx->ns);
kernfs_free_fs_context(fc);
kfree(ctx);
}
static int cgroup_get_tree(struct fs_context *fc)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
int ret;
get_cgroup_ns(ns);
cgrp_dfl_visible = true;
cgroup_get_live(&cgrp_dfl_root.cgrp);
ctx->root = &cgrp_dfl_root;
/* Check if the caller has permission to mount. */
if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
put_cgroup_ns(ns);
return ERR_PTR(-EPERM);
}
ret = cgroup_do_get_tree(fc);
if (!ret)
apply_cgroup_root_flags(ctx->flags);
return ret;
}
/*
 * fs_context operations for cgroup2; cgroup_init_fs_context() installs
 * these when fc->fs_type is cgroup2_fs_type.
 */
static const struct fs_context_operations cgroup_fs_context_ops = {
.free = cgroup_fs_context_free,
.parse_param = cgroup2_parse_param,
.get_tree = cgroup_get_tree,
.reconfigure = cgroup_reconfigure,
};
/*
 * fs_context operations for cgroup1; cgroup_init_fs_context() installs
 * these for any fs_type other than cgroup2_fs_type. The free callback is
 * shared with cgroup2.
 */
static const struct fs_context_operations cgroup1_fs_context_ops = {
.free = cgroup_fs_context_free,
.parse_param = cgroup1_parse_param,
.get_tree = cgroup1_get_tree,
.reconfigure = cgroup1_reconfigure,
};
/*
* Initialise the cgroup filesystem creation/reconfiguration context. Notably,
* we select the namespace we're going to use.
*/
static int cgroup_init_fs_context(struct fs_context *fc)
{
struct cgroup_fs_context *ctx;
ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
if (!ctx)
return -ENOMEM;
/*
* The first time anyone tries to mount a cgroup, enable the list
@@ -2090,29 +2142,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
if (!use_task_css_set_links)
cgroup_enable_task_cg_lists();
if (fs_type == &cgroup2_fs_type) {
unsigned int root_flags;
ret = parse_cgroup_root_flags(data, &root_flags);
if (ret) {
put_cgroup_ns(ns);
return ERR_PTR(ret);
}
cgrp_dfl_visible = true;
cgroup_get_live(&cgrp_dfl_root.cgrp);
dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
CGROUP2_SUPER_MAGIC, ns);
if (!IS_ERR(dentry))
apply_cgroup_root_flags(root_flags);
} else {
dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
CGROUP_SUPER_MAGIC, ns);
}
put_cgroup_ns(ns);
return dentry;
ctx->ns = current->nsproxy->cgroup_ns;
get_cgroup_ns(ctx->ns);
fc->fs_private = &ctx->kfc;
if (fc->fs_type == &cgroup2_fs_type)
fc->ops = &cgroup_fs_context_ops;
else
fc->ops = &cgroup1_fs_context_ops;
if (fc->user_ns)
put_user_ns(fc->user_ns);
fc->user_ns = get_user_ns(ctx->ns->user_ns);
fc->global = true;
return 0;
}
static void cgroup_kill_sb(struct super_block *sb)
@@ -2135,17 +2176,19 @@ static void cgroup_kill_sb(struct super_block *sb)
}
struct file_system_type cgroup_fs_type = {
.name = "cgroup",
.mount = cgroup_mount,
.kill_sb = cgroup_kill_sb,
.fs_flags = FS_USERNS_MOUNT,
.name = "cgroup",
.init_fs_context = cgroup_init_fs_context,
.parameters = &cgroup1_fs_parameters,
.kill_sb = cgroup_kill_sb,
.fs_flags = FS_USERNS_MOUNT,
};
static struct file_system_type cgroup2_fs_type = {
.name = "cgroup2",
.mount = cgroup_mount,
.kill_sb = cgroup_kill_sb,
.fs_flags = FS_USERNS_MOUNT,
.name = "cgroup2",
.init_fs_context = cgroup_init_fs_context,
.parameters = &cgroup2_fs_parameters,
.kill_sb = cgroup_kill_sb,
.fs_flags = FS_USERNS_MOUNT,
};
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
@@ -5280,7 +5323,6 @@ int cgroup_rmdir(struct kernfs_node *kn)
static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
.show_options = cgroup_show_options,
.remount_fs = cgroup_remount,
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
.show_path = cgroup_show_path,
@@ -5347,11 +5389,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
*/
int __init cgroup_init_early(void)
{
static struct cgroup_sb_opts __initdata opts;
static struct cgroup_fs_context __initdata ctx;
struct cgroup_subsys *ss;
int i;
init_cgroup_root(&cgrp_dfl_root, &opts);
ctx.root = &cgrp_dfl_root;
init_cgroup_root(&ctx);
cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
RCU_INIT_POINTER(init_task.cgroups, &init_css_set);

View File

@@ -39,6 +39,7 @@
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/fs_context.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
@@ -359,25 +360,52 @@ static inline bool is_in_v2_mode(void)
* users. If someone tries to mount the "cpuset" filesystem, we
* silently switch it to mount "cgroup" instead
*/
static struct dentry *cpuset_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name, void *data)
static int cpuset_get_tree(struct fs_context *fc)
{
struct file_system_type *cgroup_fs = get_fs_type("cgroup");
struct dentry *ret = ERR_PTR(-ENODEV);
if (cgroup_fs) {
char mountopts[] =
"cpuset,noprefix,"
"release_agent=/sbin/cpuset_release_agent";
ret = cgroup_fs->mount(cgroup_fs, flags,
unused_dev_name, mountopts);
put_filesystem(cgroup_fs);
struct file_system_type *cgroup_fs;
struct fs_context *new_fc;
int ret;
cgroup_fs = get_fs_type("cgroup");
if (!cgroup_fs)
return -ENODEV;
new_fc = fs_context_for_mount(cgroup_fs, fc->sb_flags);
if (IS_ERR(new_fc)) {
ret = PTR_ERR(new_fc);
} else {
static const char agent_path[] = "/sbin/cpuset_release_agent";
ret = vfs_parse_fs_string(new_fc, "cpuset", NULL, 0);
if (!ret)
ret = vfs_parse_fs_string(new_fc, "noprefix", NULL, 0);
if (!ret)
ret = vfs_parse_fs_string(new_fc, "release_agent",
agent_path, sizeof(agent_path) - 1);
if (!ret)
ret = vfs_get_tree(new_fc);
if (!ret) { /* steal the result */
fc->root = new_fc->root;
new_fc->root = NULL;
}
put_fs_context(new_fc);
}
put_filesystem(cgroup_fs);
return ret;
}
/*
 * fs_context operations for the legacy "cpuset" filesystem type. Only
 * get_tree is provided; cpuset_get_tree() performs the mount through a
 * separate "cgroup" fs_context.
 */
static const struct fs_context_operations cpuset_fs_context_ops = {
.get_tree = cpuset_get_tree,
};
/*
 * Initialise a "cpuset" filesystem context by installing
 * cpuset_fs_context_ops. Always returns 0.
 */
static int cpuset_init_fs_context(struct fs_context *fc)
{
fc->ops = &cpuset_fs_context_ops;
return 0;
}
static struct file_system_type cpuset_fs_type = {
.name = "cpuset",
.mount = cpuset_mount,
.name = "cpuset",
.init_fs_context = cpuset_init_fs_context,
};
/*