Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace

Pull user namespace updates from Eric Biederman:
 "Long ago and far away when user namespaces where young it was realized
  that allowing fresh mounts of proc and sysfs with only user namespace
  permissions could violate the basic rule that only root gets to decide
  if proc or sysfs should be mounted at all.

  Some hacks were put in place to reduce the worst of the damage could
  be done, and the common sense rule was adopted that fresh mounts of
  proc and sysfs should allow no more than bind mounts of proc and
  sysfs.  Unfortunately that rule has not been fully enforced.

  There are two kinds of gaps in that enforcement.  Only filesystems
  mounted on empty directories of proc and sysfs should be ignored but
  the test for empty directories was insufficient.  So in my tree
  directories on proc, sysctl and sysfs that will always be empty are
  created specially.  Every other technique is imperfect as an ordinary
  directory can have entries added even after a readdir returns and
  shows that the directory is empty.  Special creation of directories
  for mount points makes the code in the kernel a smidge clearer about
  it's purpose.  I asked container developers from the various container
  projects to help test this and no holes were found in the set of mount
  points on proc and sysfs that are created specially.

  This set of changes also starts enforcing the mount flags of fresh
  mounts of proc and sysfs are consistent with the existing mount of
  proc and sysfs.  I expected this to be the boring part of the work but
  unfortunately unprivileged userspace winds up mounting fresh copies of
  proc and sysfs with noexec and nosuid clear when root set those flags
  on the previous mount of proc and sysfs.  So for now only the atime,
  read-only and nodev attributes which userspace happens to keep
  consistent are enforced.  Dealing with the noexec and nosuid
  attributes remains for another time.

  This set of changes also addresses an issue with how open file
  descriptors from /proc/<pid>/ns/* are displayed.  Recently readlink of
  /proc/<pid>/fd has been triggering a WARN_ON that has not been
  meaningful since it was added (as all of the code in the kernel was
  converted) and is not now actively wrong.

  There is also a short list of issues that have not been fixed yet that
  I will mention briefly.

  It is possible to rename a directory from below to above a bind mount.
  At which point any directory pointers below the renamed directory can
  be walked up to the root directory of the filesystem.  With user
  namespaces enabled a bind mount of the bind mount can be created
  allowing the user to pick a directory whose children they can rename
  to outside of the bind mount.  This is challenging to fix and doubly
  so because all obvious solutions must touch code that is in the
  performance part of pathname resolution.

  As mentioned above there is also a question of how to ensure that
  developers by accident or with purpose do not introduce exectuable
  files on sysfs and proc and in doing so introduce security regressions
  in the current userspace that will not be immediately obvious and as
  such are likely to require breaking userspace in painful ways once
  they are recognized"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace:
  vfs: Remove incorrect debugging WARN in prepend_path
  mnt: Update fs_fully_visible to test for permanently empty directories
  sysfs: Create mountpoints with sysfs_create_mount_point
  sysfs: Add support for permanently empty directories to serve as mount points.
  kernfs: Add support for always empty directories.
  proc: Allow creating permanently empty directories that serve as mount points
  sysctl: Allow creating permanently empty directories that serve as mountpoints.
  fs: Add helper functions for permanently empty directories.
  vfs: Ignore unlocked mounts in fs_fully_visible
  mnt: Modify fs_fully_visible to deal with locked ro nodev and atime
  mnt: Refactor the logic for mounting sysfs and proc in a user namespace
This commit is contained in:
Linus Torvalds
2015-07-03 15:20:57 -07:00
bovenliggende 2fee94b74b 93e3bce628
commit 0cbee99269
28 gewijzigde bestanden met toevoegingen van 340 en 101 verwijderingen

Bestand weergeven

@@ -373,6 +373,10 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
WARN(1, "create '/proc/%s' by hand\n", qstr.name);
return NULL;
}
if (is_empty_pde(*parent)) {
WARN(1, "attempt to add to permanently empty directory");
return NULL;
}
ent = kzalloc(sizeof(struct proc_dir_entry) + qstr.len + 1, GFP_KERNEL);
if (!ent)
@@ -455,6 +459,25 @@ struct proc_dir_entry *proc_mkdir(const char *name,
}
EXPORT_SYMBOL(proc_mkdir);
struct proc_dir_entry *proc_create_mount_point(const char *name)
{
umode_t mode = S_IFDIR | S_IRUGO | S_IXUGO;
struct proc_dir_entry *ent, *parent = NULL;
ent = __proc_create(&parent, name, mode, 2);
if (ent) {
ent->data = NULL;
ent->proc_fops = NULL;
ent->proc_iops = NULL;
if (proc_register(parent, ent) < 0) {
kfree(ent);
parent->nlink--;
ent = NULL;
}
}
return ent;
}
struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
struct proc_dir_entry *parent,
const struct file_operations *proc_fops,

Bestand weergeven

@@ -422,6 +422,10 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
PROC_I(inode)->pde = de;
if (is_empty_pde(de)) {
make_empty_dir_inode(inode);
return inode;
}
if (de->mode) {
inode->i_mode = de->mode;
inode->i_uid = de->uid;

Bestand weergeven

@@ -191,6 +191,12 @@ static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
}
extern void pde_put(struct proc_dir_entry *);
static inline bool is_empty_pde(const struct proc_dir_entry *pde)
{
return S_ISDIR(pde->mode) && !pde->proc_iops;
}
struct proc_dir_entry *proc_create_mount_point(const char *name);
/*
* inode.c
*/

Bestand weergeven

@@ -19,6 +19,28 @@ static const struct inode_operations proc_sys_inode_operations;
static const struct file_operations proc_sys_dir_file_operations;
static const struct inode_operations proc_sys_dir_operations;
/* Support for permanently empty directories */
struct ctl_table sysctl_mount_point[] = {
{ }
};
static bool is_empty_dir(struct ctl_table_header *head)
{
return head->ctl_table[0].child == sysctl_mount_point;
}
static void set_empty_dir(struct ctl_dir *dir)
{
dir->header.ctl_table[0].child = sysctl_mount_point;
}
static void clear_empty_dir(struct ctl_dir *dir)
{
dir->header.ctl_table[0].child = NULL;
}
void proc_sys_poll_notify(struct ctl_table_poll *poll)
{
if (!poll)
@@ -187,6 +209,17 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
struct ctl_table *entry;
int err;
/* Is this a permanently empty directory? */
if (is_empty_dir(&dir->header))
return -EROFS;
/* Am I creating a permanently empty directory? */
if (header->ctl_table == sysctl_mount_point) {
if (!RB_EMPTY_ROOT(&dir->root))
return -EINVAL;
set_empty_dir(dir);
}
dir->header.nreg++;
header->parent = dir;
err = insert_links(header);
@@ -202,6 +235,8 @@ fail:
erase_header(header);
put_links(header);
fail_links:
if (header->ctl_table == sysctl_mount_point)
clear_empty_dir(dir);
header->parent = NULL;
drop_sysctl_table(&dir->header);
return err;
@@ -419,6 +454,8 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
inode->i_mode |= S_IFDIR;
inode->i_op = &proc_sys_dir_operations;
inode->i_fop = &proc_sys_dir_file_operations;
if (is_empty_dir(head))
make_empty_dir_inode(inode);
}
out:
return inode;

Bestand weergeven

@@ -112,9 +112,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
ns = task_active_pid_ns(current);
options = data;
if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type))
return ERR_PTR(-EPERM);
/* Does the mounter have privilege over the pid namespace? */
if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
@@ -159,7 +156,7 @@ static struct file_system_type proc_fs_type = {
.name = "proc",
.mount = proc_mount,
.kill_sb = proc_kill_sb,
.fs_flags = FS_USERNS_MOUNT,
.fs_flags = FS_USERNS_VISIBLE | FS_USERNS_MOUNT,
};
void __init proc_root_init(void)
@@ -182,10 +179,10 @@ void __init proc_root_init(void)
#endif
proc_mkdir("fs", NULL);
proc_mkdir("driver", NULL);
proc_mkdir("fs/nfsd", NULL); /* somewhere for the nfsd filesystem to be mounted */
proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */
#if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE)
/* just give it a mountpoint */
proc_mkdir("openprom", NULL);
proc_create_mount_point("openprom");
#endif
proc_tty_init();
proc_mkdir("bus", NULL);