Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace

Pull user namespace changes from Eric Biederman:
 "While small this set of changes is very significant with respect to
  containers in general and user namespaces in particular.  The user
  space interface is now complete.

  This set of changes adds support for unprivileged users to create user
  namespaces and as a user namespace root to create other namespaces.
  The tyranny of supporting suid root preventing unprivileged users from
  using cool new kernel features is broken.

  This set of changes completes the work on setns, adding support for
  the pid, user, mount namespaces.

  This set of changes includes a bunch of basic pid namespace
  cleanups/simplifications.  Of particular significance is the rework of
  the pid namespace cleanup so it no longer requires sending out
  tendrils into all kinds of unexpected cleanup paths for operation.  At
  least one case of broken error handling is fixed by this cleanup.

  The files under /proc/<pid>/ns/ have been converted from regular files
  to magic symlinks which prevents incorrect caching by the VFS,
  ensuring the files always refer to the namespace the process is
  currently using and ensuring that the ptrace_mayaccess permission
  checks are always applied.

  The files under /proc/<pid>/ns/ have been given stable inode numbers
  so it is now possible to see if different processes share the same
  namespaces.

  Through the David Miller's net tree are changes to relax many of the
  permission checks in the networking stack to allowing the user
  namespace root to usefully use the networking stack.  Similar changes
  for the mount namespace and the pid namespace are coming through my
  tree.

  Two small changes to add user namespace support were commited here adn
  in David Miller's -net tree so that I could complete the work on the
  /proc/<pid>/ns/ files in this tree.

  Work remains to make it safe to build user namespaces and 9p, afs,
  ceph, cifs, coda, gfs2, ncpfs, nfs, nfsd, ocfs2, and xfs so the
  Kconfig guard remains in place preventing that user namespaces from
  being built when any of those filesystems are enabled.

  Future design work remains to allow root users outside of the initial
  user namespace to mount more than just /proc and /sys."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: (38 commits)
  proc: Usable inode numbers for the namespace file descriptors.
  proc: Fix the namespace inode permission checks.
  proc: Generalize proc inode allocation
  userns: Allow unprivilged mounts of proc and sysfs
  userns: For /proc/self/{uid,gid}_map derive the lower userns from the struct file
  procfs: Print task uids and gids in the userns that opened the proc file
  userns: Implement unshare of the user namespace
  userns: Implent proc namespace operations
  userns: Kill task_user_ns
  userns: Make create_new_namespaces take a user_ns parameter
  userns: Allow unprivileged use of setns.
  userns: Allow unprivileged users to create new namespaces
  userns: Allow setting a userns mapping to your current uid.
  userns: Allow chown and setgid preservation
  userns: Allow unprivileged users to create user namespaces.
  userns: Ignore suid and sgid on binaries if the uid or gid can not be mapped
  userns: fix return value on mntns_install() failure
  vfs: Allow unprivileged manipulation of the mount namespace.
  vfs: Only support slave subtrees across different user namespaces
  vfs: Add a user namespace reference from struct mnt_namespace
  ...
This commit is contained in:
Linus Torvalds
2012-12-17 15:44:47 -08:00
59 changed files with 996 additions and 451 deletions

View File

@@ -2345,146 +2345,6 @@ static const struct file_operations proc_coredump_filter_operations = {
};
#endif
/*
* /proc/self:
*/
static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
int buflen)
{
struct pid_namespace *ns = dentry->d_sb->s_fs_info;
pid_t tgid = task_tgid_nr_ns(current, ns);
char tmp[PROC_NUMBUF];
if (!tgid)
return -ENOENT;
sprintf(tmp, "%d", tgid);
return vfs_readlink(dentry,buffer,buflen,tmp);
}
static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
{
struct pid_namespace *ns = dentry->d_sb->s_fs_info;
pid_t tgid = task_tgid_nr_ns(current, ns);
char *name = ERR_PTR(-ENOENT);
if (tgid) {
/* 11 for max length of signed int in decimal + NULL term */
name = kmalloc(12, GFP_KERNEL);
if (!name)
name = ERR_PTR(-ENOMEM);
else
sprintf(name, "%d", tgid);
}
nd_set_link(nd, name);
return NULL;
}
static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
void *cookie)
{
char *s = nd_get_link(nd);
if (!IS_ERR(s))
kfree(s);
}
static const struct inode_operations proc_self_inode_operations = {
.readlink = proc_self_readlink,
.follow_link = proc_self_follow_link,
.put_link = proc_self_put_link,
};
/*
* proc base
*
* These are the directory entries in the root directory of /proc
* that properly belong to the /proc filesystem, as they describe
* describe something that is process related.
*/
static const struct pid_entry proc_base_stuff[] = {
NOD("self", S_IFLNK|S_IRWXUGO,
&proc_self_inode_operations, NULL, {}),
};
static struct dentry *proc_base_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
{
const struct pid_entry *p = ptr;
struct inode *inode;
struct proc_inode *ei;
struct dentry *error;
/* Allocate the inode */
error = ERR_PTR(-ENOMEM);
inode = new_inode(dir->i_sb);
if (!inode)
goto out;
/* Initialize the inode */
ei = PROC_I(inode);
inode->i_ino = get_next_ino();
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
/*
* grab the reference to the task.
*/
ei->pid = get_task_pid(task, PIDTYPE_PID);
if (!ei->pid)
goto out_iput;
inode->i_mode = p->mode;
if (S_ISDIR(inode->i_mode))
set_nlink(inode, 2);
if (S_ISLNK(inode->i_mode))
inode->i_size = 64;
if (p->iop)
inode->i_op = p->iop;
if (p->fop)
inode->i_fop = p->fop;
ei->op = p->op;
d_add(dentry, inode);
error = NULL;
out:
return error;
out_iput:
iput(inode);
goto out;
}
static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry)
{
struct dentry *error;
struct task_struct *task = get_proc_task(dir);
const struct pid_entry *p, *last;
error = ERR_PTR(-ENOENT);
if (!task)
goto out_no_task;
/* Lookup the directory entry */
last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1];
for (p = proc_base_stuff; p <= last; p++) {
if (p->len != dentry->d_name.len)
continue;
if (!memcmp(dentry->d_name.name, p->name, p->len))
break;
}
if (p > last)
goto out;
error = proc_base_instantiate(dir, dentry, task, p);
out:
put_task_struct(task);
out_no_task:
return error;
}
static int proc_base_fill_cache(struct file *filp, void *dirent,
filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
{
return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
proc_base_instantiate, task, p);
}
#ifdef CONFIG_TASK_IO_ACCOUNTING
static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
{
@@ -2839,10 +2699,6 @@ void proc_flush_task(struct task_struct *task)
proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
tgid->numbers[i].nr);
}
upid = &pid->numbers[pid->level];
if (upid->nr == 1)
pid_ns_release_proc(upid->ns);
}
static struct dentry *proc_pid_instantiate(struct inode *dir,
@@ -2876,15 +2732,11 @@ out:
struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
{
struct dentry *result;
struct dentry *result = NULL;
struct task_struct *task;
unsigned tgid;
struct pid_namespace *ns;
result = proc_base_lookup(dir, dentry);
if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT)
goto out;
tgid = name_to_int(dentry);
if (tgid == ~0U)
goto out;
@@ -2947,7 +2799,7 @@ retry:
return iter;
}
#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff))
#define TGID_OFFSET (FIRST_PROCESS_ENTRY)
static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
struct tgid_iter iter)
@@ -2967,25 +2819,12 @@ static int fake_filldir(void *buf, const char *name, int namelen,
/* for the /proc/ directory itself, after non-process stuff has been done */
int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
{
unsigned int nr;
struct task_struct *reaper;
struct tgid_iter iter;
struct pid_namespace *ns;
filldir_t __filldir;
if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
goto out_no_task;
nr = filp->f_pos - FIRST_PROCESS_ENTRY;
reaper = get_proc_task(filp->f_path.dentry->d_inode);
if (!reaper)
goto out_no_task;
for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) {
const struct pid_entry *p = &proc_base_stuff[nr];
if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0)
goto out;
}
goto out;
ns = filp->f_dentry->d_sb->s_fs_info;
iter.task = NULL;
@@ -3006,8 +2845,6 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
}
filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
out:
put_task_struct(reaper);
out_no_task:
return 0;
}