Merge branch 'for-3.10' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:

 - Fixes and a lot of cleanups.  Locking cleanup is finally complete.
   cgroup_mutex is no longer exposed to individual controlelrs which
   used to cause nasty deadlock issues.  Li fixed and cleaned up quite a
   bit including long standing ones like racy cgroup_path().

 - device cgroup now supports proper hierarchy thanks to Aristeu.

 - perf_event cgroup now supports proper hierarchy.

 - A new mount option "__DEVEL__sane_behavior" is added.  As indicated
   by the name, this option is to be used for development only at this
   point and generates a warning message when used.  Unfortunately,
   cgroup interface currently has too many brekages and inconsistencies
   to implement a consistent and unified hierarchy on top.  The new flag
   is used to collect the behavior changes which are necessary to
   implement consistent unified hierarchy.  It's likely that this flag
   won't be used verbatim when it becomes ready but will be enabled
   implicitly along with unified hierarchy.

   The option currently disables some of broken behaviors in cgroup core
   and also .use_hierarchy switch in memcg (will be routed through -mm),
   which can be used to make very unusual hierarchy where nesting is
   partially honored.  It will also be used to implement hierarchy
   support for blk-throttle which would be impossible otherwise without
   introducing a full separate set of control knobs.

   This is essentially versioning of interface which isn't very nice but
   at this point I can't see any other options which would allow keeping
   the interface the same while moving towards hierarchy behavior which
   is at least somewhat sane.  The planned unified hierarchy is likely
   to require some level of adaptation from userland anyway, so I think
   it'd be best to take the chance and update the interface such that
   it's supportable in the long term.

   Maintaining the existing interface does complicate cgroup core but
   shouldn't put too much strain on individual controllers and I think
   it'd be manageable for the foreseeable future.  Maybe we'll be able
   to drop it in a decade.

Fix up conflicts (including a semantic one adding a new #include to ppc
that was uncovered by header the file changes) as per Tejun.

* 'for-3.10' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (45 commits)
  cpuset: fix compile warning when CONFIG_SMP=n
  cpuset: fix cpu hotplug vs rebuild_sched_domains() race
  cpuset: use rebuild_sched_domains() in cpuset_hotplug_workfn()
  cgroup: restore the call to eventfd->poll()
  cgroup: fix use-after-free when umounting cgroupfs
  cgroup: fix broken file xattrs
  devcg: remove parent_cgroup.
  memcg: force use_hierarchy if sane_behavior
  cgroup: remove cgrp->top_cgroup
  cgroup: introduce sane_behavior mount option
  move cgroupfs_root to include/linux/cgroup.h
  cgroup: convert cgroupfs_root flag bits to masks and add CGRP_ prefix
  cgroup: make cgroup_path() not print double slashes
  Revert "cgroup: remove bind() method from cgroup_subsys."
  perf: make perf_event cgroup hierarchical
  cgroup: implement cgroup_is_descendant()
  cgroup: make sure parent won't be destroyed before its children
  cgroup: remove bind() method from cgroup_subsys.
  devcg: remove broken_hierarchy tag
  cgroup: remove cgroup_lock_is_held()
  ...
This commit is contained in:
Linus Torvalds
2013-04-29 19:14:20 -07:00
12 changed files with 832 additions and 639 deletions

View File

@@ -264,17 +264,6 @@ static struct cpuset top_cpuset = {
static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_MUTEX(callback_mutex);
/*
* cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
* buffers. They are statically allocated to prevent using excess stack
* when calling cpuset_print_task_mems_allowed().
*/
#define CPUSET_NAME_LEN (128)
#define CPUSET_NODELIST_LEN (256)
static char cpuset_name[CPUSET_NAME_LEN];
static char cpuset_nodelist[CPUSET_NODELIST_LEN];
static DEFINE_SPINLOCK(cpuset_buffer_lock);
/*
* CPU / memory hotplug is handled asynchronously.
*/
@@ -780,25 +769,26 @@ static void rebuild_sched_domains_locked(void)
lockdep_assert_held(&cpuset_mutex);
get_online_cpus();
/*
* We have raced with CPU hotplug. Don't do anything to avoid
* passing doms with offlined cpu to partition_sched_domains().
* Anyways, hotplug work item will rebuild sched domains.
*/
if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
goto out;
/* Generate domain masks and attrs */
ndoms = generate_sched_domains(&doms, &attr);
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
out:
put_online_cpus();
}
#else /* !CONFIG_SMP */
static void rebuild_sched_domains_locked(void)
{
}
static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
{
*domains = NULL;
return 1;
}
#endif /* CONFIG_SMP */
void rebuild_sched_domains(void)
@@ -2005,50 +1995,6 @@ int __init cpuset_init(void)
return 0;
}
/**
* cpuset_do_move_task - move a given task to another cpuset
* @tsk: pointer to task_struct the task to move
* @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
*
* Called by cgroup_scan_tasks() for each task in a cgroup.
* Return nonzero to stop the walk through the tasks.
*/
static void cpuset_do_move_task(struct task_struct *tsk,
struct cgroup_scanner *scan)
{
struct cgroup *new_cgroup = scan->data;
cgroup_lock();
cgroup_attach_task(new_cgroup, tsk);
cgroup_unlock();
}
/**
* move_member_tasks_to_cpuset - move tasks from one cpuset to another
* @from: cpuset in which the tasks currently reside
* @to: cpuset to which the tasks will be moved
*
* Called with cpuset_mutex held
* callback_mutex must not be held, as cpuset_attach() will take it.
*
* The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
* calling callback functions for each.
*/
static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
{
struct cgroup_scanner scan;
scan.cg = from->css.cgroup;
scan.test_task = NULL; /* select all tasks in cgroup */
scan.process_task = cpuset_do_move_task;
scan.heap = NULL;
scan.data = to->css.cgroup;
if (cgroup_scan_tasks(&scan))
printk(KERN_ERR "move_member_tasks_to_cpuset: "
"cgroup_scan_tasks failed\n");
}
/*
* If CPU and/or memory hotplug handlers, below, unplug any CPUs
* or memory nodes, we need to walk over the cpuset hierarchy,
@@ -2069,7 +2015,12 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
nodes_empty(parent->mems_allowed))
parent = parent_cs(parent);
move_member_tasks_to_cpuset(cs, parent);
if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
rcu_read_lock();
printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n",
cgroup_name(cs->css.cgroup));
rcu_read_unlock();
}
}
/**
@@ -2222,17 +2173,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
flush_workqueue(cpuset_propagate_hotplug_wq);
/* rebuild sched domains if cpus_allowed has changed */
if (cpus_updated) {
struct sched_domain_attr *attr;
cpumask_var_t *doms;
int ndoms;
mutex_lock(&cpuset_mutex);
ndoms = generate_sched_domains(&doms, &attr);
mutex_unlock(&cpuset_mutex);
partition_sched_domains(ndoms, doms, attr);
}
if (cpus_updated)
rebuild_sched_domains();
}
void cpuset_update_active_cpus(bool cpu_online)
@@ -2594,6 +2536,8 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}
#define CPUSET_NODELIST_LEN (256)
/**
* cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
* @task: pointer to task_struct of some task.
@@ -2604,25 +2548,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
*/
void cpuset_print_task_mems_allowed(struct task_struct *tsk)
{
struct dentry *dentry;
/* Statically allocated to prevent using excess stack. */
static char cpuset_nodelist[CPUSET_NODELIST_LEN];
static DEFINE_SPINLOCK(cpuset_buffer_lock);
dentry = task_cs(tsk)->css.cgroup->dentry;
struct cgroup *cgrp = task_cs(tsk)->css.cgroup;
rcu_read_lock();
spin_lock(&cpuset_buffer_lock);
if (!dentry) {
strcpy(cpuset_name, "/");
} else {
spin_lock(&dentry->d_lock);
strlcpy(cpuset_name, (const char *)dentry->d_name.name,
CPUSET_NAME_LEN);
spin_unlock(&dentry->d_lock);
}
nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
tsk->mems_allowed);
printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
tsk->comm, cpuset_name, cpuset_nodelist);
tsk->comm, cgroup_name(cgrp), cpuset_nodelist);
spin_unlock(&cpuset_buffer_lock);
rcu_read_unlock();
}
/*