@@ -63,7 +63,24 @@
 
 #include <linux/atomic.h>
 
+/*
+ * cgroup_mutex is the master lock. Any modification to cgroup or its
+ * hierarchy must be performed while holding it.
+ *
+ * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
+ * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
+ * release_agent_path and so on. Modifying requires both cgroup_mutex and
+ * cgroup_root_mutex. Readers can acquire either of the two. This is to
+ * break the following locking order cycle.
+ *
+ * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
+ * B. namespace_sem -> cgroup_mutex
+ *
+ * B happens only through cgroup_show_options() and using cgroup_root_mutex
+ * breaks it.
+ */
 static DEFINE_MUTEX(cgroup_mutex);
+static DEFINE_MUTEX(cgroup_root_mutex);
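The convention this comment describes, sketched as plain C for orientation (illustration only, not part of the patch; the flag helpers are made up):

	/* writer: any change to a cgroupfs_root field needs both mutexes */
	static void example_set_root_flag(struct cgroupfs_root *root, int bit)
	{
		mutex_lock(&cgroup_mutex);
		mutex_lock(&cgroup_root_mutex);	/* always nests inside cgroup_mutex */
		set_bit(bit, &root->flags);
		mutex_unlock(&cgroup_root_mutex);
		mutex_unlock(&cgroup_mutex);
	}

	/* reader: holding either mutex alone keeps the fields stable */
	static int example_test_root_flag(struct cgroupfs_root *root, int bit)
	{
		int ret;

		mutex_lock(&cgroup_root_mutex);
		ret = test_bit(bit, &root->flags);
		mutex_unlock(&cgroup_root_mutex);
		return ret;
	}

This is why cgroup_show_options() below can switch from cgroup_mutex to cgroup_root_mutex and thereby break the A/B cycle.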
 
 /*
  * Generate an array of cgroup subsystem pointers. At boot time, this is
@@ -921,7 +938,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
  *
  * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
  */
-DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
+static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
 
 static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
 {
@@ -953,6 +970,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 	int i;
 
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
+	BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
 
 	removed_bits = root->actual_subsys_bits & ~final_bits;
 	added_bits = final_bits & ~root->actual_subsys_bits;
@@ -1043,7 +1061,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
 	struct cgroup_subsys *ss;
 
-	mutex_lock(&cgroup_mutex);
+	mutex_lock(&cgroup_root_mutex);
 	for_each_subsys(root, ss)
 		seq_printf(seq, ",%s", ss->name);
 	if (test_bit(ROOT_NOPREFIX, &root->flags))
@@ -1054,7 +1072,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 		seq_puts(seq, ",clone_children");
 	if (strlen(root->name))
 		seq_printf(seq, ",name=%s", root->name);
-	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&cgroup_root_mutex);
 	return 0;
 }
 
@@ -1175,10 +1193,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 
 	/*
 	 * If the 'all' option was specified select all the subsystems,
-	 * otherwise 'all, 'none' and a subsystem name options were not
-	 * specified, let's default to 'all'
+	 * otherwise if 'none', 'name=' and a subsystem name options
+	 * were not specified, let's default to 'all'
 	 */
-	if (all_ss || (!all_ss && !one_ss && !opts->none)) {
+	if (all_ss || (!one_ss && !opts->none && !opts->name)) {
		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys *ss = subsys[i];
 			if (ss == NULL)
@@ -1269,6 +1287,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 
 	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
 	mutex_lock(&cgroup_mutex);
+	mutex_lock(&cgroup_root_mutex);
 
 	/* See what subsystems are wanted */
 	ret = parse_cgroupfs_options(data, &opts);
@@ -1297,6 +1316,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 out_unlock:
 	kfree(opts.release_agent);
 	kfree(opts.name);
+	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
 	return ret;
@@ -1481,6 +1501,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	int ret = 0;
 	struct super_block *sb;
 	struct cgroupfs_root *new_root;
+	struct inode *inode;
 
 	/* First find the desired set of subsystems */
 	mutex_lock(&cgroup_mutex);
@@ -1514,7 +1535,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		/* We used the new root structure, so this is a new hierarchy */
 		struct list_head tmp_cg_links;
 		struct cgroup *root_cgrp = &root->top_cgroup;
-		struct inode *inode;
 		struct cgroupfs_root *existing_root;
 		const struct cred *cred;
 		int i;
@@ -1528,18 +1548,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 
 		mutex_lock(&inode->i_mutex);
 		mutex_lock(&cgroup_mutex);
+		mutex_lock(&cgroup_root_mutex);
 
-		if (strlen(root->name)) {
-			/* Check for name clashes with existing mounts */
-			for_each_active_root(existing_root) {
-				if (!strcmp(existing_root->name, root->name)) {
-					ret = -EBUSY;
-					mutex_unlock(&cgroup_mutex);
-					mutex_unlock(&inode->i_mutex);
-					goto drop_new_super;
-				}
-			}
-		}
+		/* Check for name clashes with existing mounts */
+		ret = -EBUSY;
+		if (strlen(root->name))
+			for_each_active_root(existing_root)
+				if (!strcmp(existing_root->name, root->name))
+					goto unlock_drop;
 
 		/*
 		 * We're accessing css_set_count without locking
@@ -1549,18 +1565,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		 * have some link structures left over
 		 */
 		ret = allocate_cg_links(css_set_count, &tmp_cg_links);
-		if (ret) {
-			mutex_unlock(&cgroup_mutex);
-			mutex_unlock(&inode->i_mutex);
-			goto drop_new_super;
-		}
+		if (ret)
+			goto unlock_drop;
 
 		ret = rebind_subsystems(root, root->subsys_bits);
 		if (ret == -EBUSY) {
-			mutex_unlock(&cgroup_mutex);
-			mutex_unlock(&inode->i_mutex);
 			free_cg_links(&tmp_cg_links);
-			goto drop_new_super;
+			goto unlock_drop;
 		}
 		/*
 		 * There must be no failure case after here, since rebinding
@@ -1599,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		cred = override_creds(&init_cred);
 		cgroup_populate_dir(root_cgrp);
 		revert_creds(cred);
+		mutex_unlock(&cgroup_root_mutex);
 		mutex_unlock(&cgroup_mutex);
 		mutex_unlock(&inode->i_mutex);
 	} else {
@@ -1615,6 +1627,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	kfree(opts.name);
 	return dget(sb->s_root);
 
+ unlock_drop:
+	mutex_unlock(&cgroup_root_mutex);
+	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&inode->i_mutex);
 drop_new_super:
 	deactivate_locked_super(sb);
 drop_modules:
@@ -1639,6 +1655,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
 	BUG_ON(!list_empty(&cgrp->sibling));
 
 	mutex_lock(&cgroup_mutex);
+	mutex_lock(&cgroup_root_mutex);
 
 	/* Rebind all subsystems back to the default hierarchy */
 	ret = rebind_subsystems(root, 0);
@@ -1664,6 +1681,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
 		root_count--;
 	}
 
+	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 
 	kill_litter_super(sb);
@@ -1739,12 +1757,91 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 }
 EXPORT_SYMBOL_GPL(cgroup_path);
 
+/*
+ * Control Group taskset
+ */
+struct task_and_cgroup {
+	struct task_struct *task;
+	struct cgroup *cgrp;
+};
+
+struct cgroup_taskset {
+	struct task_and_cgroup single;
+	struct flex_array *tc_array;
+	int tc_array_len;
+	int idx;
+	struct cgroup *cur_cgrp;
+};
+
+/**
+ * cgroup_taskset_first - reset taskset and return the first task
+ * @tset: taskset of interest
+ *
+ * @tset iteration is initialized and the first task is returned.
+ */
+struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
+{
+	if (tset->tc_array) {
+		tset->idx = 0;
+		return cgroup_taskset_next(tset);
+	} else {
+		tset->cur_cgrp = tset->single.cgrp;
+		return tset->single.task;
+	}
+}
+EXPORT_SYMBOL_GPL(cgroup_taskset_first);
+
+/**
+ * cgroup_taskset_next - iterate to the next task in taskset
+ * @tset: taskset of interest
+ *
+ * Return the next task in @tset. Iteration must have been initialized
+ * with cgroup_taskset_first().
+ */
+struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
+{
+	struct task_and_cgroup *tc;
+
+	if (!tset->tc_array || tset->idx >= tset->tc_array_len)
+		return NULL;
+
+	tc = flex_array_get(tset->tc_array, tset->idx++);
+	tset->cur_cgrp = tc->cgrp;
+	return tc->task;
+}
+EXPORT_SYMBOL_GPL(cgroup_taskset_next);
+
+/**
+ * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task
+ * @tset: taskset of interest
+ *
+ * Return the cgroup for the current (last returned) task of @tset. This
+ * function must be preceded by either cgroup_taskset_first() or
+ * cgroup_taskset_next().
+ */
+struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
+{
+	return tset->cur_cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);
+
+/**
+ * cgroup_taskset_size - return the number of tasks in taskset
+ * @tset: taskset of interest
+ */
+int cgroup_taskset_size(struct cgroup_taskset *tset)
+{
+	return tset->tc_array ? tset->tc_array_len : 1;
+}
+EXPORT_SYMBOL_GPL(cgroup_taskset_size);
+
+
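To see how a controller is expected to consume this iterator, here is a hedged sketch of a hypothetical subsystem's ->can_attach() (illustration only, not part of the patch; the PF_KTHREAD policy check is invented):

	static int example_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
				      struct cgroup_taskset *tset)
	{
		struct task_struct *task;

		/* cgroup_taskset_first() resets the cursor; _next() walks the rest */
		for (task = cgroup_taskset_first(tset); task;
		     task = cgroup_taskset_next(tset)) {
			/* old cgroup matching the task that was just returned */
			struct cgroup *old_cgrp = cgroup_taskset_cur_cgroup(tset);

			if (old_cgrp == cgrp)
				continue;		/* already there, nothing to vet */
			if (task->flags & PF_KTHREAD)	/* made-up policy */
				return -EINVAL;
		}
		return 0;
	}

The same tset is then passed unchanged to ->attach() and ->cancel_attach(), so a subsystem sees an identical task list at every stage.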
 /*
  * cgroup_task_migrate - move a task from one cgroup to another.
  *
  * 'guarantee' is set if the caller promises that a new css_set for the task
  * will already exist. If not set, this function might sleep, and can fail with
- * -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
  */
 static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
 			       struct task_struct *tsk, bool guarantee)
@@ -1753,14 +1850,12 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
 	struct css_set *newcg;
 
 	/*
-	 * get old css_set. we need to take task_lock and refcount it, because
-	 * an exiting task can change its css_set to init_css_set and drop its
-	 * old one without taking cgroup_mutex.
+	 * We are synchronized through threadgroup_lock() against PF_EXITING
+	 * setting such that we can't race against cgroup_exit() changing the
+	 * css_set to init_css_set and dropping the old one.
 	 */
-	task_lock(tsk);
+	WARN_ON_ONCE(tsk->flags & PF_EXITING);
 	oldcg = tsk->cgroups;
-	get_css_set(oldcg);
-	task_unlock(tsk);
 
 	/* locate or allocate a new css_set for this task. */
 	if (guarantee) {
@@ -1775,20 +1870,11 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
 		might_sleep();
 		/* find_css_set will give us newcg already referenced. */
 		newcg = find_css_set(oldcg, cgrp);
-		if (!newcg) {
-			put_css_set(oldcg);
+		if (!newcg)
 			return -ENOMEM;
-		}
 	}
-	put_css_set(oldcg);
 
-	/* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
 	task_lock(tsk);
-	if (tsk->flags & PF_EXITING) {
-		task_unlock(tsk);
-		put_css_set(newcg);
-		return -ESRCH;
-	}
 	rcu_assign_pointer(tsk->cgroups, newcg);
 	task_unlock(tsk);
 
@@ -1814,8 +1900,8 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
  * @cgrp: the cgroup the task is attaching to
  * @tsk: the task to be attached
  *
- * Call holding cgroup_mutex. May take task_lock of
- * the task 'tsk' during call.
+ * Call with cgroup_mutex and threadgroup locked. May take task_lock of
+ * @tsk during call.
  */
 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
@@ -1823,15 +1909,23 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	struct cgroup_subsys *ss, *failed_ss = NULL;
 	struct cgroup *oldcgrp;
 	struct cgroupfs_root *root = cgrp->root;
+	struct cgroup_taskset tset = { };
+
+	/* @tsk either already exited or can't exit until the end */
+	if (tsk->flags & PF_EXITING)
+		return -ESRCH;
 
 	/* Nothing to do if the task is already in that cgroup */
 	oldcgrp = task_cgroup_from_root(tsk, root);
 	if (cgrp == oldcgrp)
 		return 0;
 
+	tset.single.task = tsk;
+	tset.single.cgrp = oldcgrp;
+
 	for_each_subsys(root, ss) {
 		if (ss->can_attach) {
-			retval = ss->can_attach(ss, cgrp, tsk);
+			retval = ss->can_attach(ss, cgrp, &tset);
 			if (retval) {
 				/*
 				 * Remember on which subsystem the can_attach()
@@ -1843,13 +1937,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 				goto out;
 			}
 		}
-		if (ss->can_attach_task) {
-			retval = ss->can_attach_task(cgrp, tsk);
-			if (retval) {
-				failed_ss = ss;
-				goto out;
-			}
-		}
 	}
 
 	retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
@@ -1857,12 +1944,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 		goto out;
 
 	for_each_subsys(root, ss) {
-		if (ss->pre_attach)
-			ss->pre_attach(cgrp);
-		if (ss->attach_task)
-			ss->attach_task(cgrp, tsk);
 		if (ss->attach)
-			ss->attach(ss, cgrp, oldcgrp, tsk);
+			ss->attach(ss, cgrp, &tset);
 	}
 
 	synchronize_rcu();
@@ -1884,7 +1967,7 @@ out:
 				 */
 				break;
 			if (ss->cancel_attach)
-				ss->cancel_attach(ss, cgrp, tsk);
+				ss->cancel_attach(ss, cgrp, &tset);
 		}
 	}
 	return retval;
@@ -1935,23 +2018,17 @@ static bool css_set_check_fetched(struct cgroup *cgrp,
 
 	read_lock(&css_set_lock);
 	newcg = find_existing_css_set(cg, cgrp, template);
-	if (newcg)
-		get_css_set(newcg);
 	read_unlock(&css_set_lock);
 
 	/* doesn't exist at all? */
 	if (!newcg)
 		return false;
 	/* see if it's already in the list */
-	list_for_each_entry(cg_entry, newcg_list, links) {
-		if (cg_entry->cg == newcg) {
-			put_css_set(newcg);
+	list_for_each_entry(cg_entry, newcg_list, links)
+		if (cg_entry->cg == newcg)
 			return true;
-		}
-	}
 
 	/* not found */
-	put_css_set(newcg);
 	return false;
 }
 
@@ -1985,21 +2062,21 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
  * @cgrp: the cgroup to attach to
  * @leader: the threadgroup leader task_struct of the group to be attached
  *
- * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
- * take task_lock of each thread in leader's threadgroup individually in turn.
+ * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
+ * task_lock of each thread in leader's threadgroup individually in turn.
  */
-int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 {
 	int retval, i, group_size;
 	struct cgroup_subsys *ss, *failed_ss = NULL;
-	bool cancel_failed_ss = false;
 	/* guaranteed to be initialized later, but the compiler needs this */
-	struct cgroup *oldcgrp = NULL;
 	struct css_set *oldcg;
 	struct cgroupfs_root *root = cgrp->root;
 	/* threadgroup list cursor and array */
 	struct task_struct *tsk;
+	struct task_and_cgroup *tc;
 	struct flex_array *group;
+	struct cgroup_taskset tset = { };
 	/*
 	 * we need to make sure we have css_sets for all the tasks we're
 	 * going to move -before- we actually start moving them, so that in
@@ -2012,13 +2089,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 * step 0: in order to do expensive, possibly blocking operations for
 	 * every thread, we cannot iterate the thread group list, since it needs
 	 * rcu or tasklist locked. instead, build an array of all threads in the
-	 * group - threadgroup_fork_lock prevents new threads from appearing,
-	 * and if threads exit, this will just be an over-estimate.
+	 * group - group_rwsem prevents new threads from appearing, and if
+	 * threads exit, this will just be an over-estimate.
 	 */
 	group_size = get_nr_threads(leader);
 	/* flex_array supports very large thread-groups better than kmalloc. */
-	group = flex_array_alloc(sizeof(struct task_struct *), group_size,
-				 GFP_KERNEL);
+	group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
 	if (!group)
 		return -ENOMEM;
 	/* pre-allocate to guarantee space while iterating in rcu read-side. */
@@ -2040,49 +2116,53 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 		retval = -EAGAIN;
 		goto out_free_group_list;
 	}
-	/* take a reference on each task in the group to go in the array. */
+
 	tsk = leader;
 	i = 0;
 	do {
+		struct task_and_cgroup ent;
+
+		/* @tsk either already exited or can't exit until the end */
+		if (tsk->flags & PF_EXITING)
+			continue;
+
 		/* as per above, nr_threads may decrease, but not increase. */
 		BUG_ON(i >= group_size);
-		get_task_struct(tsk);
 		/*
 		 * saying GFP_ATOMIC has no effect here because we did prealloc
 		 * earlier, but it's good form to communicate our expectations.
 		 */
-		retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
+		ent.task = tsk;
+		ent.cgrp = task_cgroup_from_root(tsk, root);
+		/* nothing to do if this task is already in the cgroup */
+		if (ent.cgrp == cgrp)
+			continue;
+		retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
 		BUG_ON(retval != 0);
 		i++;
 	} while_each_thread(leader, tsk);
 	/* remember the number of threads in the array for later. */
 	group_size = i;
+	tset.tc_array = group;
+	tset.tc_array_len = group_size;
 	read_unlock(&tasklist_lock);
 
+	/* methods shouldn't be called if no task is actually migrating */
+	retval = 0;
+	if (!group_size)
+		goto out_free_group_list;
+
 	/*
 	 * step 1: check that we can legitimately attach to the cgroup.
 	 */
 	for_each_subsys(root, ss) {
 		if (ss->can_attach) {
-			retval = ss->can_attach(ss, cgrp, leader);
+			retval = ss->can_attach(ss, cgrp, &tset);
 			if (retval) {
 				failed_ss = ss;
 				goto out_cancel_attach;
 			}
 		}
-		/* a callback to be run on every thread in the threadgroup. */
-		if (ss->can_attach_task) {
-			/* run on each task in the threadgroup. */
-			for (i = 0; i < group_size; i++) {
-				tsk = flex_array_get_ptr(group, i);
-				retval = ss->can_attach_task(cgrp, tsk);
-				if (retval) {
-					failed_ss = ss;
-					cancel_failed_ss = true;
-					goto out_cancel_attach;
-				}
-			}
-		}
 	}
 
 	/*
@@ -2091,67 +2171,36 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 */
 	INIT_LIST_HEAD(&newcg_list);
 	for (i = 0; i < group_size; i++) {
-		tsk = flex_array_get_ptr(group, i);
-		/* nothing to do if this task is already in the cgroup */
-		oldcgrp = task_cgroup_from_root(tsk, root);
-		if (cgrp == oldcgrp)
-			continue;
-		/* get old css_set pointer */
-		task_lock(tsk);
-		oldcg = tsk->cgroups;
-		get_css_set(oldcg);
-		task_unlock(tsk);
-		/* see if the new one for us is already in the list? */
-		if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
-			/* was already there, nothing to do. */
-			put_css_set(oldcg);
-		} else {
-			/* we don't already have it. get new one. */
+		tc = flex_array_get(group, i);
+		oldcg = tc->task->cgroups;
+
+		/* if we don't already have it in the list get a new one */
+		if (!css_set_check_fetched(cgrp, tc->task, oldcg,
+					   &newcg_list)) {
 			retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
-			put_css_set(oldcg);
 			if (retval)
 				goto out_list_teardown;
 		}
 	}
 
 	/*
-	 * step 3: now that we're guaranteed success wrt the css_sets, proceed
-	 * to move all tasks to the new cgroup, calling ss->attach_task for each
-	 * one along the way. there are no failure cases after here, so this is
-	 * the commit point.
+	 * step 3: now that we're guaranteed success wrt the css_sets,
+	 * proceed to move all tasks to the new cgroup. There are no
+	 * failure cases after here, so this is the commit point.
 	 */
-	for_each_subsys(root, ss) {
-		if (ss->pre_attach)
-			ss->pre_attach(cgrp);
-	}
 	for (i = 0; i < group_size; i++) {
-		tsk = flex_array_get_ptr(group, i);
-		/* leave current thread as it is if it's already there */
-		oldcgrp = task_cgroup_from_root(tsk, root);
-		if (cgrp == oldcgrp)
-			continue;
-		/* if the thread is PF_EXITING, it can just get skipped. */
-		retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
-		if (retval == 0) {
-			/* attach each task to each subsystem */
-			for_each_subsys(root, ss) {
-				if (ss->attach_task)
-					ss->attach_task(cgrp, tsk);
-			}
-		} else {
-			BUG_ON(retval != -ESRCH);
-		}
+		tc = flex_array_get(group, i);
+		retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true);
+		BUG_ON(retval);
 	}
 	/* nothing is sensitive to fork() after this point. */
 
 	/*
-	 * step 4: do expensive, non-thread-specific subsystem callbacks.
-	 * TODO: if ever a subsystem needs to know the oldcgrp for each task
-	 * being moved, this call will need to be reworked to communicate that.
+	 * step 4: do subsystem attach callbacks.
 	 */
 	for_each_subsys(root, ss) {
 		if (ss->attach)
-			ss->attach(ss, cgrp, oldcgrp, leader);
+			ss->attach(ss, cgrp, &tset);
 	}
 
 	/*
@@ -2171,20 +2220,12 @@ out_cancel_attach:
 	/* same deal as in cgroup_attach_task */
 	if (retval) {
 		for_each_subsys(root, ss) {
-			if (ss == failed_ss) {
-				if (cancel_failed_ss && ss->cancel_attach)
-					ss->cancel_attach(ss, cgrp, leader);
+			if (ss == failed_ss)
 				break;
-			}
 			if (ss->cancel_attach)
-				ss->cancel_attach(ss, cgrp, leader);
+				ss->cancel_attach(ss, cgrp, &tset);
 		}
 	}
-	/* clean up the array of referenced threads in the group. */
-	for (i = 0; i < group_size; i++) {
-		tsk = flex_array_get_ptr(group, i);
-		put_task_struct(tsk);
-	}
 out_free_group_list:
 	flex_array_free(group);
 	return retval;
@@ -2192,8 +2233,8 @@ out_free_group_list:
 
 /*
  * Find the task_struct of the task to attach by vpid and pass it along to the
- * function to attach either it or all tasks in its threadgroup. Will take
- * cgroup_mutex; may take task_lock of task.
+ * function to attach either it or all tasks in its threadgroup. Will lock
+ * cgroup_mutex and threadgroup; may take task_lock of task.
 */
 static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
 {
@@ -2220,13 +2261,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
 		 * detect it later.
 		 */
 		tsk = tsk->group_leader;
-	} else if (tsk->flags & PF_EXITING) {
-		/* optimization for the single-task-only case */
-		rcu_read_unlock();
-		cgroup_unlock();
-		return -ESRCH;
 	}
-
 	/*
 	 * even if we're attaching all tasks in the thread group, we
 	 * only need to check permissions on one of them.
@@ -2249,13 +2284,15 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
 		get_task_struct(tsk);
 	}
 
-	if (threadgroup) {
-		threadgroup_fork_write_lock(tsk);
+	threadgroup_lock(tsk);
+
+	if (threadgroup)
 		ret = cgroup_attach_proc(cgrp, tsk);
-		threadgroup_fork_write_unlock(tsk);
-	} else {
+	else
 		ret = cgroup_attach_task(cgrp, tsk);
-	}
+
+	threadgroup_unlock(tsk);
+
 	put_task_struct(tsk);
 	cgroup_unlock();
 	return ret;
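For reference, threadgroup_lock()/threadgroup_unlock() used above are assumed to be the write side of the per-threadgroup rwsem that the earlier comments call group_rwsem, roughly equivalent to the following (a sketch under that assumption, not this patch's code):

	/* what threadgroup_lock()/threadgroup_unlock() are assumed to boil down to */
	static inline void example_threadgroup_lock(struct task_struct *tsk)
	{
		/* excludes fork/exit of the whole thread group */
		down_write(&tsk->signal->group_rwsem);
	}

	static inline void example_threadgroup_unlock(struct task_struct *tsk)
	{
		up_write(&tsk->signal->group_rwsem);
	}

Holding it in write mode is what lets the PF_EXITING checks and the unconditional migration above stand in for the per-task -ESRCH handling that this hunk deletes.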
@@ -2306,7 +2343,9 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
 		return -EINVAL;
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
+	mutex_lock(&cgroup_root_mutex);
 	strcpy(cgrp->root->release_agent_path, buffer);
+	mutex_unlock(&cgroup_root_mutex);
 	cgroup_unlock();
 	return 0;
 }
@@ -2789,6 +2828,7 @@ static void cgroup_enable_task_cg_lists(void)
 }
 
 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
+	__acquires(css_set_lock)
 {
 	/*
 	 * The first time anyone tries to iterate across a cgroup,
@@ -2828,6 +2868,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
 }
 
 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
+	__releases(css_set_lock)
 {
 	read_unlock(&css_set_lock);
 }
@@ -4491,20 +4532,31 @@ static const struct file_operations proc_cgroupstats_operations = {
  *
  * A pointer to the shared css_set was automatically copied in
  * fork.c by dup_task_struct(). However, we ignore that copy, since
- * it was not made under the protection of RCU or cgroup_mutex, so
- * might no longer be a valid cgroup pointer. cgroup_attach_task() might
- * have already changed current->cgroups, allowing the previously
- * referenced cgroup group to be removed and freed.
+ * it was not made under the protection of RCU, cgroup_mutex or
+ * threadgroup_change_begin(), so it might no longer be a valid
+ * cgroup pointer. cgroup_attach_task() might have already changed
+ * current->cgroups, allowing the previously referenced cgroup
+ * group to be removed and freed.
+ *
+ * Beyond pointer validity, we also need to process the css_set
+ * inheritance between threadgroup_change_begin() and
+ * threadgroup_change_end(); this way no thread is leaked by a
+ * process-wide migration performed by cgroup_attach_proc() because
+ * it is too early or too late in the fork stage.
  *
  * At the point that cgroup_fork() is called, 'current' is the parent
  * task, and the passed argument 'child' points to the child task.
  */
 void cgroup_fork(struct task_struct *child)
 {
-	task_lock(current);
+	/*
+	 * We don't need to task_lock() current because current->cgroups
+	 * can't be changed concurrently here. The parent obviously hasn't
+	 * exited and called cgroup_exit(), and we are synchronized against
+	 * cgroup migration through threadgroup_change_begin().
+	 */
 	child->cgroups = current->cgroups;
 	get_css_set(child->cgroups);
-	task_unlock(current);
 	INIT_LIST_HEAD(&child->cg_list);
 }
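The fork-side bracketing that cgroup_fork() and cgroup_post_fork() now rely on looks roughly like this inside copy_process() (a condensed sketch of the assumed ordering; example_dup_task_struct() is a hypothetical stand-in for the real duplication step):

	/* condensed sketch of the copy_process() ordering assumed above */
	static struct task_struct *example_fork_path(void)
	{
		struct task_struct *child = example_dup_task_struct(current);

		threadgroup_change_begin(current);	/* read side of group_rwsem,
							 * blocks cgroup_attach_proc() */
		cgroup_fork(child);			/* child inherits current->cgroups */
		/* ... remainder of copy_process() ... */
		cgroup_post_fork(child);		/* link child into its css_set's task list */
		threadgroup_change_end(current);
		return child;
	}

Any whole-group migration therefore either sees the child in the group list or runs entirely before/after the fork, which is exactly the leak the comment above rules out.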
 
@@ -4546,10 +4598,19 @@ void cgroup_post_fork(struct task_struct *child)
 {
 	if (use_task_css_set_links) {
 		write_lock(&css_set_lock);
-		task_lock(child);
-		if (list_empty(&child->cg_list))
+		if (list_empty(&child->cg_list)) {
+			/*
+			 * It's safe to use child->cgroups without task_lock()
+			 * here because we are protected through
+			 * threadgroup_change_begin() against concurrent
+			 * css_set change in cgroup_task_migrate(). Also
+			 * the task can't exit at that point until
+			 * wake_up_new_task() is called, so we are protected
+			 * against cgroup_exit() setting child->cgroups to
+			 * init_css_set.
+			 */
 			list_add(&child->cg_list, &child->cgroups->tasks);
-			task_unlock(child);
+		}
 		write_unlock(&css_set_lock);
 	}
 }
|