@@ -97,12 +97,6 @@ struct cpuset {
 
 	struct cpuset *parent;		/* my parent */
 
-	/*
-	 * Copy of global cpuset_mems_generation as of the most
-	 * recent time this cpuset changed its mems_allowed.
-	 */
-	int mems_generation;
-
 	struct fmeter fmeter;		/* memory_pressure filter */
 
 	/* partition number for rebuild_sched_domains() */
@@ -176,27 +170,6 @@ static inline int is_spread_slab(const struct cpuset *cs)
 	return test_bit(CS_SPREAD_SLAB, &cs->flags);
 }
 
-/*
- * Increment this integer everytime any cpuset changes its
- * mems_allowed value. Users of cpusets can track this generation
- * number, and avoid having to lock and reload mems_allowed unless
- * the cpuset they're using changes generation.
- *
- * A single, global generation is needed because cpuset_attach_task() could
- * reattach a task to a different cpuset, which must not have its
- * generation numbers aliased with those of that tasks previous cpuset.
- *
- * Generations are needed for mems_allowed because one task cannot
- * modify another's memory placement. So we must enable every task,
- * on every visit to __alloc_pages(), to efficiently check whether
- * its current->cpuset->mems_allowed has changed, requiring an update
- * of its current->mems_allowed.
- *
- * Since writes to cpuset_mems_generation are guarded by the cgroup lock
- * there is no need to mark it atomic.
- */
-static int cpuset_mems_generation;
-
 static struct cpuset top_cpuset = {
 	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
 };
@@ -228,8 +201,9 @@ static struct cpuset top_cpuset = {
  * If a task is only holding callback_mutex, then it has read-only
  * access to cpusets.
  *
- * The task_struct fields mems_allowed and mems_generation may only
- * be accessed in the context of that task, so require no locks.
+ * Now, the task_struct fields mems_allowed and mempolicy may be changed
+ * by other tasks, so we use alloc_lock in the task_struct to protect
+ * them.
  *
  * The cpuset_common_file_read() handlers only hold callback_mutex across
  * small pieces of code, such as when reading out possibly multi-word
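Illustrative sketch (not from this patch), assuming the new locking rule above: a reader that wants a stable view of another task's mems_allowed takes that task's alloc_lock via task_lock(); the helper name snapshot_mems_allowed() is hypothetical.

static nodemask_t snapshot_mems_allowed(struct task_struct *tsk)
{
	nodemask_t copy;

	task_lock(tsk);		/* tsk->alloc_lock: excludes concurrent mems_allowed updates */
	copy = tsk->mems_allowed;
	task_unlock(tsk);

	return copy;
}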
@@ -349,69 +323,6 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
 		tsk->flags &= ~PF_SPREAD_SLAB;
 }
 
-/**
- * cpuset_update_task_memory_state - update task memory placement
- *
- * If the current tasks cpusets mems_allowed changed behind our
- * backs, update current->mems_allowed, mems_generation and task NUMA
- * mempolicy to the new value.
- *
- * Task mempolicy is updated by rebinding it relative to the
- * current->cpuset if a task has its memory placement changed.
- * Do not call this routine if in_interrupt().
- *
- * Call without callback_mutex or task_lock() held. May be
- * called with or without cgroup_mutex held. Thanks in part to
- * 'the_top_cpuset_hack', the task's cpuset pointer will never
- * be NULL. This routine also might acquire callback_mutex during
- * call.
- *
- * Reading current->cpuset->mems_generation doesn't need task_lock
- * to guard the current->cpuset derefence, because it is guarded
- * from concurrent freeing of current->cpuset using RCU.
- *
- * The rcu_dereference() is technically probably not needed,
- * as I don't actually mind if I see a new cpuset pointer but
- * an old value of mems_generation. However this really only
- * matters on alpha systems using cpusets heavily. If I dropped
- * that rcu_dereference(), it would save them a memory barrier.
- * For all other arch's, rcu_dereference is a no-op anyway, and for
- * alpha systems not using cpusets, another planned optimization,
- * avoiding the rcu critical section for tasks in the root cpuset
- * which is statically allocated, so can't vanish, will make this
- * irrelevant. Better to use RCU as intended, than to engage in
- * some cute trick to save a memory barrier that is impossible to
- * test, for alpha systems using cpusets heavily, which might not
- * even exist.
- *
- * This routine is needed to update the per-task mems_allowed data,
- * within the tasks context, when it is trying to allocate memory
- * (in various mm/mempolicy.c routines) and notices that some other
- * task has been modifying its cpuset.
- */
-
-void cpuset_update_task_memory_state(void)
-{
-	int my_cpusets_mem_gen;
-	struct task_struct *tsk = current;
-	struct cpuset *cs;
-
-	rcu_read_lock();
-	my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
-	rcu_read_unlock();
-
-	if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
-		mutex_lock(&callback_mutex);
-		task_lock(tsk);
-		cs = task_cs(tsk);	/* Maybe changed when task not locked */
-		guarantee_online_mems(cs, &tsk->mems_allowed);
-		tsk->cpuset_mems_generation = cs->mems_generation;
-		task_unlock(tsk);
-		mutex_unlock(&callback_mutex);
-		mpol_rebind_task(tsk, &tsk->mems_allowed);
-	}
-}
-
 /*
  * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
  *
@@ -1017,14 +928,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
  * other task, the task_struct mems_allowed that we are hacking
  * is for our current task, which must allocate new pages for that
  * migrating memory region.
- *
- * We call cpuset_update_task_memory_state() before hacking
- * our tasks mems_allowed, so that we are assured of being in
- * sync with our tasks cpuset, and in particular, callbacks to
- * cpuset_update_task_memory_state() from nested page allocations
- * won't see any mismatch of our cpuset and task mems_generation
- * values, so won't overwrite our hacked tasks mems_allowed
- * nodemask.
  */
 
 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
@@ -1032,22 +935,37 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 {
 	struct task_struct *tsk = current;
 
-	cpuset_update_task_memory_state();
-
-	mutex_lock(&callback_mutex);
 	tsk->mems_allowed = *to;
-	mutex_unlock(&callback_mutex);
 
 	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
 
-	mutex_lock(&callback_mutex);
 	guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
-	mutex_unlock(&callback_mutex);
 }
 
 /*
- * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new
- * nodes if memory_migrate flag is set. Called with cgroup_mutex held.
+ * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
+ * @tsk: the task to change
+ * @newmems: new nodes that the task will be allowed to use
+ *
+ * In order to avoid seeing no nodes if the old and new nodes are disjoint,
+ * we structure updates as setting all new allowed nodes, then clearing newly
+ * disallowed ones.
+ *
+ * Called with task's alloc_lock held
+ */
+static void cpuset_change_task_nodemask(struct task_struct *tsk,
+					nodemask_t *newmems)
+{
+	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
+	mpol_rebind_task(tsk, &tsk->mems_allowed);
+	mpol_rebind_task(tsk, newmems);
+	tsk->mems_allowed = *newmems;
+}
+
+/*
+ * Update task's mems_allowed and rebind its mempolicy and vmas'
+ * mempolicies to the cpuset's new mems_allowed, and migrate pages to new
+ * nodes if memory_migrate flag is set. Called with cgroup_mutex held.
  */
 static void cpuset_change_nodemask(struct task_struct *p,
 					struct cgroup_scanner *scan)
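Illustrative sketch (not from this patch) of how a caller is expected to use the new helper, mirroring what cpuset_change_nodemask() and cpuset_attach() do below: compute the online-node mask under cgroup_mutex, then apply it under the task's alloc_lock. The wrapper name example_apply_cpuset_mems() is hypothetical; guarantee_online_mems(), task_lock() and cpuset_change_task_nodemask() are the functions used in this file.

static void example_apply_cpuset_mems(struct cpuset *cs, struct task_struct *tsk)
{
	nodemask_t newmems;

	/* Compute the target mask (online nodes only), under cgroup_mutex. */
	guarantee_online_mems(cs, &newmems);

	/* Apply it under the task's alloc_lock, per the new locking rule. */
	task_lock(tsk);
	cpuset_change_task_nodemask(tsk, &newmems);
	task_unlock(tsk);
}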
@@ -1056,12 +974,19 @@ static void cpuset_change_nodemask(struct task_struct *p,
 	struct cpuset *cs;
 	int migrate;
 	const nodemask_t *oldmem = scan->data;
+	nodemask_t newmems;
+
+	cs = cgroup_cs(scan->cg);
+	guarantee_online_mems(cs, &newmems);
+
+	task_lock(p);
+	cpuset_change_task_nodemask(p, &newmems);
+	task_unlock(p);
 
 	mm = get_task_mm(p);
 	if (!mm)
 		return;
 
-	cs = cgroup_cs(scan->cg);
 	migrate = is_memory_migrate(cs);
 
 	mpol_rebind_mm(mm, &cs->mems_allowed);
@@ -1114,10 +1039,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
 /*
  * Handle user request to change the 'mems' memory placement
  * of a cpuset. Needs to validate the request, update the
- * cpusets mems_allowed and mems_generation, and for each
- * task in the cpuset, rebind any vma mempolicies and if
- * the cpuset is marked 'memory_migrate', migrate the tasks
- * pages to the new memory.
+ * cpuset's mems_allowed, and for each task in the cpuset,
+ * update mems_allowed and rebind task's mempolicy and any vma
+ * mempolicies, and if the cpuset is marked 'memory_migrate',
+ * migrate the task's pages to the new memory.
  *
  * Call with cgroup_mutex held. May take callback_mutex during call.
  * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
@@ -1170,7 +1095,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 
 	mutex_lock(&callback_mutex);
 	cs->mems_allowed = trialcs->mems_allowed;
-	cs->mems_generation = cpuset_mems_generation++;
 	mutex_unlock(&callback_mutex);
 
 	update_tasks_nodemask(cs, &oldmem, &heap);
@@ -1434,15 +1358,18 @@ static void cpuset_attach(struct cgroup_subsys *ss,
 
 	if (cs == &top_cpuset) {
 		cpumask_copy(cpus_attach, cpu_possible_mask);
+		to = node_possible_map;
 	} else {
-		mutex_lock(&callback_mutex);
 		guarantee_online_cpus(cs, cpus_attach);
-		mutex_unlock(&callback_mutex);
+		guarantee_online_mems(cs, &to);
 	}
 	err = set_cpus_allowed_ptr(tsk, cpus_attach);
 	if (err)
 		return;
 
+	task_lock(tsk);
+	cpuset_change_task_nodemask(tsk, &to);
+	task_unlock(tsk);
 	cpuset_update_task_spread_flag(cs, tsk);
 
 	from = oldcs->mems_allowed;
@@ -1848,8 +1775,6 @@ static struct cgroup_subsys_state *cpuset_create(
 	struct cpuset *parent;
 
 	if (!cont->parent) {
-		/* This is early initialization for the top cgroup */
-		top_cpuset.mems_generation = cpuset_mems_generation++;
 		return &top_cpuset.css;
 	}
 	parent = cgroup_cs(cont->parent);
@@ -1861,7 +1786,6 @@ static struct cgroup_subsys_state *cpuset_create(
 		return ERR_PTR(-ENOMEM);
 	}
 
-	cpuset_update_task_memory_state();
 	cs->flags = 0;
 	if (is_spread_page(parent))
 		set_bit(CS_SPREAD_PAGE, &cs->flags);
@@ -1870,7 +1794,6 @@ static struct cgroup_subsys_state *cpuset_create(
 	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 	cpumask_clear(cs->cpus_allowed);
 	nodes_clear(cs->mems_allowed);
-	cs->mems_generation = cpuset_mems_generation++;
 	fmeter_init(&cs->fmeter);
 	cs->relax_domain_level = -1;
 
@@ -1889,8 +1812,6 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
 {
 	struct cpuset *cs = cgroup_cs(cont);
 
-	cpuset_update_task_memory_state();
-
 	if (is_sched_load_balance(cs))
 		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
 
@@ -1911,21 +1832,6 @@ struct cgroup_subsys cpuset_subsys = {
 	.early_init = 1,
 };
 
-/*
- * cpuset_init_early - just enough so that the calls to
- * cpuset_update_task_memory_state() in early init code
- * are harmless.
- */
-
-int __init cpuset_init_early(void)
-{
-	alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT);
-
-	top_cpuset.mems_generation = cpuset_mems_generation++;
-	return 0;
-}
-
-
 /**
  * cpuset_init - initialize cpusets at system boot
  *
@@ -1936,11 +1842,13 @@ int __init cpuset_init(void)
 {
 	int err = 0;
 
+	if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
+		BUG();
+
 	cpumask_setall(top_cpuset.cpus_allowed);
 	nodes_setall(top_cpuset.mems_allowed);
 
 	fmeter_init(&top_cpuset.fmeter);
-	top_cpuset.mems_generation = cpuset_mems_generation++;
 	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
 	top_cpuset.relax_domain_level = -1;
 