@@ -4,7 +4,7 @@
  * Processor and Memory placement constraints for sets of tasks.
  *
  * Copyright (C) 2003 BULL SA.
- * Copyright (C) 2004-2006 Silicon Graphics, Inc.
+ * Copyright (C) 2004-2007 Silicon Graphics, Inc.
  * Copyright (C) 2006 Google, Inc
  *
  * Portions derived from Patrick Mochel's sysfs code.
@@ -54,6 +54,7 @@
 #include <asm/uaccess.h>
 #include <asm/atomic.h>
 #include <linux/mutex.h>
+#include <linux/kfifo.h>

 /*
  * Tracks how many cpusets are currently defined in system.
@@ -91,6 +92,9 @@ struct cpuset {
         int mems_generation;

         struct fmeter fmeter;           /* memory_pressure filter */
+
+        /* partition number for rebuild_sched_domains() */
+        int pn;
 };

 /* Retrieve the cpuset for a cgroup */
@@ -113,6 +117,7 @@ typedef enum {
         CS_CPU_EXCLUSIVE,
         CS_MEM_EXCLUSIVE,
         CS_MEMORY_MIGRATE,
+        CS_SCHED_LOAD_BALANCE,
         CS_SPREAD_PAGE,
         CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
@@ -128,6 +133,11 @@ static inline int is_mem_exclusive(const struct cpuset *cs)
         return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
 }

+static inline int is_sched_load_balance(const struct cpuset *cs)
+{
+        return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+}
+
 static inline int is_memory_migrate(const struct cpuset *cs)
 {
         return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
@@ -481,6 +491,208 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
         return 0;
 }

+/*
+ * Helper routine for rebuild_sched_domains().
+ * Do cpusets a, b have overlapping cpus_allowed masks?
+ */
+
+static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
+{
+        return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
+}
+
+/*
+ * rebuild_sched_domains()
+ *
+ * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * which has that flag enabled, or if any cpuset with a non-empty
+ * 'cpus' is removed, then call this routine to rebuild the
+ * scheduler's dynamic sched domains.
+ *
+ * This routine builds a partial partition of the system's CPUs
+ * (the set of non-overlapping cpumask_t's in the array 'doms'
+ * below), and passes that partial partition to the kernel/sched.c
+ * partition_sched_domains() routine, which will rebuild the
+ * scheduler's load balancing domains (sched domains) as specified
+ * by that partial partition. A 'partial partition' is a set of
+ * non-overlapping subsets whose union is a subset of the system's CPUs.
+ *
+ * See "What is sched_load_balance" in Documentation/cpusets.txt
+ * for a background explanation of this.
+ *
+ * Does not return errors, on the theory that the callers of this
+ * routine would rather not worry about failures to rebuild sched
+ * domains when operating in the severe memory shortage situations
+ * that could cause allocation failures below.
+ *
+ * Call with cgroup_mutex held. May take callback_mutex during
+ * call due to the kfifo_alloc() and kmalloc() calls. May nest
+ * a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
+ * Must not be called holding callback_mutex, because we must not
+ * call lock_cpu_hotplug() while holding callback_mutex. Elsewhere
+ * the kernel nests callback_mutex inside lock_cpu_hotplug() calls.
+ * So the reverse nesting would risk an ABBA deadlock.
+ *
+ * The three key local variables below are:
+ *  q    - a kfifo queue of cpuset pointers, used to implement a
+ *         top-down scan of all cpusets. This scan loads a pointer
+ *         to each cpuset marked is_sched_load_balance into the
+ *         array 'csa'. For our purposes, rebuilding the scheduler's
+ *         sched domains, we can ignore !is_sched_load_balance cpusets.
+ *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
+ *         that need to be load balanced, for convenient iterative
+ *         access by the subsequent code that finds the best partition,
+ *         i.e. the set of domains (subsets) of CPUs such that the
+ *         cpus_allowed of every cpuset marked is_sched_load_balance
+ *         is a subset of one of these domains, while there are as
+ *         many such domains as possible, each as small as possible.
+ *  doms - Conversion of 'csa' to an array of cpumasks, for passing to
+ *         the kernel/sched.c routine partition_sched_domains() in a
+ *         convenient format, that can be easily compared to the prior
+ *         value to determine what partition elements (sched domains)
+ *         were changed (added or removed).
+ *
+ * Finding the best partition (set of domains):
+ *      The triple nested loops below over i, j, k scan over the
+ *      load balanced cpusets (using the array of cpuset pointers in
+ *      csa[]) looking for pairs of cpusets that have overlapping
+ *      cpus_allowed but don't yet have the same 'pn' partition
+ *      number, and merges them into the same partition. It keeps
+ *      looping on the 'restart' label until it can no longer find
+ *      any such pairs.
+ *
+ *      The union of the cpus_allowed masks from the set of
+ *      all cpusets having the same 'pn' value then form the one
+ *      element of the partition (one sched domain) to be passed to
+ *      partition_sched_domains().
+ */
+
+static void rebuild_sched_domains(void)
+{
+        struct kfifo *q;        /* queue of cpusets to be scanned */
+        struct cpuset *cp;      /* scans q */
+        struct cpuset **csa;    /* array of all cpuset ptrs */
+        int csn;                /* how many cpuset ptrs in csa so far */
+        int i, j, k;            /* indices for partition finding loops */
+        cpumask_t *doms;        /* resulting partition; i.e. sched domains */
+        int ndoms;              /* number of sched domains in result */
+        int nslot;              /* next empty doms[] cpumask_t slot */
+
+        q = NULL;
+        csa = NULL;
+        doms = NULL;
+
+        /* Special case for the 99% of systems with one, full, sched domain */
+        if (is_sched_load_balance(&top_cpuset)) {
+                ndoms = 1;
+                doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+                if (!doms)
+                        goto rebuild;
+                *doms = top_cpuset.cpus_allowed;
+                goto rebuild;
+        }
+
+        q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL);
+        if (IS_ERR(q))
+                goto done;
+        csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
+        if (!csa)
+                goto done;
+        csn = 0;
+
+        cp = &top_cpuset;
+        __kfifo_put(q, (void *)&cp, sizeof(cp));
+        while (__kfifo_get(q, (void *)&cp, sizeof(cp))) {
+                struct cgroup *cont;
+                struct cpuset *child;   /* scans child cpusets of cp */
+                if (is_sched_load_balance(cp))
+                        csa[csn++] = cp;
+                list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+                        child = cgroup_cs(cont);
+                        __kfifo_put(q, (void *)&child, sizeof(cp));
+                }
+        }
+
+        for (i = 0; i < csn; i++)
+                csa[i]->pn = i;
+        ndoms = csn;
+
+restart:
+        /* Find the best partition (set of sched domains) */
+        for (i = 0; i < csn; i++) {
+                struct cpuset *a = csa[i];
+                int apn = a->pn;
+
+                for (j = 0; j < csn; j++) {
+                        struct cpuset *b = csa[j];
+                        int bpn = b->pn;
+
+                        if (apn != bpn && cpusets_overlap(a, b)) {
+                                for (k = 0; k < csn; k++) {
+                                        struct cpuset *c = csa[k];
+
+                                        if (c->pn == bpn)
+                                                c->pn = apn;
+                                }
+                                ndoms--;        /* one less element */
+                                goto restart;
+                        }
+                }
+        }
+
+        /* Convert <csn, csa> to <ndoms, doms> */
+        doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
+        if (!doms)
+                goto rebuild;
+
+        for (nslot = 0, i = 0; i < csn; i++) {
+                struct cpuset *a = csa[i];
+                int apn = a->pn;
+
+                if (apn >= 0) {
+                        cpumask_t *dp = doms + nslot;
+
+                        if (nslot == ndoms) {
+                                static int warnings = 10;
+                                if (warnings) {
+                                        printk(KERN_WARNING
+                                         "rebuild_sched_domains confused:"
+                                         " nslot %d, ndoms %d, csn %d, i %d,"
+                                         " apn %d\n",
+                                         nslot, ndoms, csn, i, apn);
+                                        warnings--;
+                                }
+                                continue;
+                        }
+
+                        cpus_clear(*dp);
+                        for (j = i; j < csn; j++) {
+                                struct cpuset *b = csa[j];
+
+                                if (apn == b->pn) {
+                                        cpus_or(*dp, *dp, b->cpus_allowed);
+                                        b->pn = -1;
+                                }
+                        }
+                        nslot++;
+                }
+        }
+        BUG_ON(nslot != ndoms);
+
+rebuild:
+        /* Have scheduler rebuild sched domains */
+        lock_cpu_hotplug();
+        partition_sched_domains(ndoms, doms);
+        unlock_cpu_hotplug();
+
+done:
+        if (q && !IS_ERR(q))
+                kfifo_free(q);
+        kfree(csa);
+        /* Don't kfree(doms) -- partition_sched_domains() does that. */
+}
+
 /*
  * Call with manage_mutex held. May take callback_mutex during call.
  */
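The partition-finding pass described in the rebuild_sched_domains() comment above can be exercised outside the kernel. The short user-space C program below is a rough sketch of the same 'pn' relabeling idea, using plain 64-bit masks in place of cpumask_t; it is not part of the patch, uses no kernel APIs, and the four example masks and all names in it are invented purely for illustration.

/* Stand-alone sketch of the partition-merging pass in
 * rebuild_sched_domains().  Plain 64-bit masks stand in for
 * cpumask_t; the example cpusets are invented for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define NCS 4                   /* number of load-balanced "cpusets" */

int main(void)
{
        /* cpus_allowed of each load-balanced cpuset (bit n = CPU n) */
        uint64_t cpus[NCS] = { 0x03, 0x06, 0x30, 0xc0 };
        int pn[NCS];            /* partition number, as in cs->pn */
        uint64_t doms[NCS];     /* resulting sched-domain masks */
        int ndoms = NCS, nslot = 0;
        int i, j, k;

        for (i = 0; i < NCS; i++)
                pn[i] = i;

restart:
        /* Merge any two cpusets whose masks overlap into one partition */
        for (i = 0; i < NCS; i++) {
                int apn = pn[i];

                for (j = 0; j < NCS; j++) {
                        int bpn = pn[j];

                        if (apn != bpn && (cpus[i] & cpus[j])) {
                                for (k = 0; k < NCS; k++)
                                        if (pn[k] == bpn)
                                                pn[k] = apn;
                                ndoms--;        /* one less element */
                                goto restart;
                        }
                }
        }

        /* Each distinct pn value becomes one domain: union of its masks */
        for (i = 0; i < NCS; i++) {
                int apn = pn[i];

                if (apn < 0)
                        continue;
                doms[nslot] = 0;
                for (j = i; j < NCS; j++) {
                        if (pn[j] == apn) {
                                doms[nslot] |= cpus[j];
                                pn[j] = -1;
                        }
                }
                nslot++;
        }

        printf("%d sched domains:\n", ndoms);
        for (i = 0; i < nslot; i++)
                printf("  domain %d: cpu mask 0x%llx\n",
                       i, (unsigned long long)doms[i]);
        return 0;
}

With these example masks (cpusets covering CPUs {0,1}, {1,2}, {4,5} and {6,7}), the first two overlap and collapse into one partition, so the sketch reports three sched domains: 0x07, 0x30 and 0xc0.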
@@ -489,6 +701,7 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 {
         struct cpuset trialcs;
         int retval;
+        int cpus_changed, is_load_balanced;

         /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
         if (cs == &top_cpuset)
@@ -516,9 +729,17 @@ static int update_cpumask(struct cpuset *cs, char *buf)
         retval = validate_change(cs, &trialcs);
         if (retval < 0)
                 return retval;
+
+        cpus_changed = !cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
+        is_load_balanced = is_sched_load_balance(&trialcs);
+
         mutex_lock(&callback_mutex);
         cs->cpus_allowed = trialcs.cpus_allowed;
         mutex_unlock(&callback_mutex);
+
+        if (cpus_changed && is_load_balanced)
+                rebuild_sched_domains();
+
         return 0;
 }

@@ -752,6 +973,7 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
 /*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
+ *                              CS_SCHED_LOAD_BALANCE,
  *                              CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
  *                              CS_SPREAD_PAGE, CS_SPREAD_SLAB)
  * cs:  the cpuset to update
@@ -765,6 +987,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
         int turning_on;
         struct cpuset trialcs;
         int err;
+        int cpus_nonempty, balance_flag_changed;

         turning_on = (simple_strtoul(buf, NULL, 10) != 0);

@@ -777,10 +1000,18 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
         err = validate_change(cs, &trialcs);
         if (err < 0)
                 return err;
+
+        cpus_nonempty = !cpus_empty(trialcs.cpus_allowed);
+        balance_flag_changed = (is_sched_load_balance(cs) !=
+                                        is_sched_load_balance(&trialcs));
+
         mutex_lock(&callback_mutex);
         cs->flags = trialcs.flags;
         mutex_unlock(&callback_mutex);

+        if (cpus_nonempty && balance_flag_changed)
+                rebuild_sched_domains();
+
         return 0;
 }

@@ -928,6 +1159,7 @@ typedef enum {
         FILE_MEMLIST,
         FILE_CPU_EXCLUSIVE,
         FILE_MEM_EXCLUSIVE,
+        FILE_SCHED_LOAD_BALANCE,
         FILE_MEMORY_PRESSURE_ENABLED,
         FILE_MEMORY_PRESSURE,
         FILE_SPREAD_PAGE,
@@ -946,7 +1178,7 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
         int retval = 0;

         /* Crude upper limit on largest legitimate cpulist user might write. */
-        if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES))
+        if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES))
                 return -E2BIG;

         /* +1 for nul-terminator */
@@ -979,6 +1211,9 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
         case FILE_MEM_EXCLUSIVE:
                 retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
                 break;
+        case FILE_SCHED_LOAD_BALANCE:
+                retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer);
+                break;
         case FILE_MEMORY_MIGRATE:
                 retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
                 break;
@@ -1074,6 +1309,9 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont,
         case FILE_MEM_EXCLUSIVE:
                 *s++ = is_mem_exclusive(cs) ? '1' : '0';
                 break;
+        case FILE_SCHED_LOAD_BALANCE:
+                *s++ = is_sched_load_balance(cs) ? '1' : '0';
+                break;
         case FILE_MEMORY_MIGRATE:
                 *s++ = is_memory_migrate(cs) ? '1' : '0';
                 break;
@@ -1137,6 +1375,13 @@ static struct cftype cft_mem_exclusive = {
         .private = FILE_MEM_EXCLUSIVE,
 };

+static struct cftype cft_sched_load_balance = {
+        .name = "sched_load_balance",
+        .read = cpuset_common_file_read,
+        .write = cpuset_common_file_write,
+        .private = FILE_SCHED_LOAD_BALANCE,
+};
+
 static struct cftype cft_memory_migrate = {
         .name = "memory_migrate",
         .read = cpuset_common_file_read,
@@ -1186,6 +1431,8 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
                 return err;
         if ((err = cgroup_add_file(cont, ss, &cft_memory_migrate)) < 0)
                 return err;
+        if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0)
+                return err;
         if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0)
                 return err;
         if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0)
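Once cft_sched_load_balance is registered by cpuset_populate() above, the flag appears as a per-cpuset file named 'sched_load_balance' that reads and writes '0' or '1'. The fragment below is a hypothetical user-space illustration, not part of the patch; it assumes the cpuset filesystem is mounted at /dev/cpuset (as in Documentation/cpusets.txt) and that a child cpuset named 'batch' already exists.

/* Hypothetical user-space illustration: read, then clear, the
 * sched_load_balance flag of an existing cpuset.  The paths assume
 * a cpuset filesystem mounted at /dev/cpuset with a child 'batch'.
 */
#include <stdio.h>

int main(void)
{
        const char *path = "/dev/cpuset/batch/sched_load_balance";
        FILE *f;
        int c;

        f = fopen(path, "r");           /* current setting: '0' or '1' */
        if (!f) {
                perror(path);
                return 1;
        }
        c = fgetc(f);
        fclose(f);
        printf("sched_load_balance was %c\n", c);

        f = fopen(path, "w");           /* turn load balancing off */
        if (!f) {
                perror(path);
                return 1;
        }
        fputs("0\n", f);
        fclose(f);
        return 0;
}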
@@ -1267,6 +1514,7 @@ static struct cgroup_subsys_state *cpuset_create(
                 set_bit(CS_SPREAD_PAGE, &cs->flags);
         if (is_spread_slab(parent))
                 set_bit(CS_SPREAD_SLAB, &cs->flags);
+        set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
         cs->cpus_allowed = CPU_MASK_NONE;
         cs->mems_allowed = NODE_MASK_NONE;
         cs->mems_generation = cpuset_mems_generation++;
@@ -1277,11 +1525,27 @@ static struct cgroup_subsys_state *cpuset_create(
         return &cs->css ;
 }

+/*
+ * Locking note on the strange update_flag() call below:
+ *
+ * If the cpuset being removed has its flag 'sched_load_balance'
+ * enabled, then simulate turning sched_load_balance off, which
+ * will call rebuild_sched_domains().  The lock_cpu_hotplug()
+ * call in rebuild_sched_domains() must not be made while holding
+ * callback_mutex.  Elsewhere the kernel nests callback_mutex inside
+ * lock_cpu_hotplug() calls.  So the reverse nesting would risk an
+ * ABBA deadlock.
+ */
+
 static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
 {
         struct cpuset *cs = cgroup_cs(cont);

         cpuset_update_task_memory_state();
+
+        if (is_sched_load_balance(cs))
+                update_flag(CS_SCHED_LOAD_BALANCE, cs, "0");
+
         number_of_cpusets--;
         kfree(cs);
 }
@@ -1326,6 +1590,7 @@ int __init cpuset_init(void)

         fmeter_init(&top_cpuset.fmeter);
         top_cpuset.mems_generation = cpuset_mems_generation++;
+        set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);

         err = register_filesystem(&cpuset_fs_type);
         if (err < 0)
@@ -1412,8 +1677,8 @@ static void common_cpu_mem_hotplug_unplug(void)
  * cpu_online_map on each CPU hotplug (cpuhp) event.
  */

-static int cpuset_handle_cpuhp(struct notifier_block *nb,
-                               unsigned long phase, void *cpu)
+static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
+                               unsigned long phase, void *unused_cpu)
 {
         if (phase == CPU_DYING || phase == CPU_DYING_FROZEN)
                 return NOTIFY_DONE;
@@ -1803,7 +2068,7 @@ void __cpuset_memory_pressure_bump(void)
  * the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks
  * cpuset to top_cpuset.
  */
-static int proc_cpuset_show(struct seq_file *m, void *v)
+static int proc_cpuset_show(struct seq_file *m, void *unused_v)
 {
         struct pid *pid;
         struct task_struct *tsk;