@@ -888,6 +888,17 @@ static unsigned int task_scan_max(struct task_struct *p)
  */
 unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;

+struct numa_group {
+        atomic_t refcount;
+
+        spinlock_t lock; /* nr_tasks, tasks */
+        int nr_tasks;
+        struct list_head task_list;
+
+        struct rcu_head rcu;
+        atomic_long_t faults[0];
+};
+
 static inline int task_faults_idx(int nid, int priv)
 {
         return 2 * nid + priv;
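
The numa_group structure added above ends in a zero-length array, so the per-node fault counters live inline with the group itself: task_numa_group() later allocates sizeof(struct numa_group) plus 2*nr_node_ids counters in one kzalloc(), two slots per node indexed by task_faults_idx(nid, priv). A minimal userspace sketch of the same allocation pattern, offered only as an illustration (the fault_stats type, fault_stats_alloc() and nr_slots are invented here, and calloc() stands in for kzalloc()):

#include <stdio.h>
#include <stdlib.h>

/*
 * Illustrative only: a struct whose trailing flexible array is sized at
 * allocation time, mirroring how numa_group's faults[] is laid out and
 * allocated in task_numa_group().
 */
struct fault_stats {
        int nr_slots;
        long faults[];                  /* flexible array member */
};

static struct fault_stats *fault_stats_alloc(int nr_nodes)
{
        /* two counters per node (priv = 0 or 1), as in task_faults_idx() */
        int nr_slots = 2 * nr_nodes;
        struct fault_stats *fs;

        fs = calloc(1, sizeof(*fs) + nr_slots * sizeof(fs->faults[0]));
        if (!fs)
                return NULL;
        fs->nr_slots = nr_slots;
        return fs;
}

int main(void)
{
        struct fault_stats *fs = fault_stats_alloc(4);

        if (!fs)
                return 1;
        fs->faults[2 * 1 + 1] = 7;      /* node 1, priv == 1 slot */
        printf("slots: %d, node 1 private faults: %ld\n",
               fs->nr_slots, fs->faults[2 * 1 + 1]);
        free(fs);
        return 0;
}

A single allocation keeps the counters adjacent to the group metadata and lets the whole object be freed in one go with kfree_rcu(), as put_numa_group() does below.
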
@@ -1182,7 +1193,10 @@ static void task_numa_placement(struct task_struct *p)
                 int priv, i;

                 for (priv = 0; priv < 2; priv++) {
+                        long diff;
+
                         i = task_faults_idx(nid, priv);
+                        diff = -p->numa_faults[i];

                         /* Decay existing window, copy faults since last scan */
                         p->numa_faults[i] >>= 1;
@@ -1190,6 +1204,11 @@ static void task_numa_placement(struct task_struct *p)
                         p->numa_faults_buffer[i] = 0;

                         faults += p->numa_faults[i];
+                        diff += p->numa_faults[i];
+                        if (p->numa_group) {
+                                /* safe because we can only change our own group */
+                                atomic_long_add(diff, &p->numa_group->faults[i]);
+                        }
                 }

                 if (faults > max_faults) {
@@ -1207,6 +1226,131 @@ static void task_numa_placement(struct task_struct *p)
         }
 }

+static inline int get_numa_group(struct numa_group *grp)
+{
+        return atomic_inc_not_zero(&grp->refcount);
+}
+
+static inline void put_numa_group(struct numa_group *grp)
+{
+        if (atomic_dec_and_test(&grp->refcount))
+                kfree_rcu(grp, rcu);
+}
+
+static void double_lock(spinlock_t *l1, spinlock_t *l2)
+{
+        if (l1 > l2)
+                swap(l1, l2);
+
+        spin_lock(l1);
+        spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
+static void task_numa_group(struct task_struct *p, int cpupid)
+{
+        struct numa_group *grp, *my_grp;
+        struct task_struct *tsk;
+        bool join = false;
+        int cpu = cpupid_to_cpu(cpupid);
+        int i;
+
+        if (unlikely(!p->numa_group)) {
+                unsigned int size = sizeof(struct numa_group) +
+                                    2*nr_node_ids*sizeof(atomic_long_t);
+
+                grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+                if (!grp)
+                        return;
+
+                atomic_set(&grp->refcount, 1);
+                spin_lock_init(&grp->lock);
+                INIT_LIST_HEAD(&grp->task_list);
+
+                for (i = 0; i < 2*nr_node_ids; i++)
+                        atomic_long_set(&grp->faults[i], p->numa_faults[i]);
+
+                list_add(&p->numa_entry, &grp->task_list);
+                grp->nr_tasks++;
+                rcu_assign_pointer(p->numa_group, grp);
+        }
+
+        rcu_read_lock();
+        tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+
+        if (!cpupid_match_pid(tsk, cpupid))
+                goto unlock;
+
+        grp = rcu_dereference(tsk->numa_group);
+        if (!grp)
+                goto unlock;
+
+        my_grp = p->numa_group;
+        if (grp == my_grp)
+                goto unlock;
+
+        /*
+         * Only join the other group if its bigger; if we're the bigger group,
+         * the other task will join us.
+         */
+        if (my_grp->nr_tasks > grp->nr_tasks)
+                goto unlock;
+
+        /*
+         * Tie-break on the grp address.
+         */
+        if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
+                goto unlock;
+
+        if (!get_numa_group(grp))
+                goto unlock;
+
+        join = true;
+
+unlock:
+        rcu_read_unlock();
+
+        if (!join)
+                return;
+
+        for (i = 0; i < 2*nr_node_ids; i++) {
+                atomic_long_sub(p->numa_faults[i], &my_grp->faults[i]);
+                atomic_long_add(p->numa_faults[i], &grp->faults[i]);
+        }
+
+        double_lock(&my_grp->lock, &grp->lock);
+
+        list_move(&p->numa_entry, &grp->task_list);
+        my_grp->nr_tasks--;
+        grp->nr_tasks++;
+
+        spin_unlock(&my_grp->lock);
+        spin_unlock(&grp->lock);
+
+        rcu_assign_pointer(p->numa_group, grp);
+
+        put_numa_group(my_grp);
+}
+
+void task_numa_free(struct task_struct *p)
+{
+        struct numa_group *grp = p->numa_group;
+        int i;
+
+        if (grp) {
+                for (i = 0; i < 2*nr_node_ids; i++)
+                        atomic_long_sub(p->numa_faults[i], &grp->faults[i]);
+
+                spin_lock(&grp->lock);
+                list_del(&p->numa_entry);
+                grp->nr_tasks--;
+                spin_unlock(&grp->lock);
+                rcu_assign_pointer(p->numa_group, NULL);
+                put_numa_group(grp);
+        }
+
+        kfree(p->numa_faults);
+}
+
 /*
  * Got a PROT_NONE fault for a page on @node.
  */
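
double_lock() above always takes the two group spinlocks in a fixed order, chosen by comparing their addresses, so concurrent moves between the same pair of groups cannot deadlock with each side holding one lock while waiting for the other. A minimal userspace sketch of the same idiom using pthread mutexes, purely as an illustration (the account type, lock_pair() and transfer() are invented here):

#include <pthread.h>
#include <stdio.h>

/*
 * Illustrative only: acquire two locks in a fixed (address) order so that
 * concurrent transfers between the same pair of objects cannot deadlock,
 * which is the idea behind double_lock() above.
 */
struct account {
        pthread_mutex_t lock;
        long balance;
};

static void lock_pair(struct account *a, struct account *b)
{
        if (a > b) {            /* order by address, as double_lock() does */
                struct account *tmp = a;

                a = b;
                b = tmp;
        }
        pthread_mutex_lock(&a->lock);
        pthread_mutex_lock(&b->lock);
}

static void transfer(struct account *from, struct account *to, long amount)
{
        lock_pair(from, to);
        from->balance -= amount;
        to->balance += amount;
        pthread_mutex_unlock(&from->lock);
        pthread_mutex_unlock(&to->lock);
}

int main(void)
{
        struct account a = { PTHREAD_MUTEX_INITIALIZER, 100 };
        struct account b = { PTHREAD_MUTEX_INITIALIZER, 0 };

        transfer(&a, &b, 25);   /* safe even against a concurrent b -> a transfer */
        printf("a=%ld b=%ld\n", a.balance, b.balance);
        return 0;
}

Build with something like cc -pthread; the only point is that every caller agrees on the acquisition order, whichever pair of objects it passes.
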
@@ -1222,15 +1366,6 @@ void task_numa_fault(int last_cpupid, int node, int pages, bool migrated)
         if (!p->mm)
                 return;

-        /*
-         * First accesses are treated as private, otherwise consider accesses
-         * to be private if the accessing pid has not changed
-         */
-        if (!cpupid_pid_unset(last_cpupid))
-                priv = ((p->pid & LAST__PID_MASK) == cpupid_to_pid(last_cpupid));
-        else
-                priv = 1;
-
         /* Allocate buffer to track faults on a per-node basis */
         if (unlikely(!p->numa_faults)) {
                 int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
@@ -1244,6 +1379,18 @@ void task_numa_fault(int last_cpupid, int node, int pages, bool migrated)
                 p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
         }

+        /*
+         * First accesses are treated as private, otherwise consider accesses
+         * to be private if the accessing pid has not changed
+         */
+        if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
+                priv = 1;
+        } else {
+                priv = cpupid_match_pid(p, last_cpupid);
+                if (!priv)
+                        task_numa_group(p, last_cpupid);
+        }
+
         /*
          * If pages are properly placed (did not migrate) then scan slower.
          * This is reset periodically in case of phase changes