@@ -26,6 +26,8 @@
 #include <linux/slab.h>
 #include <linux/profile.h>
 #include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/task_work.h>
 
 #include <trace/events/sched.h>
 
@@ -776,6 +778,126 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * numa task sample period in ms: 5s
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 5000;
+unsigned int sysctl_numa_balancing_scan_period_max = 5000*16;
+
+static void task_numa_placement(struct task_struct *p)
+{
+	int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+
+	if (p->numa_scan_seq == seq)
+		return;
+	p->numa_scan_seq = seq;
+
+	/* FIXME: Scheduling placement policy hints go here */
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int node, int pages)
+{
+	struct task_struct *p = current;
+
+	/* FIXME: Allocate task-specific structure for placement policy here */
+
+	task_numa_placement(p);
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+	unsigned long migrate, next_scan, now = jiffies;
+	struct task_struct *p = current;
+	struct mm_struct *mm = p->mm;
+
+	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+	work->next = work; /* protect against double add */
+	/*
+	 * Who cares about NUMA placement when they're dying.
+	 *
+	 * NOTE: make sure not to dereference p->mm before this check,
+	 * exit_task_work() happens _after_ exit_mm() so we could be called
+	 * without p->mm even though we still had it when we enqueued this
+	 * work.
+	 */
+	if (p->flags & PF_EXITING)
+		return;
+
+	/*
+	 * Enforce maximal scan/migration frequency..
+	 */
+	migrate = mm->numa_next_scan;
+	if (time_before(now, migrate))
+		return;
+
+	if (p->numa_scan_period == 0)
+		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+
+	next_scan = now + 2*msecs_to_jiffies(p->numa_scan_period);
+	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+		return;
+
+	ACCESS_ONCE(mm->numa_scan_seq)++;
+	{
+		struct vm_area_struct *vma;
+
+		down_read(&mm->mmap_sem);
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (!vma_migratable(vma))
+				continue;
+			change_prot_numa(vma, vma->vm_start, vma->vm_end);
+		}
+		up_read(&mm->mmap_sem);
+	}
+}
+
+/*
+ * Drive the periodic memory faults..
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+	struct callback_head *work = &curr->numa_work;
+	u64 period, now;
+
+	/*
+	 * We don't care about NUMA placement if we don't have memory.
+	 */
+	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+		return;
+
+	/*
+	 * Using runtime rather than walltime has the dual advantage that
+	 * we (mostly) drive the selection from busy threads and that the
+	 * task needs to have done some actual work before we bother with
+	 * NUMA placement.
+	 */
+	now = curr->se.sum_exec_runtime;
+	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
+	if (now - curr->node_stamp > period) {
+		curr->node_stamp = now;
+
+		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+			task_work_add(curr, work, true);
+		}
+	}
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -4954,6 +5076,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		cfs_rq = cfs_rq_of(se);
 		entity_tick(cfs_rq, se, queued);
 	}
+
+	if (sched_feat_numa(NUMA))
+		task_tick_numa(rq, curr);
 }
 
 /*