@@ -818,11 +818,13 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 #ifdef CONFIG_NUMA_BALANCING
 /*
- * numa task sample period in ms
+ * Approximate time to scan a full NUMA task in ms. The task scan period is
+ * calculated based on the task's virtual memory size and
+ * numa_balancing_scan_size.
  */
-unsigned int sysctl_numa_balancing_scan_period_min = 100;
-unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
-unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
+unsigned int sysctl_numa_balancing_scan_period_min = 1000;
+unsigned int sysctl_numa_balancing_scan_period_max = 60000;
+unsigned int sysctl_numa_balancing_scan_period_reset = 60000;
 
 /* Portion of address space to scan in MB */
 unsigned int sysctl_numa_balancing_scan_size = 256;
@@ -830,6 +832,70 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 unsigned int sysctl_numa_balancing_scan_delay = 1000;
 
+static unsigned int task_nr_scan_windows(struct task_struct *p)
+{
+	unsigned long rss = 0;
+	unsigned long nr_scan_pages;
+
+	/*
+	 * Calculations are based on RSS as non-present and empty pages are
+	 * skipped by the PTE scanner, and NUMA hinting faults should be
+	 * trapped based on resident pages.
+	 */
+	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
+	rss = get_mm_rss(p->mm);
+	if (!rss)
+		rss = nr_scan_pages;
+
+	rss = round_up(rss, nr_scan_pages);
+	return rss / nr_scan_pages;
+}
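+
+/*
+ * For example, with the default 256MB scan_size on a system with 4K pages,
+ * nr_scan_pages is 256 << (20 - 12) = 65536 pages, so a task with 1GB
+ * resident is covered by round_up(262144, 65536) / 65536 = 4 windows.
+ */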
+
+/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
+#define MAX_SCAN_WINDOW 2560
+
+static unsigned int task_scan_min(struct task_struct *p)
+{
+	unsigned int scan, floor;
+	unsigned int windows = 1;
+
+	if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
+		windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
+	floor = 1000 / windows;
+
+	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
+	return max_t(unsigned int, floor, scan);
+}
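+
+/*
+ * Continuing the example: with the defaults, windows = 2560 / 256 = 10 and
+ * floor = 1000 / 10 = 100ms. The 1GB task above gets
+ * scan = 1000 / 4 = 250ms, while tasks spanning ten or more windows
+ * (2.5GB+ resident with 4K pages) are clamped to the 100ms floor.
+ */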
+
+static unsigned int task_scan_max(struct task_struct *p)
+{
+	unsigned int smin = task_scan_min(p);
+	unsigned int smax;
+
+	/* Watch for max being lower than min due to floor calculations */
+	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+	return max(smin, smax);
+}
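+
+/*
+ * For the same 1GB example task, smax = 60000 / 4 = 15000ms. The max()
+ * guards against smax dropping below smin, which can happen once the
+ * floor in task_scan_min() dominates for very large tasks.
+ */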
+
 static void task_numa_placement(struct task_struct *p)
 {
 	int seq;
@@ -840,6 +906,7 @@ static void task_numa_placement(struct task_struct *p)
 	if (p->numa_scan_seq == seq)
 		return;
 	p->numa_scan_seq = seq;
+	p->numa_scan_period_max = task_scan_max(p);
 
 	/* FIXME: Scheduling placement policy hints go here */
 }
@@ -860,9 +927,19 @@ void task_numa_fault(int node, int pages, bool migrated)
 	 * If pages are properly placed (did not migrate) then scan slower.
 	 * This is reset periodically in case of phase changes
 	 */
-	if (!migrated)
-		p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
-			p->numa_scan_period + jiffies_to_msecs(10));
+	if (!migrated) {
+		/* Initialise if necessary */
+		if (!p->numa_scan_period_max)
+			p->numa_scan_period_max = task_scan_max(p);
+
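+		/*
+		 * With the defaults this backs off from task_scan_min()
+		 * (250ms for a 1GB task) in 10ms steps, up to
+		 * task_scan_max() (15000ms for the same task).
+		 */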
+		p->numa_scan_period = min(p->numa_scan_period_max,
+			p->numa_scan_period + 10);
+	}
 
 	task_numa_placement(p);
 }
@@ -884,6 +961,7 @@ void task_numa_work(struct callback_head *work)
 	struct mm_struct *mm = p->mm;
 	struct vm_area_struct *vma;
 	unsigned long start, end;
+	unsigned long nr_pte_updates = 0;
 	long pages;
 
 	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
@@ -915,7 +993,7 @@ void task_numa_work(struct callback_head *work)
 	 */
 	migrate = mm->numa_next_reset;
 	if (time_after(now, migrate)) {
-		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+		p->numa_scan_period = task_scan_min(p);
 		next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
 		xchg(&mm->numa_next_reset, next_scan);
 	}
@@ -927,8 +1005,10 @@ void task_numa_work(struct callback_head *work)
 	if (time_before(now, migrate))
 		return;
 
-	if (p->numa_scan_period == 0)
-		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+	if (p->numa_scan_period == 0) {
+		p->numa_scan_period_max = task_scan_max(p);
+		p->numa_scan_period = task_scan_min(p);
+	}
 
 	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
 	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
@@ -965,7 +1045,20 @@ void task_numa_work(struct callback_head *work)
 			start = max(start, vma->vm_start);
 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
 			end = min(end, vma->vm_end);
-			pages -= change_prot_numa(vma, start, end);
+			nr_pte_updates += change_prot_numa(vma, start, end);
+
+			/*
+			 * Scan sysctl_numa_balancing_scan_size worth of pages
+			 * but charge the scan quota only once at least one
+			 * PTE has been updated, so that unused virtual
+			 * address space is skipped quickly.
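+			 *
+			 * For example, a large unpopulated hole at the start
+			 * of a pass is stepped over without consuming the
+			 * scan quota.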
+			 */
+			if (nr_pte_updates)
+				pages -= (end - start) >> PAGE_SHIFT;
 
 			start = end;
 			if (pages <= 0)
@@ -1012,7 +1105,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 
 	if (now - curr->node_stamp > period) {
 		if (!curr->node_stamp)
-			curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+			curr->numa_scan_period = task_scan_min(curr);
 		curr->node_stamp += period;
 
 		if (!time_before(jiffies, curr->mm->numa_next_scan)) {