@@ -90,6 +90,7 @@
 #include <linux/syscalls.h>
 #include <linux/ctype.h>
 #include <linux/mm_inline.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
@@ -117,6 +118,26 @@ static struct mempolicy default_policy = {
 	.flags = MPOL_F_LOCAL,
 };
 
+static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+
+static struct mempolicy *get_task_policy(struct task_struct *p)
+{
+	struct mempolicy *pol = p->mempolicy;
+	int node;
+
+	if (!pol) {
+		node = numa_node_id();
+		if (node != -1)
+			pol = &preferred_node_policy[node];
+
+		/* preferred_node_policy is not initialised early in boot */
+		if (!pol->mode)
+			pol = NULL;
+	}
+
+	return pol;
+}
+
 static const struct mempolicy_operations {
 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 	/*
@@ -254,7 +275,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 	if (mode == MPOL_DEFAULT) {
 		if (nodes && !nodes_empty(*nodes))
 			return ERR_PTR(-EINVAL);
-		return NULL;	/* simply delete any existing policy */
+		return NULL;
 	}
 	VM_BUG_ON(!nodes);
 
@@ -269,6 +290,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 			     (flags & MPOL_F_RELATIVE_NODES)))
 				return ERR_PTR(-EINVAL);
 		}
+	} else if (mode == MPOL_LOCAL) {
+		if (!nodes_empty(*nodes))
+			return ERR_PTR(-EINVAL);
+		mode = MPOL_PREFERRED;
 	} else if (nodes_empty(*nodes))
 		return ERR_PTR(-EINVAL);
 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -561,6 +586,36 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
 	return 0;
 }
 
+#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+/*
+ * This is used to mark a range of virtual addresses to be inaccessible.
+ * These are later cleared by a NUMA hinting fault. Depending on these
+ * faults, pages may be migrated for better NUMA placement.
+ *
+ * This is assuming that NUMA faults are handled using PROT_NONE. If
+ * an architecture makes a different choice, it will need further
+ * changes to the core.
+ */
+unsigned long change_prot_numa(struct vm_area_struct *vma,
+			unsigned long addr, unsigned long end)
+{
+	int nr_updated;
+	BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
+
+	nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
+	if (nr_updated)
+		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
+
+	return nr_updated;
+}
+#else
+static unsigned long change_prot_numa(struct vm_area_struct *vma,
+			unsigned long addr, unsigned long end)
+{
+	return 0;
+}
+#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
+
 /*
  * Check if all pages in a range are on a set of nodes.
  * If pagelist != NULL then isolate pages from the LRU and
@@ -579,22 +634,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		return ERR_PTR(-EFAULT);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+		unsigned long endvma = vma->vm_end;
+
+		if (endvma > end)
+			endvma = end;
+		if (vma->vm_start > start)
+			start = vma->vm_start;
+
 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 			if (!vma->vm_next && vma->vm_end < end)
 				return ERR_PTR(-EFAULT);
 			if (prev && prev->vm_end < vma->vm_start)
 				return ERR_PTR(-EFAULT);
 		}
-		if (!is_vm_hugetlb_page(vma) &&
-		    ((flags & MPOL_MF_STRICT) ||
+
+		if (is_vm_hugetlb_page(vma))
+			goto next;
+
+		if (flags & MPOL_MF_LAZY) {
+			change_prot_numa(vma, start, endvma);
+			goto next;
+		}
+
+		if ((flags & MPOL_MF_STRICT) ||
 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
-		     vma_migratable(vma)))) {
-			unsigned long endvma = vma->vm_end;
+		      vma_migratable(vma))) {
 
-			if (endvma > end)
-				endvma = end;
-			if (vma->vm_start > start)
-				start = vma->vm_start;
 			err = check_pgd_range(vma, start, endvma, nodes,
 						flags, private);
 			if (err) {
@@ -602,6 +667,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 				break;
 			}
 		}
+next:
 		prev = vma;
 	}
 	return first;
@@ -961,7 +1027,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 
 	if (!list_empty(&pagelist)) {
 		err = migrate_pages(&pagelist, new_node_page, dest,
-							false, MIGRATE_SYNC);
+							false, MIGRATE_SYNC,
+							MR_SYSCALL);
 		if (err)
 			putback_lru_pages(&pagelist);
 	}
@@ -1133,8 +1200,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 	int err;
 	LIST_HEAD(pagelist);
 
-	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
-				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+	if (flags & ~(unsigned long)MPOL_MF_VALID)
 		return -EINVAL;
 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 		return -EPERM;
@@ -1157,6 +1223,9 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
+	if (flags & MPOL_MF_LAZY)
+		new->flags |= MPOL_F_MOF;
+
 	/*
 	 * If we are using the default policy then operation
 	 * on discontinuous address spaces is okay after all
@@ -1193,21 +1262,24 @@ static long do_mbind(unsigned long start, unsigned long len,
 	vma = check_range(mm, start, end, nmask,
 			  flags | MPOL_MF_INVERT, &pagelist);
 
-	err = PTR_ERR(vma);
-	if (!IS_ERR(vma)) {
-		int nr_failed = 0;
-
+	err = PTR_ERR(vma);	/* maybe ... */
+	if (!IS_ERR(vma))
 		err = mbind_range(mm, start, end, new);
 
+	if (!err) {
+		int nr_failed = 0;
+
 		if (!list_empty(&pagelist)) {
+			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
 			nr_failed = migrate_pages(&pagelist, new_vma_page,
 						(unsigned long)vma,
-						false, MIGRATE_SYNC);
+						false, MIGRATE_SYNC,
+						MR_MEMPOLICY_MBIND);
 			if (nr_failed)
 				putback_lru_pages(&pagelist);
 		}
 
-		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
+		if (nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
 	} else
 		putback_lru_pages(&pagelist);
@@ -1546,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 struct mempolicy *get_vma_policy(struct task_struct *task,
 		struct vm_area_struct *vma, unsigned long addr)
 {
-	struct mempolicy *pol = task->mempolicy;
+	struct mempolicy *pol = get_task_policy(task);
 
 	if (vma) {
 		if (vma->vm_ops && vma->vm_ops->get_policy) {
@@ -1956,7 +2028,7 @@ retry_cpuset:
  */
 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 {
-	struct mempolicy *pol = current->mempolicy;
+	struct mempolicy *pol = get_task_policy(current);
 	struct page *page;
 	unsigned int cpuset_mems_cookie;
 
@@ -2140,6 +2212,115 @@ static void sp_free(struct sp_node *n)
 	kmem_cache_free(sn_cache, n);
 }
 
+/**
+ * mpol_misplaced - check whether current page node is valid in policy
+ *
+ * @page   - page to be checked
+ * @vma    - vm area where page mapped
+ * @addr   - virtual address where page mapped
+ *
+ * Lookup current policy node id for vma,addr and "compare to" page's
+ * node id.
+ *
+ * Returns:
+ *	-1	- not misplaced, page is in the right node
+ *	node	- node id where the page should be
+ *
+ * Policy determination "mimics" alloc_page_vma().
+ * Called from fault path where we know the vma and faulting address.
+ */
+int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
+{
+	struct mempolicy *pol;
+	struct zone *zone;
+	int curnid = page_to_nid(page);
+	unsigned long pgoff;
+	int polnid = -1;
+	int ret = -1;
+
+	BUG_ON(!vma);
+
+	pol = get_vma_policy(current, vma, addr);
+	if (!(pol->flags & MPOL_F_MOF))
+		goto out;
+
+	switch (pol->mode) {
+	case MPOL_INTERLEAVE:
+		BUG_ON(addr >= vma->vm_end);
+		BUG_ON(addr < vma->vm_start);
+
+		pgoff = vma->vm_pgoff;
+		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
+		polnid = offset_il_node(pol, vma, pgoff);
+		break;
+
+	case MPOL_PREFERRED:
+		if (pol->flags & MPOL_F_LOCAL)
+			polnid = numa_node_id();
+		else
+			polnid = pol->v.preferred_node;
+		break;
+
+	case MPOL_BIND:
+		/*
+		 * allows binding to multiple nodes.
+		 * use current page if in policy nodemask,
+		 * else select nearest allowed node, if any.
+		 * If no allowed nodes, use current [!misplaced].
+		 */
+		if (node_isset(curnid, pol->v.nodes))
+			goto out;
+		(void)first_zones_zonelist(
+				node_zonelist(numa_node_id(), GFP_HIGHUSER),
+				gfp_zone(GFP_HIGHUSER),
+				&pol->v.nodes, &zone);
+		polnid = zone->node;
+		break;
+
+	default:
+		BUG();
+	}
+
+	/* Migrate the page towards the node whose CPU is referencing it */
+	if (pol->flags & MPOL_F_MORON) {
+		int last_nid;
+
+		polnid = numa_node_id();
+
+		/*
+		 * Multi-stage node selection is used in conjunction
+		 * with a periodic migration fault to build a temporal
+		 * task<->page relation. By using a two-stage filter we
+		 * remove short/unlikely relations.
+		 *
+		 * Using P(p) ~ n_p / n_t as per frequentist
+		 * probability, we can equate a task's usage of a
+		 * particular page (n_p) per total usage of this
+		 * page (n_t) (in a given time-span) to a probability.
+		 *
+		 * Our periodic faults will sample this probability and
+		 * getting the same result twice in a row, given these
+		 * samples are fully independent, is then given by
+		 * P(n)^2, provided our sample period is sufficiently
+		 * short compared to the usage pattern.
+		 *
+		 * This quadric squishes small probabilities, making
+		 * it less likely we act on an unlikely task<->page
+		 * relation.
+		 */
+		last_nid = page_xchg_last_nid(page, polnid);
+		if (last_nid != polnid)
+			goto out;
+	}
+
+	if (curnid != polnid)
+		ret = polnid;
+out:
+	mpol_cond_put(pol);
+
+	return ret;
+}
+
 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
 {
 	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
@@ -2305,6 +2486,50 @@ void mpol_free_shared_policy(struct shared_policy *p)
 	mutex_unlock(&p->mutex);
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static bool __initdata numabalancing_override;
+
+static void __init check_numabalancing_enable(void)
+{
+	bool numabalancing_default = false;
+
+	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
+		numabalancing_default = true;
+
+	if (nr_node_ids > 1 && !numabalancing_override) {
+		printk(KERN_INFO "Enabling automatic NUMA balancing. "
+			"Configure with numa_balancing= or sysctl");
+		set_numabalancing_state(numabalancing_default);
+	}
+}
+
+static int __init setup_numabalancing(char *str)
+{
+	int ret = 0;
+	if (!str)
+		goto out;
+	numabalancing_override = true;
+
+	if (!strcmp(str, "enable")) {
+		set_numabalancing_state(true);
+		ret = 1;
+	} else if (!strcmp(str, "disable")) {
+		set_numabalancing_state(false);
+		ret = 1;
+	}
+out:
+	if (!ret)
+		printk(KERN_WARNING "Unable to parse numa_balancing=\n");
+
+	return ret;
+}
+__setup("numa_balancing=", setup_numabalancing);
+#else
+static inline void __init check_numabalancing_enable(void)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /* assumes fs == KERNEL_DS */
 void __init numa_policy_init(void)
 {
@@ -2320,6 +2545,15 @@ void __init numa_policy_init(void)
 				     sizeof(struct sp_node),
 				     0, SLAB_PANIC, NULL);
 
+	for_each_node(nid) {
+		preferred_node_policy[nid] = (struct mempolicy) {
+			.refcnt = ATOMIC_INIT(1),
+			.mode = MPOL_PREFERRED,
+			.flags = MPOL_F_MOF | MPOL_F_MORON,
+			.v = { .preferred_node = nid, },
+		};
+	}
+
 	/*
 	 * Set interleaving policy for system init. Interleaving is only
 	 * enabled across suitably sized nodes (default is >= 16MB), or
@@ -2346,6 +2580,8 @@ void __init numa_policy_init(void)
 
 	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
 		printk("numa_policy_init: interleaving failed\n");
+
+	check_numabalancing_enable();
 }
 
 /* Reset policy of current process to default */
@@ -2362,14 +2598,13 @@ void numa_default_policy(void)
  * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
  * Used only for mpol_parse_str() and mpol_to_str()
  */
-#define MPOL_LOCAL MPOL_MAX
 static const char * const policy_modes[] =
 {
 	[MPOL_DEFAULT]    = "default",
 	[MPOL_PREFERRED]  = "prefer",
 	[MPOL_BIND]       = "bind",
 	[MPOL_INTERLEAVE] = "interleave",
-	[MPOL_LOCAL]      = "local"
+	[MPOL_LOCAL]      = "local",
 };
 
 
@@ -2415,12 +2650,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 	if (flags)
 		*flags++ = '\0';	/* terminate mode string */
 
-	for (mode = 0; mode <= MPOL_LOCAL; mode++) {
+	for (mode = 0; mode < MPOL_MAX; mode++) {
 		if (!strcmp(str, policy_modes[mode])) {
 			break;
 		}
 	}
-	if (mode > MPOL_LOCAL)
+	if (mode >= MPOL_MAX)
 		goto out;
 
 	switch (mode) {