@@ -90,6 +90,7 @@
 #include <linux/syscalls.h>
 #include <linux/ctype.h>
 #include <linux/mm_inline.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
@@ -565,6 +566,145 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
 	return 0;
 }
 
+#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+/*
+ * Here we search for non-shared page mappings (mapcount == 1) and we
+ * set up pmd/pte_numa on those mappings so the very next access will
+ * fire a NUMA hinting page fault.
+ */
+static int
+change_prot_numa_range(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte, *_pte;
+	struct page *page;
+	unsigned long _address, end;
+	spinlock_t *ptl;
+	int ret = 0;
+
+	VM_BUG_ON(address & ~PAGE_MASK);
+
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
+		goto out;
+
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		goto out;
+
+	pmd = pmd_offset(pud, address);
+	if (pmd_none(*pmd))
+		goto out;
+
+	if (pmd_trans_huge_lock(pmd, vma) == 1) {
+		ret = HPAGE_PMD_NR;
+
+		VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+		if (pmd_numa(*pmd)) {
+			spin_unlock(&mm->page_table_lock);
+			goto out;
+		}
+
+		page = pmd_page(*pmd);
+
+		/* only check non-shared pages */
+		if (page_mapcount(page) != 1) {
+			spin_unlock(&mm->page_table_lock);
+			goto out;
+		}
+
+		set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd));
+		/* defer TLB flush to lower the overhead */
+		spin_unlock(&mm->page_table_lock);
+		goto out;
+	}
+
+	if (pmd_trans_unstable(pmd))
+		goto out;
+	VM_BUG_ON(!pmd_present(*pmd));
+
+	end = min(vma->vm_end, (address + PMD_SIZE) & PMD_MASK);
+	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+	for (_address = address, _pte = pte; _address < end;
+	     _pte++, _address += PAGE_SIZE) {
+		pte_t pteval = *_pte;
+		if (!pte_present(pteval))
+			continue;
+		if (pte_numa(pteval))
+			continue;
+		page = vm_normal_page(vma, _address, pteval);
+		if (unlikely(!page))
+			continue;
+		/* only check non-shared pages */
+		if (page_mapcount(page) != 1)
+			continue;
+
+		set_pte_at(mm, _address, _pte, pte_mknuma(pteval));
+
+		/* defer TLB flush to lower the overhead */
+		ret++;
+	}
+	pte_unmap_unlock(pte, ptl);
+
+	if (ret && !pmd_numa(*pmd)) {
+		spin_lock(&mm->page_table_lock);
+		set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd));
+		spin_unlock(&mm->page_table_lock);
+		/* defer TLB flush to lower the overhead */
+	}
+
+out:
+	return ret;
+}
+
+/* Assumes mmap_sem is held */
+void
+change_prot_numa(struct vm_area_struct *vma,
+			unsigned long address, unsigned long end)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long start = address;
+	int progress = 0;
+
+	while (address < end) {
+		VM_BUG_ON(address < vma->vm_start ||
+			  address + PAGE_SIZE > vma->vm_end);
+
+		progress += change_prot_numa_range(mm, vma, address);
+		address = (address + PMD_SIZE) & PMD_MASK;
+	}
+
+	/*
+	 * Flush the TLB for the mm to start the NUMA hinting
+	 * page faults after we finish scanning this vma part,
+	 * if there were any PTE updates.
+	 */
+	if (progress) {
+		mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
+		flush_tlb_range(vma, start, end);
+		mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+	}
+}
+#else
+static void change_prot_numa(struct vm_area_struct *vma,
+			unsigned long addr, unsigned long end)
+{
+}
+#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
+
 /*
  * Check if all pages in a range are on a set of nodes.
  * If pagelist != NULL then isolate pages from the LRU and
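A note on the mechanism in the hunk above: pte_mknuma()/pmd_mknuma() mark a still-resident mapping using the PROT_NONE protection bits (hence the CONFIG_ARCH_USES_NUMA_PROT_NONE guard), so the very next touch takes a fault that the NUMA code treats as a placement hint rather than a real protection error. As a rough userspace analogue of that arm-then-fault pattern, purely my illustration and not part of the patch, the same effect can be produced with mprotect() and a SIGSEGV handler:

/* Userspace sketch of a "hinting fault": protect a resident page so
 * the next touch faults, then clear the protection in the handler,
 * the way the kernel's NUMA fault path clears pte_numa. Illustration
 * only; note mprotect() is not formally async-signal-safe. */
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static size_t page_size;

static void hinting_fault(int sig, siginfo_t *si, void *uc)
{
	(void)sig; (void)uc;
	char *page = (char *)((uintptr_t)si->si_addr & ~(page_size - 1));
	const char msg[] = "hinting fault taken\n";

	write(STDOUT_FILENO, msg, sizeof(msg) - 1);
	/* "Clear pte_numa": restore access so the access restarts. */
	mprotect(page, page_size, PROT_READ | PROT_WRITE);
}

int main(void)
{
	struct sigaction sa;
	char *p;

	page_size = (size_t)sysconf(_SC_PAGESIZE);

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = hinting_fault;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGSEGV, &sa, NULL);

	p = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	p[0] = 1;				/* populate the page */

	mprotect(p, page_size, PROT_NONE);	/* "pte_mknuma": arm the fault */
	p[0] = 2;				/* fires the fault; handler disarms */

	printf("after fault: %d\n", p[0]);	/* prints 2 */
	return 0;
}

The handler plays the role of the kernel's hinting-fault path: it observes which page was touched and disarms the trap so the interrupted instruction can restart.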
@@ -583,22 +723,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		return ERR_PTR(-EFAULT);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+		unsigned long endvma = vma->vm_end;
+
+		if (endvma > end)
+			endvma = end;
+		if (vma->vm_start > start)
+			start = vma->vm_start;
+
 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 			if (!vma->vm_next && vma->vm_end < end)
 				return ERR_PTR(-EFAULT);
 			if (prev && prev->vm_end < vma->vm_start)
 				return ERR_PTR(-EFAULT);
 		}
-		if (!is_vm_hugetlb_page(vma) &&
-		    ((flags & MPOL_MF_STRICT) ||
+
+		if (is_vm_hugetlb_page(vma))
+			goto next;
+
+		if (flags & MPOL_MF_LAZY) {
+			change_prot_numa(vma, start, endvma);
+			goto next;
+		}
+
+		if ((flags & MPOL_MF_STRICT) ||
 		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
-		      vma_migratable(vma)))) {
-			unsigned long endvma = vma->vm_end;
+		      vma_migratable(vma))) {
 
-			if (endvma > end)
-				endvma = end;
-			if (vma->vm_start > start)
-				start = vma->vm_start;
 			err = check_pgd_range(vma, start, endvma, nodes,
 					flags, private);
 			if (err) {
@@ -606,6 +756,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 				break;
 			}
 		}
+next:
 		prev = vma;
 	}
 	return first;
@@ -1138,8 +1289,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 	int err;
 	LIST_HEAD(pagelist);
 
-	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
-				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+	if (flags & ~(unsigned long)MPOL_MF_VALID)
 		return -EINVAL;
 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 		return -EPERM;
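MPOL_MF_VALID and MPOL_MF_LAZY come from the series' uapi mempolicy.h change, which these hunks do not include. For reference, my reading of those definitions; the exact values are an assumption, not verified against this revision:

/* Sketch of the flag definitions from the series' header change (not
 * shown in this section); treat the values as assumed, not verified. */
#define MPOL_MF_STRICT	(1 << 0)	/* verify existing pages in the mapping */
#define MPOL_MF_MOVE	(1 << 1)	/* move pages owned by this process */
#define MPOL_MF_MOVE_ALL (1 << 2)	/* move every page in the mapping */
#define MPOL_MF_LAZY	(1 << 3)	/* modifies _MOVE: lazy migrate on fault */

#define MPOL_MF_VALID	(MPOL_MF_STRICT | MPOL_MF_MOVE | \
			 MPOL_MF_MOVE_ALL | MPOL_MF_LAZY)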
@@ -1162,6 +1312,9 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
+	if (flags & MPOL_MF_LAZY)
+		new->flags |= MPOL_F_MOF;
+
 	/*
 	 * If we are using the default policy then operation
 	 * on discontinuous address spaces is okay after all
@@ -1198,13 +1351,15 @@ static long do_mbind(unsigned long start, unsigned long len,
 	vma = check_range(mm, start, end, nmask,
 			  flags | MPOL_MF_INVERT, &pagelist);
 
-	err = PTR_ERR(vma);
-	if (!IS_ERR(vma)) {
-		int nr_failed = 0;
-
+	err = PTR_ERR(vma);	/* maybe ... */
+	if (!IS_ERR(vma) && mode != MPOL_NOOP)
 		err = mbind_range(mm, start, end, new);
 
+	if (!err) {
+		int nr_failed = 0;
+
 		if (!list_empty(&pagelist)) {
+			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
 			nr_failed = migrate_pages(&pagelist, new_vma_page,
 						(unsigned long)vma,
 						false, MIGRATE_SYNC,
@@ -1213,7 +1368,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 				putback_lru_pages(&pagelist);
 		}
 
-		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
+		if (nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
 	} else
 		putback_lru_pages(&pagelist);
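End-to-end, the user-visible behaviour added here: mbind() with MPOL_MF_LAZY applies the policy, sets MPOL_F_MOF, and marks the range pte_numa via change_prot_numa() instead of migrating pages synchronously, so the subsequent NUMA hinting faults perform the migration. A minimal usage sketch, my illustration only, assuming a kernel with this series applied and libnuma's mbind() wrapper; the MPOL_MF_LAZY fallback value repeats the assumption above:

/* Sketch: lazily rebind a buffer toward node 1. Assumes a kernel with
 * this series applied; link with -lnuma for the mbind() wrapper. */
#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MPOL_MF_LAZY
#define MPOL_MF_LAZY	(1 << 3)	/* assumed value, see above */
#endif

int main(void)
{
	size_t len = 64UL << 20;	/* 64MB test region */
	unsigned long nodemask = 1UL << 1;	/* target node 1 */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	memset(buf, 0, len);	/* fault the pages in wherever we run */

	/* Without MPOL_MF_LAZY this migrates synchronously; with it,
	 * do_mbind() only marks the PTEs and migration is deferred to
	 * the NUMA hinting faults taken on later accesses. */
	if (mbind(buf, len, MPOL_BIND, &nodemask,
		  8 * sizeof(nodemask), MPOL_MF_MOVE | MPOL_MF_LAZY))
		perror("mbind");
	return 0;
}

Dropping MPOL_MF_LAZY from the flags falls back to the old synchronous migrate_pages() path in do_mbind().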