@@ -3448,6 +3448,103 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
+int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		 unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
+{
+	struct page *page;
+	spinlock_t *ptl;
+
+	/*
+	 * The "pte" at this point cannot be used safely without
+	 * validation through pte_unmap_same(). It's of NUMA type but
+	 * the pfn may be screwed if the read is non atomic.
+	 *
+	 * ptep_modify_prot_start is not called as this is clearing
+	 * the _PAGE_NUMA bit and it is not really expected that there
+	 * would be concurrent hardware modifications to the PTE.
+	 */
+	ptl = pte_lockptr(mm, pmd);
+	spin_lock(ptl);
+	if (unlikely(!pte_same(*ptep, pte)))
+		goto out_unlock;
+	pte = pte_mknonnuma(pte);
+	set_pte_at(mm, addr, ptep, pte);
+	update_mmu_cache(vma, addr, ptep);
+
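+	/*
+	 * vm_normal_page() returns NULL for special mappings that have
+	 * no backing struct page, in which case there is nothing left
+	 * to do here.
+	 */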
+	page = vm_normal_page(vma, addr, pte);
+	if (!page) {
+		pte_unmap_unlock(ptep, ptl);
+		return 0;
+	}
+
+out_unlock:
+	pte_unmap_unlock(ptep, ptl);
+	return 0;
+}
+
+/* NUMA hinting page fault entry point for regular pmds */
+#ifdef CONFIG_NUMA_BALANCING
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+			    unsigned long addr, pmd_t *pmdp)
+{
+	pmd_t pmd;
+	pte_t *pte, *orig_pte;
+	unsigned long _addr = addr & PMD_MASK;
+	unsigned long offset;
+	spinlock_t *ptl;
+	bool numa = false;
+
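+	/*
+	 * Clear _PAGE_NUMA on the pmd itself under the page table lock;
+	 * if the bit is not (or no longer) set there is nothing to do.
+	 */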
+	spin_lock(&mm->page_table_lock);
+	pmd = *pmdp;
+	if (pmd_numa(pmd)) {
+		set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
+		numa = true;
+	}
+	spin_unlock(&mm->page_table_lock);
+
+	if (!numa)
+		return 0;
+
+	/* we're in a page fault so some vma must be in the range */
+	BUG_ON(!vma);
+	BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
+	offset = max(_addr, vma->vm_start) & ~PMD_MASK;
+	VM_BUG_ON(offset >= PMD_SIZE);
+	orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
+	pte += offset >> PAGE_SHIFT;
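+	/*
+	 * Walk the remaining ptes under this pmd, clearing _PAGE_NUMA on
+	 * every present pte that has it set.
+	 */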
+	for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
+		pte_t pteval = *pte;
+		struct page *page;
+		if (!pte_present(pteval))
+			continue;
+		if (!pte_numa(pteval))
+			continue;
+		if (addr >= vma->vm_end) {
+			vma = find_vma(mm, addr);
+			/* there's a pte present so there must be a vma */
+			BUG_ON(!vma);
+			BUG_ON(addr < vma->vm_start);
+		}
+		if (pte_numa(pteval)) {
+			pteval = pte_mknonnuma(pteval);
+			set_pte_at(mm, addr, pte, pteval);
+		}
+		page = vm_normal_page(vma, addr, pteval);
+		if (unlikely(!page))
+			continue;
+	}
+	pte_unmap_unlock(orig_pte, ptl);
+
+	return 0;
+}
+#else
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+			    unsigned long addr, pmd_t *pmdp)
+{
+	BUG();
+	return 0;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
@@ -3486,6 +3583,9 @@ int handle_pte_fault(struct mm_struct *mm,
 					pte, pmd, flags, entry);
 	}
 
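+	/* A pte with _PAGE_NUMA set signals a NUMA hinting fault. */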
+	if (pte_numa(entry))
+		return do_numa_page(mm, vma, address, entry, pte, pmd);
+
 	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	if (unlikely(!pte_same(*pte, entry)))
@@ -3554,9 +3654,11 @@ retry:
 
 		barrier();
 		if (pmd_trans_huge(orig_pmd)) {
-			if (flags & FAULT_FLAG_WRITE &&
-			    !pmd_write(orig_pmd) &&
-			    !pmd_trans_splitting(orig_pmd)) {
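+			/* Handle a NUMA hinting fault on the huge pmd first. */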
+			if (pmd_numa(*pmd))
+				return do_huge_pmd_numa_page(mm, address,
+							     orig_pmd, pmd);
+
+			if ((flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd)) {
 				ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
 							  orig_pmd);
 				/*
@@ -3568,10 +3670,14 @@ retry:
 					goto retry;
 				return ret;
 			}
+
 			return 0;
 		}
 	}
 
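+	/* NUMA hinting fault on a regular pmd: fix up the ptes it maps. */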
+	if (pmd_numa(*pmd))
+		return do_pmd_numa_page(mm, vma, address, pmd);
+
 	/*
 	 * Use __pte_alloc instead of pte_alloc_map, because we can't
 	 * run pte_offset_map on the pmd, if an huge pmd could