@@ -11,6 +11,7 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/pagemap.h>
+#include <linux/pagevec.h>
 #include <linux/mempolicy.h>
 #include <linux/syscalls.h>
 #include <linux/sched.h>
@@ -18,6 +19,8 @@
 #include <linux/rmap.h>
 #include <linux/mmzone.h>
 #include <linux/hugetlb.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
 
 #include "internal.h"
 
@@ -87,6 +90,47 @@ void mlock_vma_page(struct page *page)
 	}
 }
 
+/*
+ * Finish munlock after successful page isolation
+ *
+ * Page must be locked. This is a wrapper for try_to_munlock()
+ * and putback_lru_page() with munlock accounting.
+ */
+static void __munlock_isolated_page(struct page *page)
+{
+	int ret = SWAP_AGAIN;
+
+	/*
+	 * Optimization: if the page was mapped just once, that's our mapping
+	 * and we don't need to check all the other vmas.
+	 */
+	if (page_mapcount(page) > 1)
+		ret = try_to_munlock(page);
+
+	/* Did try_to_munlock() succeed or punt? */
+	if (ret != SWAP_MLOCK)
+		count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+
+	putback_lru_page(page);
+}
+
+/*
+ * Accounting for a failed page isolation during munlock
+ *
+ * Performs accounting when page isolation fails in munlock. There is nothing
+ * else to do because it means some other task has already removed the page
+ * from the LRU. putback_lru_page() will take care of removing the page from
+ * the unevictable list, if necessary. vmscan [page_referenced()] will move
+ * the page back to the unevictable list if some other vma has it mlocked.
+ */
+static void __munlock_isolation_failed(struct page *page)
+{
+	if (PageUnevictable(page))
+		count_vm_event(UNEVICTABLE_PGSTRANDED);
+	else
+		count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+}
+
 /**
  * munlock_vma_page - munlock a vma page
  * @page - page to be unlocked
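The mapcount shortcut in __munlock_isolated_page() is worth spelling out. A minimal sketch of the reasoning, not part of the patch itself:

    /*
     * The callers have already cleared PG_mlocked and hold the page lock.
     * If page_mapcount(page) == 1, ours was the only mapping, so no other
     * VM_LOCKED vma can still map the page and the rmap walk in
     * try_to_munlock() can be skipped entirely.  When the walk does run,
     * SWAP_MLOCK means it found another VM_LOCKED vma (and re-mlocked the
     * page), so the event must not be counted as UNEVICTABLE_PGMUNLOCKED.
     */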
@@ -112,37 +156,10 @@ unsigned int munlock_vma_page(struct page *page)
 		unsigned int nr_pages = hpage_nr_pages(page);
 		mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
 		page_mask = nr_pages - 1;
-		if (!isolate_lru_page(page)) {
-			int ret = SWAP_AGAIN;
-
-			/*
-			 * Optimization: if the page was mapped just once,
-			 * that's our mapping and we don't need to check all the
-			 * other vmas.
-			 */
-			if (page_mapcount(page) > 1)
-				ret = try_to_munlock(page);
-			/*
-			 * did try_to_unlock() succeed or punt?
-			 */
-			if (ret != SWAP_MLOCK)
-				count_vm_event(UNEVICTABLE_PGMUNLOCKED);
-
-			putback_lru_page(page);
-		} else {
-			/*
-			 * Some other task has removed the page from the LRU.
-			 * putback_lru_page() will take care of removing the
-			 * page from the unevictable list, if necessary.
-			 * vmscan [page_referenced()] will move the page back
-			 * to the unevictable list if some other vma has it
-			 * mlocked.
-			 */
-			if (PageUnevictable(page))
-				count_vm_event(UNEVICTABLE_PGSTRANDED);
-			else
-				count_vm_event(UNEVICTABLE_PGMUNLOCKED);
-		}
+		if (!isolate_lru_page(page))
+			__munlock_isolated_page(page);
+		else
+			__munlock_isolation_failed(page);
 	}
 
 	return page_mask;
@@ -209,6 +226,73 @@ static int __mlock_posix_error_return(long retval)
 	return retval;
 }
 
+/*
+ * Munlock a batch of pages from the same zone
+ *
+ * The work is split into two main phases. The first phase clears the Mlocked
+ * flag and attempts to isolate the pages, all under a single zone lru lock.
+ * The second phase finishes the munlock only for pages where isolation
+ * succeeded.
+ *
+ * Note that pvec is modified during the process. Before returning,
+ * pagevec_reinit() is called on it.
+ */
+static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
+{
+	int i;
+	int nr = pagevec_count(pvec);
+
+	/* Phase 1: page isolation */
+	spin_lock_irq(&zone->lru_lock);
+	for (i = 0; i < nr; i++) {
+		struct page *page = pvec->pages[i];
+
+		if (TestClearPageMlocked(page)) {
+			struct lruvec *lruvec;
+			int lru;
+
+			/* we have disabled interrupts */
+			__mod_zone_page_state(zone, NR_MLOCK, -1);
+
+			if (PageLRU(page)) {
+				lruvec = mem_cgroup_page_lruvec(page, zone);
+				lru = page_lru(page);
+
+				get_page(page);
+				ClearPageLRU(page);
+				del_page_from_lru_list(page, lruvec, lru);
+			} else {
+				__munlock_isolation_failed(page);
+				goto skip_munlock;
+			}
+
+		} else {
+skip_munlock:
+			/*
+			 * We won't be munlocking this page in the next phase
+			 * but we still need to release the follow_page_mask()
+			 * pin.
+			 */
+			pvec->pages[i] = NULL;
+			put_page(page);
+		}
+	}
+	spin_unlock_irq(&zone->lru_lock);
+
+	/* Phase 2: page munlock and putback */
+	for (i = 0; i < nr; i++) {
+		struct page *page = pvec->pages[i];
+
+		if (page) {
+			lock_page(page);
+			__munlock_isolated_page(page);
+			unlock_page(page);
+			put_page(page);	/* pin from follow_page_mask() */
+		}
+	}
+	pagevec_reinit(pvec);
+}
+
 /*
  * munlock_vma_pages_range() - munlock all pages in the vma range.
  * @vma - vma containing range to be munlock()ed.
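For reference, __munlock_pagevec() builds on the standard pagevec batching idiom, sketched below with hypothetical names (example_flush_batch() and the process callback are illustrations, not part of the patch):

    #include <linux/pagevec.h>

    /*
     * Collect up to PAGEVEC_SIZE (14) pages and hand them off in bulk, so
     * that a lock such as the zone lru_lock is taken once per batch rather
     * than once per page.
     */
    static void example_flush_batch(struct page **pages, int n,
                                    void (*process)(struct pagevec *))
    {
            struct pagevec pvec;
            int i;

            pagevec_init(&pvec, 0);         /* 0: pages are not cache-cold */
            for (i = 0; i < n; i++) {
                    /* pagevec_add() returns the slots left; 0 means full */
                    if (pagevec_add(&pvec, pages[i]) == 0)
                            process(&pvec); /* expected to pagevec_reinit() */
            }
            if (pagevec_count(&pvec))       /* flush the final partial batch */
                    process(&pvec);
    }

munlock_vma_pages_range() below follows the same pattern, with the extra rule that the batch is flushed early whenever the next page belongs to a different zone or is transparent huge.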
@@ -230,11 +314,16 @@ static int __mlock_posix_error_return(long retval)
 void munlock_vma_pages_range(struct vm_area_struct *vma,
 			     unsigned long start, unsigned long end)
 {
+	struct pagevec pvec;
+	struct zone *zone = NULL;
+
+	pagevec_init(&pvec, 0);
 	vma->vm_flags &= ~VM_LOCKED;
 
 	while (start < end) {
 		struct page *page;
 		unsigned int page_mask, page_increm;
+		struct zone *pagezone;
 
 		/*
 		 * Although FOLL_DUMP is intended for get_dump_page(),
@@ -246,20 +335,47 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
 		page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
 					&page_mask);
 		if (page && !IS_ERR(page)) {
-			lock_page(page);
-			/*
-			 * Any THP page found by follow_page_mask() may have
-			 * gotten split before reaching munlock_vma_page(),
-			 * so we need to recompute the page_mask here.
-			 */
-			page_mask = munlock_vma_page(page);
-			unlock_page(page);
-			put_page(page);
+			pagezone = page_zone(page);
+			/* The whole pagevec must be in the same zone */
+			if (pagezone != zone) {
+				if (pagevec_count(&pvec))
+					__munlock_pagevec(&pvec, zone);
+				zone = pagezone;
+			}
+			if (PageTransHuge(page)) {
+				/*
+				 * THP pages are not handled by the pagevec
+				 * because they may get split (see below).
+				 */
+				if (pagevec_count(&pvec))
+					__munlock_pagevec(&pvec, zone);
+				lock_page(page);
+				/*
+				 * Any THP page found by follow_page_mask() may
+				 * have gotten split before reaching
+				 * munlock_vma_page(), so we need to recompute
+				 * the page_mask here.
+				 */
+				page_mask = munlock_vma_page(page);
+				unlock_page(page);
+				put_page(page);	/* follow_page_mask() pin */
+			} else {
+				/*
+				 * Non-huge pages are handled in batches via
+				 * the pagevec. The pin from follow_page_mask()
+				 * prevents them from being collapsed into
+				 * a THP.
+				 */
+				if (pagevec_add(&pvec, page) == 0)
+					__munlock_pagevec(&pvec, zone);
+			}
 		}
 		page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
 		start += page_increm * PAGE_SIZE;
 		cond_resched();
 	}
+	if (pagevec_count(&pvec))
+		__munlock_pagevec(&pvec, zone);
 }
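A worked example of the page_increm arithmetic above, with illustrative values only (4K pages and 2MB THPs, so PAGE_SHIFT == 12 and page_mask == 511 for a huge page):

    /*
     * page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
     *
     * For a base page, page_mask == 0, so page_increm == 1 and the loop
     * advances one page at a time.  If start points into a THP at subpage
     * index 5 (pfn & 511 == 5):
     *
     *     ~pfn & 511 == 511 - 5 == 506
     *     page_increm == 1 + 506 == 507
     *
     * which is exactly the number of subpages from start to the end of
     * the huge page, so the next iteration begins on the following page.
     */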
 
 /*