@@ -410,7 +410,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
  */
 void migrate_page_copy(struct page *newpage, struct page *page)
 {
-	if (PageHuge(page))
+	if (PageHuge(page) || PageTransHuge(page))
 		copy_huge_page(newpage, page);
 	else
 		copy_highpage(newpage, page);
@@ -1491,25 +1491,10 @@ bool migrate_ratelimited(int node)
 	return true;
 }
 
-/*
- * Attempt to migrate a misplaced page to the specified destination
- * node. Caller is expected to have an elevated reference count on
- * the page that will be dropped by this function before returning.
- */
-int migrate_misplaced_page(struct page *page, int node)
+/* Returns true if the node is migrate rate-limited after the update */
+bool numamigrate_update_ratelimit(pg_data_t *pgdat)
 {
-	pg_data_t *pgdat = NODE_DATA(node);
-	int isolated = 0;
-	LIST_HEAD(migratepages);
-
-	/*
-	 * Don't migrate pages that are mapped in multiple processes.
-	 * TODO: Handle false sharing detection instead of this hammer
-	 */
-	if (page_mapcount(page) != 1) {
-		put_page(page);
-		goto out;
-	}
+	bool rate_limited = false;
 
 	/*
 	 * Rate-limit the amount of data that is being migrated to a node.
@@ -1522,13 +1507,18 @@ int migrate_misplaced_page(struct page *page, int node)
 		pgdat->numabalancing_migrate_next_window = jiffies +
 			msecs_to_jiffies(migrate_interval_millisecs);
 	}
-	if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
-		spin_unlock(&pgdat->numabalancing_migrate_lock);
-		put_page(page);
-		goto out;
-	}
-	pgdat->numabalancing_migrate_nr_pages++;
+	if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages)
+		rate_limited = true;
+	else
+		pgdat->numabalancing_migrate_nr_pages++;
 	spin_unlock(&pgdat->numabalancing_migrate_lock);
+
+	return rate_limited;
+}
+
+int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
+{
+	int ret = 0;
 
 	/* Avoid migrating to a node that is nearly full */
 	if (migrate_balanced_pgdat(pgdat, 1)) {
@@ -1536,13 +1526,18 @@ int migrate_misplaced_page(struct page *page, int node)
 
 		if (isolate_lru_page(page)) {
 			put_page(page);
-			goto out;
+			return 0;
 		}
-		isolated = 1;
 
+		/* Page is isolated */
+		ret = 1;
 		page_lru = page_is_file_cache(page);
-		inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
-		list_add(&page->lru, &migratepages);
+		if (!PageTransHuge(page))
+			inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
+		else
+			mod_zone_page_state(page_zone(page),
+					NR_ISOLATED_ANON + page_lru,
+					HPAGE_PMD_NR);
 	}
 
 	/*
@@ -1555,23 +1550,177 @@ int migrate_misplaced_page(struct page *page, int node)
 	 */
 	put_page(page);
 
-	if (isolated) {
-		int nr_remaining;
-
-		nr_remaining = migrate_pages(&migratepages,
-				alloc_misplaced_dst_page,
-				node, false, MIGRATE_ASYNC,
-				MR_NUMA_MISPLACED);
-		if (nr_remaining) {
-			putback_lru_pages(&migratepages);
-			isolated = 0;
-		} else
-			count_vm_numa_event(NUMA_PAGE_MIGRATE);
+	return ret;
+}
+
+/*
+ * Attempt to migrate a misplaced page to the specified destination
+ * node. Caller is expected to have an elevated reference count on
+ * the page that will be dropped by this function before returning.
+ */
+int migrate_misplaced_page(struct page *page, int node)
+{
+	pg_data_t *pgdat = NODE_DATA(node);
+	int isolated = 0;
+	int nr_remaining;
+	LIST_HEAD(migratepages);
+
+	/*
+	 * Don't migrate pages that are mapped in multiple processes.
+	 * TODO: Handle false sharing detection instead of this hammer
+	 */
+	if (page_mapcount(page) != 1) {
+		put_page(page);
+		goto out;
 	}
+
+	/*
+	 * Rate-limit the amount of data that is being migrated to a node.
+	 * Optimal placement is no good if the memory bus is saturated and
+	 * all the time is being spent migrating!
+	 */
+	if (numamigrate_update_ratelimit(pgdat)) {
+		put_page(page);
+		goto out;
+	}
+
+	isolated = numamigrate_isolate_page(pgdat, page);
+	if (!isolated)
+		goto out;
+
+	list_add(&page->lru, &migratepages);
+	nr_remaining = migrate_pages(&migratepages,
+			alloc_misplaced_dst_page,
+			node, false, MIGRATE_ASYNC,
+			MR_NUMA_MISPLACED);
+	if (nr_remaining) {
+		putback_lru_pages(&migratepages);
+		isolated = 0;
+	} else
+		count_vm_numa_event(NUMA_PAGE_MIGRATE);
 	BUG_ON(!list_empty(&migratepages));
 out:
 	return isolated;
 }
+
+int migrate_misplaced_transhuge_page(struct mm_struct *mm,
+				struct vm_area_struct *vma,
+				pmd_t *pmd, pmd_t entry,
+				unsigned long address,
+				struct page *page, int node)
+{
+	unsigned long haddr = address & HPAGE_PMD_MASK;
+	pg_data_t *pgdat = NODE_DATA(node);
+	int isolated = 0;
+	struct page *new_page = NULL;
+	struct mem_cgroup *memcg = NULL;
+	int page_lru = page_is_file_cache(page);
+
+	/*
+	 * Don't migrate pages that are mapped in multiple processes.
+	 * TODO: Handle false sharing detection instead of this hammer
+	 */
+	if (page_mapcount(page) != 1)
+		goto out_dropref;
+
+	/*
+	 * Rate-limit the amount of data that is being migrated to a node.
+	 * Optimal placement is no good if the memory bus is saturated and
+	 * all the time is being spent migrating!
+	 */
+	if (numamigrate_update_ratelimit(pgdat))
+		goto out_dropref;
+
+	new_page = alloc_pages_node(node,
+		(GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
+	if (!new_page)
+		goto out_dropref;
+	page_xchg_last_nid(new_page, page_last_nid(page));
+
+	isolated = numamigrate_isolate_page(pgdat, page);
+	if (!isolated) {
+		put_page(new_page);
+		goto out_keep_locked;
+	}
+
+	/* Prepare a page as a migration target */
+	__set_page_locked(new_page);
+	SetPageSwapBacked(new_page);
+
+	/* anon mapping, we can simply copy page->mapping to the new page: */
+	new_page->mapping = page->mapping;
+	new_page->index = page->index;
+	migrate_page_copy(new_page, page);
+	WARN_ON(PageLRU(new_page));
+
+	/* Recheck the target PMD */
+	spin_lock(&mm->page_table_lock);
+	if (unlikely(!pmd_same(*pmd, entry))) {
+		spin_unlock(&mm->page_table_lock);
+
+		/* Reverse changes made by migrate_page_copy() */
+		if (TestClearPageActive(new_page))
+			SetPageActive(page);
+		if (TestClearPageUnevictable(new_page))
+			SetPageUnevictable(page);
+		mlock_migrate_page(page, new_page);
+
+		unlock_page(new_page);
+		put_page(new_page); /* Free it */
+
+		unlock_page(page);
+		putback_lru_page(page);
+
+		count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+		goto out;
+	}
+
+	/*
+	 * Traditional migration needs to prepare the memcg charge
+	 * transaction early to prevent the old page from being
+	 * uncharged when installing migration entries. Here we can
+	 * save the potential rollback and start the charge transfer
+	 * only when migration is already known to end successfully.
+	 */
+	mem_cgroup_prepare_migration(page, new_page, &memcg);
+
+	entry = mk_pmd(new_page, vma->vm_page_prot);
+	entry = pmd_mknonnuma(entry);
+	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+	entry = pmd_mkhuge(entry);
+
+	page_add_new_anon_rmap(new_page, vma, haddr);
+
+	set_pmd_at(mm, haddr, pmd, entry);
+	update_mmu_cache_pmd(vma, address, entry);
+	page_remove_rmap(page);
+	/*
+	 * Finish the charge transaction under the page table lock to
+	 * prevent split_huge_page() from dividing up the charge
+	 * before it's fully transferred to the new page.
+	 */
+	mem_cgroup_end_migration(memcg, page, new_page, true);
+	spin_unlock(&mm->page_table_lock);
+
+	unlock_page(new_page);
+	unlock_page(page);
+	put_page(page); /* Drop the rmap reference */
+	put_page(page); /* Drop the LRU isolation reference */
+
+	count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
+	count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
+
+out:
+	mod_zone_page_state(page_zone(page),
+			NR_ISOLATED_ANON + page_lru,
+			-HPAGE_PMD_NR);
+	return isolated;
+
+out_dropref:
+	put_page(page);
+out_keep_locked:
+	return 0;
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #endif /* CONFIG_NUMA */
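
For reference, the windowed rate limiting that the patch factors out into numamigrate_update_ratelimit() can be sketched in plain userspace C as below. This is only an illustrative, single-threaded sketch: the node_ratelimit struct, the millisecond clock helper and the parameter names are invented for the example, and the numabalancing_migrate_lock serialisation the kernel relies on is deliberately omitted.

/*
 * Sketch of the windowed rate limiter: allow roughly ratelimit_pages
 * migrations per interval_ms window, refuse the rest until the window
 * rolls over.  Illustrative only, not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct node_ratelimit {
	unsigned long next_window_ms;	/* end of the current window */
	unsigned long nr_pages;		/* pages migrated in this window */
};

static unsigned long now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000UL + ts.tv_nsec / 1000000UL;
}

/* Returns true if the node is rate-limited after accounting this request */
static bool update_ratelimit(struct node_ratelimit *rl,
			     unsigned long interval_ms,
			     unsigned long ratelimit_pages)
{
	unsigned long now = now_ms();

	/* Start a new window and reset the budget once the old one expires */
	if (now > rl->next_window_ms) {
		rl->nr_pages = 0;
		rl->next_window_ms = now + interval_ms;
	}

	/* Budget exhausted: refuse without consuming anything */
	if (rl->nr_pages > ratelimit_pages)
		return true;

	rl->nr_pages++;
	return false;
}

int main(void)
{
	struct node_ratelimit rl = { .next_window_ms = 0, .nr_pages = 0 };
	int i, refused = 0;

	/* Hammer the limiter: only the first few requests in the 100ms window pass */
	for (i = 0; i < 20; i++)
		if (update_ratelimit(&rl, 100, 8))
			refused++;

	printf("refused %d of 20 requests\n", refused);
	return 0;
}

As in the kernel helper, the budget is checked before it is consumed, so a refused request does not eat into the window, and the window boundary is only advanced lazily when a request arrives after the previous window has expired.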