@@ -12,12 +12,14 @@
 #include <linux/mmu_notifier.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
+#include <linux/shrinker.h>
 #include <linux/mm_inline.h>
 #include <linux/kthread.h>
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
+
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
 #include "internal.h"
@@ -37,7 +39,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
 	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
 #endif
 	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
-	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
+	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
 
 /* default scan 8*512 pte (or vmas) every 30 second */
 static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
@@ -159,6 +162,77 @@ static int start_khugepaged(void)
 	return err;
 }
 
+static atomic_t huge_zero_refcount;
+static unsigned long huge_zero_pfn __read_mostly;
+
+static inline bool is_huge_zero_pfn(unsigned long pfn)
+{
+	unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
+	return zero_pfn && pfn == zero_pfn;
+}
+
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+	return is_huge_zero_pfn(pmd_pfn(pmd));
+}
+
+static unsigned long get_huge_zero_page(void)
+{
+	struct page *zero_page;
+retry:
+	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
+		return ACCESS_ONCE(huge_zero_pfn);
+
+	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+			HPAGE_PMD_ORDER);
+	if (!zero_page) {
+		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
+		return 0;
+	}
+	count_vm_event(THP_ZERO_PAGE_ALLOC);
+	preempt_disable();
+	if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
+		preempt_enable();
+		__free_page(zero_page);
+		goto retry;
+	}
+
+	/* We take additional reference here. It will be put back by shrinker */
+	atomic_set(&huge_zero_refcount, 2);
+	preempt_enable();
+	return ACCESS_ONCE(huge_zero_pfn);
+}
+
+static void put_huge_zero_page(void)
+{
+	/*
+	 * Counter should never go to zero here. Only shrinker can put
+	 * last reference.
+	 */
+	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
+}
+
+static int shrink_huge_zero_page(struct shrinker *shrink,
+		struct shrink_control *sc)
+{
+	if (!sc->nr_to_scan)
+		/* we can free zero page only if last reference remains */
+		return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+
+	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
+		unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
+		BUG_ON(zero_pfn == 0);
+		__free_page(__pfn_to_page(zero_pfn));
+	}
+
+	return 0;
+}
+
+static struct shrinker huge_zero_page_shrinker = {
+	.shrink = shrink_huge_zero_page,
+	.seeks = DEFAULT_SEEKS,
+};
+
 #ifdef CONFIG_SYSFS
 
 static ssize_t double_flag_show(struct kobject *kobj,
@@ -284,6 +358,20 @@ static ssize_t defrag_store(struct kobject *kobj,
 static struct kobj_attribute defrag_attr =
 	__ATTR(defrag, 0644, defrag_show, defrag_store);
 
+static ssize_t use_zero_page_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return single_flag_show(kobj, attr, buf,
+				TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+}
+static ssize_t use_zero_page_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	return single_flag_store(kobj, attr, buf, count,
+				TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+}
+static struct kobj_attribute use_zero_page_attr =
+	__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
 #ifdef CONFIG_DEBUG_VM
 static ssize_t debug_cow_show(struct kobject *kobj,
 				struct kobj_attribute *attr, char *buf)
@@ -305,6 +393,7 @@ static struct kobj_attribute debug_cow_attr =
 static struct attribute *hugepage_attr[] = {
 	&enabled_attr.attr,
 	&defrag_attr.attr,
+	&use_zero_page_attr.attr,
 #ifdef CONFIG_DEBUG_VM
 	&debug_cow_attr.attr,
 #endif
@@ -550,6 +639,8 @@ static int __init hugepage_init(void)
 		goto out;
 	}
 
+	register_shrinker(&huge_zero_page_shrinker);
+
 	/*
 	 * By default disable transparent hugepages on smaller systems,
 	 * where the extra memory used could hurt more than TLB overhead
@@ -678,6 +769,22 @@ static inline struct page *alloc_hugepage(int defrag)
 }
 #endif
 
+static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
+		unsigned long zero_pfn)
+{
+	pmd_t entry;
+	if (!pmd_none(*pmd))
+		return false;
+	entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
+	entry = pmd_wrprotect(entry);
+	entry = pmd_mkhuge(entry);
+	set_pmd_at(mm, haddr, pmd, entry);
+	pgtable_trans_huge_deposit(mm, pgtable);
+	mm->nr_ptes++;
+	return true;
+}
+
 int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			       unsigned long address, pmd_t *pmd,
 			       unsigned int flags)
@@ -691,6 +798,30 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		return VM_FAULT_OOM;
 	if (unlikely(khugepaged_enter(vma)))
 		return VM_FAULT_OOM;
+	if (!(flags & FAULT_FLAG_WRITE) &&
+			transparent_hugepage_use_zero_page()) {
+		pgtable_t pgtable;
+		unsigned long zero_pfn;
+		bool set;
+		pgtable = pte_alloc_one(mm, haddr);
+		if (unlikely(!pgtable))
+			return VM_FAULT_OOM;
+		zero_pfn = get_huge_zero_page();
+		if (unlikely(!zero_pfn)) {
+			pte_free(mm, pgtable);
+			count_vm_event(THP_FAULT_FALLBACK);
+			goto out;
+		}
+		spin_lock(&mm->page_table_lock);
+		set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+				zero_pfn);
+		spin_unlock(&mm->page_table_lock);
+		if (!set) {
+			pte_free(mm, pgtable);
+			put_huge_zero_page();
+		}
+		return 0;
+	}
 	page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
 			vma, haddr, numa_node_id(), 0);
 	if (unlikely(!page)) {
@@ -755,6 +886,26 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte_free(dst_mm, pgtable);
 		goto out_unlock;
 	}
+	/*
+	 * mm->page_table_lock is enough to be sure that huge zero pmd is not
+	 * under splitting since we don't split the page itself, only pmd to
+	 * a page table.
+	 */
+	if (is_huge_zero_pmd(pmd)) {
+		unsigned long zero_pfn;
+		bool set;
+		/*
+		 * get_huge_zero_page() will never allocate a new page here,
+		 * since we already have a zero page to copy. It just takes a
+		 * reference.
+		 */
+		zero_pfn = get_huge_zero_page();
+		set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+				zero_pfn);
+		BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
+		ret = 0;
+		goto out_unlock;
+	}
 	if (unlikely(pmd_trans_splitting(pmd))) {
 		/* split huge page running from under us */
 		spin_unlock(&src_mm->page_table_lock);
@@ -806,6 +957,80 @@ unlock:
 	spin_unlock(&mm->page_table_lock);
 }
 
+static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
+		struct vm_area_struct *vma, unsigned long address,
+		pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
+{
+	pgtable_t pgtable;
+	pmd_t _pmd;
+	struct page *page;
+	int i, ret = 0;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
+
+	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+	if (!page) {
+		ret |= VM_FAULT_OOM;
+		goto out;
+	}
+
+	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
+		put_page(page);
+		ret |= VM_FAULT_OOM;
+		goto out;
+	}
+
+	clear_user_highpage(page, address);
+	__SetPageUptodate(page);
+
+	mmun_start = haddr;
+	mmun_end = haddr + HPAGE_PMD_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+	spin_lock(&mm->page_table_lock);
+	if (unlikely(!pmd_same(*pmd, orig_pmd)))
+		goto out_free_page;
+
+	pmdp_clear_flush(vma, haddr, pmd);
+	/* leave pmd empty until pte is filled */
+
+	pgtable = pgtable_trans_huge_withdraw(mm);
+	pmd_populate(mm, &_pmd, pgtable);
+
+	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+		pte_t *pte, entry;
+		if (haddr == (address & PAGE_MASK)) {
+			entry = mk_pte(page, vma->vm_page_prot);
+			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+			page_add_new_anon_rmap(page, vma, haddr);
+		} else {
+			entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+			entry = pte_mkspecial(entry);
+		}
+		pte = pte_offset_map(&_pmd, haddr);
+		VM_BUG_ON(!pte_none(*pte));
+		set_pte_at(mm, haddr, pte, entry);
+		pte_unmap(pte);
+	}
+	smp_wmb(); /* make pte visible before pmd */
+	pmd_populate(mm, pmd, pgtable);
+	spin_unlock(&mm->page_table_lock);
+	put_huge_zero_page();
+	inc_mm_counter(mm, MM_ANONPAGES);
+
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+	ret |= VM_FAULT_WRITE;
+out:
+	return ret;
+out_free_page:
+	spin_unlock(&mm->page_table_lock);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+	mem_cgroup_uncharge_page(page);
+	put_page(page);
+	goto out;
+}
+
 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 					struct vm_area_struct *vma,
 					unsigned long address,
@@ -912,19 +1137,21 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
 {
 	int ret = 0;
-	struct page *page, *new_page;
+	struct page *page = NULL, *new_page;
 	unsigned long haddr;
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_end;		/* For mmu_notifiers */
 
 	VM_BUG_ON(!vma->anon_vma);
+	haddr = address & HPAGE_PMD_MASK;
+	if (is_huge_zero_pmd(orig_pmd))
+		goto alloc;
 	spin_lock(&mm->page_table_lock);
 	if (unlikely(!pmd_same(*pmd, orig_pmd)))
 		goto out_unlock;
 
 	page = pmd_page(orig_pmd);
 	VM_BUG_ON(!PageCompound(page) || !PageHead(page));
-	haddr = address & HPAGE_PMD_MASK;
 	if (page_mapcount(page) == 1) {
 		pmd_t entry;
 		entry = pmd_mkyoung(orig_pmd);
@@ -936,7 +1163,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 	get_page(page);
 	spin_unlock(&mm->page_table_lock);
-
+alloc:
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow())
 		new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
@@ -946,24 +1173,34 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	if (unlikely(!new_page)) {
 		count_vm_event(THP_FAULT_FALLBACK);
-		ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
-				pmd, orig_pmd, page, haddr);
-		if (ret & VM_FAULT_OOM)
-			split_huge_page(page);
-		put_page(page);
+		if (is_huge_zero_pmd(orig_pmd)) {
+			ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
+					address, pmd, orig_pmd, haddr);
+		} else {
+			ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
+					pmd, orig_pmd, page, haddr);
+			if (ret & VM_FAULT_OOM)
+				split_huge_page(page);
+			put_page(page);
+		}
 		goto out;
 	}
 	count_vm_event(THP_FAULT_ALLOC);
 
 	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
 		put_page(new_page);
-		split_huge_page(page);
-		put_page(page);
+		if (page) {
+			split_huge_page(page);
+			put_page(page);
+		}
 		ret |= VM_FAULT_OOM;
 		goto out;
 	}
 
-	copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
+	if (is_huge_zero_pmd(orig_pmd))
+		clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
+	else
+		copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
 	__SetPageUptodate(new_page);
 
 	mmun_start = haddr;
@@ -971,7 +1208,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 
 	spin_lock(&mm->page_table_lock);
-	put_page(page);
+	if (page)
+		put_page(page);
 	if (unlikely(!pmd_same(*pmd, orig_pmd))) {
 		spin_unlock(&mm->page_table_lock);
 		mem_cgroup_uncharge_page(new_page);
@@ -979,14 +1217,19 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out_mn;
 	} else {
 		pmd_t entry;
-		VM_BUG_ON(!PageHead(page));
 		entry = mk_huge_pmd(new_page, vma);
 		pmdp_clear_flush(vma, haddr, pmd);
 		page_add_new_anon_rmap(new_page, vma, haddr);
 		set_pmd_at(mm, haddr, pmd, entry);
 		update_mmu_cache_pmd(vma, address, pmd);
-		page_remove_rmap(page);
-		put_page(page);
+		if (is_huge_zero_pmd(orig_pmd)) {
+			add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+			put_huge_zero_page();
+		} else {
+			VM_BUG_ON(!PageHead(page));
+			page_remove_rmap(page);
+			put_page(page);
+		}
 		ret |= VM_FAULT_WRITE;
 	}
 	spin_unlock(&mm->page_table_lock);
@@ -1055,15 +1298,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		pmd_t orig_pmd;
 		pgtable = pgtable_trans_huge_withdraw(tlb->mm);
 		orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
-		page = pmd_page(orig_pmd);
 		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
-		page_remove_rmap(page);
-		VM_BUG_ON(page_mapcount(page) < 0);
-		add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
-		VM_BUG_ON(!PageHead(page));
-		tlb->mm->nr_ptes--;
-		spin_unlock(&tlb->mm->page_table_lock);
-		tlb_remove_page(tlb, page);
+		if (is_huge_zero_pmd(orig_pmd)) {
+			tlb->mm->nr_ptes--;
+			spin_unlock(&tlb->mm->page_table_lock);
+			put_huge_zero_page();
+		} else {
+			page = pmd_page(orig_pmd);
+			page_remove_rmap(page);
+			VM_BUG_ON(page_mapcount(page) < 0);
+			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+			VM_BUG_ON(!PageHead(page));
+			tlb->mm->nr_ptes--;
+			spin_unlock(&tlb->mm->page_table_lock);
+			tlb_remove_page(tlb, page);
+		}
 		pte_free(tlb->mm, pgtable);
 		ret = 1;
 	}
@@ -1135,6 +1384,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		pmd_t entry;
 		entry = pmdp_get_and_clear(mm, addr, pmd);
 		entry = pmd_modify(entry, newprot);
+		BUG_ON(pmd_write(entry));
 		set_pmd_at(mm, addr, pmd, entry);
 		spin_unlock(&vma->vm_mm->page_table_lock);
 		ret = 1;
@@ -1477,6 +1727,7 @@ int split_huge_page(struct page *page)
 	struct anon_vma *anon_vma;
 	int ret = 1;
 
+	BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
 	BUG_ON(!PageAnon(page));
 	anon_vma = page_lock_anon_vma(page);
 	if (!anon_vma)
@@ -2336,19 +2587,65 @@ static int khugepaged(void *none)
 	return 0;
 }
 
-void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
+static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
+		unsigned long haddr, pmd_t *pmd)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgtable_t pgtable;
+	pmd_t _pmd;
+	int i;
+
+	pmdp_clear_flush(vma, haddr, pmd);
+	/* leave pmd empty until pte is filled */
+
+	pgtable = pgtable_trans_huge_withdraw(mm);
+	pmd_populate(mm, &_pmd, pgtable);
+
+	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+		pte_t *pte, entry;
+		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+		entry = pte_mkspecial(entry);
+		pte = pte_offset_map(&_pmd, haddr);
+		VM_BUG_ON(!pte_none(*pte));
+		set_pte_at(mm, haddr, pte, entry);
+		pte_unmap(pte);
+	}
+	smp_wmb(); /* make pte visible before pmd */
+	pmd_populate(mm, pmd, pgtable);
+	put_huge_zero_page();
+}
+
+void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
+		pmd_t *pmd)
 {
 	struct page *page;
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long haddr = address & HPAGE_PMD_MASK;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
+
+	BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
 
+	mmun_start = haddr;
+	mmun_end = haddr + HPAGE_PMD_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	spin_lock(&mm->page_table_lock);
 	if (unlikely(!pmd_trans_huge(*pmd))) {
 		spin_unlock(&mm->page_table_lock);
+		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+		return;
+	}
+	if (is_huge_zero_pmd(*pmd)) {
+		__split_huge_zero_page_pmd(vma, haddr, pmd);
+		spin_unlock(&mm->page_table_lock);
+		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 		return;
 	}
 	page = pmd_page(*pmd);
 	VM_BUG_ON(!page_count(page));
 	get_page(page);
 	spin_unlock(&mm->page_table_lock);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
 	split_huge_page(page);
 
@@ -2356,6 +2653,16 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
 	BUG_ON(pmd_trans_huge(*pmd));
 }
 
+void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
+		pmd_t *pmd)
+{
+	struct vm_area_struct *vma;
+
+	vma = find_vma(mm, address);
+	BUG_ON(vma == NULL);
+	split_huge_page_pmd(vma, address, pmd);
+}
+
 static void split_huge_page_address(struct mm_struct *mm,
 				    unsigned long address)
 {
@@ -2370,7 +2677,7 @@ static void split_huge_page_address(struct mm_struct *mm,
 	 * Caller holds the mmap_sem write mode, so a huge pmd cannot
 	 * materialize from under us.
 	 */
-	split_huge_page_pmd(mm, pmd);
+	split_huge_page_pmd_mm(mm, address, pmd);
 }
 
 void __vma_adjust_trans_huge(struct vm_area_struct *vma,