
Merge branch 'akpm' (more incoming from Andrew)

Merge second patch-bomb from Andrew Morton:

 - A little DM fix

 - the MM queue

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (154 commits)
  ksm: allocate roots when needed
  mm: cleanup "swapcache" in do_swap_page
  mm,ksm: swapoff might need to copy
  mm,ksm: FOLL_MIGRATION do migration_entry_wait
  ksm: shrink 32-bit rmap_item back to 32 bytes
  ksm: treat unstable nid like in stable tree
  ksm: add some comments
  tmpfs: fix mempolicy object leaks
  tmpfs: fix use-after-free of mempolicy object
  mm/fadvise.c: drain all pagevecs if POSIX_FADV_DONTNEED fails to discard all pages
  mm: export mmu notifier invalidates
  mm: accelerate mm_populate() treatment of THP pages
  mm: use long type for page counts in mm_populate() and get_user_pages()
  mm: accurately document nr_free_*_pages functions with code comments
  HWPOISON: change order of error_states[]'s elements
  HWPOISON: fix misjudgement of page_action() for errors on mlocked pages
  memcg: stop warning on memcg_propagate_kmem
  net: change type of virtio_chan->p9_max_pages
  vmscan: change type of vm_total_pages to unsigned long
  fs/nfsd: change type of max_delegations, nfsd_drc_max_mem and nfsd_drc_mem_used
  ...
Linus Torvalds, commit 5ce1a70e2f
100 changed files with 3926 additions and 1331 deletions
  1. Documentation/ABI/testing/sysfs-kernel-mm-ksm (+52, -0)
  2. Documentation/kernel-parameters.txt (+36, -0)
  3. Documentation/vm/ksm.txt (+15, -0)
  4. arch/arm64/mm/mmu.c (+3, -0)
  5. arch/ia64/mm/contig.c (+1, -1)
  6. arch/ia64/mm/discontig.c (+5, -1)
  7. arch/ia64/mm/init.c (+18, -0)
  8. arch/powerpc/mm/init_64.c (+5, -0)
  9. arch/powerpc/mm/mem.c (+12, -0)
  10. arch/s390/mm/init.c (+12, -0)
  11. arch/s390/mm/vmem.c (+4, -0)
  12. arch/sh/mm/init.c (+17, -0)
  13. arch/sparc/mm/init_32.c (+1, -1)
  14. arch/sparc/mm/init_64.c (+5, -0)
  15. arch/tile/mm/elf.c (+0, -1)
  16. arch/tile/mm/init.c (+8, -0)
  17. arch/tile/mm/pgtable.c (+1, -1)
  18. arch/x86/include/asm/numa.h (+2, -2)
  19. arch/x86/include/asm/pgtable_types.h (+1, -0)
  20. arch/x86/kernel/acpi/boot.c (+4, -0)
  21. arch/x86/kernel/setup.c (+9, -4)
  22. arch/x86/mm/init_32.c (+12, -0)
  23. arch/x86/mm/init_64.c (+397, -0)
  24. arch/x86/mm/numa.c (+9, -8)
  25. arch/x86/mm/pageattr.c (+25, -22)
  26. arch/x86/mm/srat.c (+122, -3)
  27. block/genhd.c (+10, -0)
  28. drivers/acpi/acpi_memhotplug.c (+6, -2)
  29. drivers/acpi/numa.c (+13, -10)
  30. drivers/acpi/processor_driver.c (+2, -0)
  31. drivers/base/memory.c (+6, -0)
  32. drivers/base/power/runtime.c (+88, -1)
  33. drivers/firmware/memmap.c (+183, -13)
  34. drivers/md/persistent-data/dm-transaction-manager.c (+7, -7)
  35. drivers/staging/zcache/zbud.c (+1, -1)
  36. drivers/staging/zsmalloc/zsmalloc-main.c (+1, -1)
  37. drivers/usb/core/hub.c (+13, -0)
  38. fs/aio.c (+5, -2)
  39. fs/buffer.c (+2, -2)
  40. fs/nfsd/nfs4state.c (+3, -3)
  41. fs/nfsd/nfsd.h (+3, -3)
  42. fs/nfsd/nfssvc.c (+3, -3)
  43. fs/proc/meminfo.c (+3, -3)
  44. include/linux/acpi.h (+8, -0)
  45. include/linux/bootmem.h (+1, -0)
  46. include/linux/compaction.h (+2, -3)
  47. include/linux/firmware-map.h (+6, -0)
  48. include/linux/highmem.h (+0, -6)
  49. include/linux/huge_mm.h (+1, -1)
  50. include/linux/hugetlb.h (+3, -3)
  51. include/linux/ksm.h (+4, -14)
  52. include/linux/memblock.h (+2, -0)
  53. include/linux/memcontrol.h (+0, -7)
  54. include/linux/memory_hotplug.h (+12, -8)
  55. include/linux/migrate.h (+6, -8)
  56. include/linux/mm.h (+99, -79)
  57. include/linux/mm_types.h (+5, -4)
  58. include/linux/mman.h (+3, -1)
  59. include/linux/mmzone.h (+33, -25)
  60. include/linux/page-flags-layout.h (+88, -0)
  61. include/linux/page-isolation.h (+19, -0)
  62. include/linux/pm.h (+1, -0)
  63. include/linux/pm_runtime.h (+3, -0)
  64. include/linux/rmap.h (+1, -1)
  65. include/linux/sched.h (+22, -0)
  66. include/linux/swap.h (+36, -13)
  67. include/linux/vm_event_item.h (+0, -1)
  68. include/linux/vmstat.h (+1, -1)
  69. ipc/shm.c (+7, -5)
  70. kernel/sched/core.c (+19, -9)
  71. kernel/sysctl.c (+0, -1)
  72. mm/Kconfig (+9, -1)
  73. mm/compaction.c (+18, -17)
  74. mm/fadvise.c (+16, -2)
  75. mm/fremap.c (+27, -24)
  76. mm/huge_memory.c (+24, -71)
  77. mm/hugetlb.c (+15, -19)
  78. mm/internal.h (+2, -2)
  79. mm/kmemleak.c (+2, -3)
  80. mm/ksm.c (+525, -124)
  81. mm/madvise.c (+101, -4)
  82. mm/memblock.c (+50, -0)
  83. mm/memcontrol.c (+268, -205)
  84. mm/memory-failure.c (+124, -78)
  85. mm/memory.c (+68, -57)
  86. mm/memory_hotplug.c (+494, -59)
  87. mm/mempolicy.c (+32, -27)
  88. mm/migrate.c (+68, -96)
  89. mm/mincore.c (+3, -2)
  90. mm/mlock.c (+30, -71)
  91. mm/mm_init.c (+19, -12)
  92. mm/mmap.c (+49, -34)
  93. mm/mmu_notifier.c (+44, -40)
  94. mm/mmzone.c (+19, -1)
  95. mm/mremap.c (+14, -13)
  96. mm/nommu.c (+17, -11)
  97. mm/oom_kill.c (+4, -2)
  98. mm/page-writeback.c (+3, -0)
  99. mm/page_alloc.c (+371, -68)
  100. mm/rmap.c (+3, -3)

+ 52 - 0
Documentation/ABI/testing/sysfs-kernel-mm-ksm

@@ -0,0 +1,52 @@
+What:		/sys/kernel/mm/ksm
+Date:		September 2009
+KernelVersion:	2.6.32
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Interface for Kernel Samepage Merging (KSM)
+
+What:		/sys/kernel/mm/ksm/full_scans
+What:		/sys/kernel/mm/ksm/pages_shared
+What:		/sys/kernel/mm/ksm/pages_sharing
+What:		/sys/kernel/mm/ksm/pages_to_scan
+What:		/sys/kernel/mm/ksm/pages_unshared
+What:		/sys/kernel/mm/ksm/pages_volatile
+What:		/sys/kernel/mm/ksm/run
+What:		/sys/kernel/mm/ksm/sleep_millisecs
+Date:		September 2009
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Kernel Samepage Merging daemon sysfs interface
+
+		full_scans: how many times all mergeable areas have been
+		scanned.
+
+		pages_shared: how many shared pages are being used.
+
+		pages_sharing: how many more sites are sharing them i.e. how
+		much saved.
+
+		pages_to_scan: how many present pages to scan before ksmd goes
+		to sleep.
+
+		pages_unshared: how many pages unique but repeatedly checked
+		for merging.
+
+		pages_volatile: how many pages changing too fast to be placed
+		in a tree.
+
+		run: write 0 to disable ksm, read 0 while ksm is disabled.
+			write 1 to run ksm, read 1 while ksm is running.
+			write 2 to disable ksm and unmerge all its pages.
+
+		sleep_millisecs: how many milliseconds ksm should sleep between
+		scans.
+
+		See Documentation/vm/ksm.txt for more information.
+
+What:		/sys/kernel/mm/ksm/merge_across_nodes
+Date:		January 2013
+KernelVersion:	3.9
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Control merging pages across different NUMA nodes.
+
+		When it is set to 0 only pages from the same node are merged,
+		otherwise pages from all nodes can be merged together (default).
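
Not part of the commit: a minimal userspace sketch that dumps a few of the knobs documented above, assuming only the standard /sys/kernel/mm/ksm paths listed in this file; error handling is kept to the bare minimum.

/* Minimal sketch: read and print a few KSM counters documented above. */
#include <stdio.h>

static long ksm_read(const char *name)
{
	char path[128];
	long val = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", name);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%ld", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(void)
{
	printf("run=%ld pages_shared=%ld pages_sharing=%ld merge_across_nodes=%ld\n",
	       ksm_read("run"), ksm_read("pages_shared"),
	       ksm_read("pages_sharing"), ksm_read("merge_across_nodes"));
	return 0;
}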

+ 36 - 0
Documentation/kernel-parameters.txt

@@ -1640,6 +1640,42 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			that the amount of memory usable for all allocations
 			is not too small.
 
+	movablemem_map=acpi
+			[KNL,X86,IA-64,PPC] This parameter is similar to
+			memmap except it specifies the memory map of
+			ZONE_MOVABLE.
+			This option informs the kernel to use the Hot Pluggable
+			bit in the SRAT flags from the ACPI BIOS to determine which
+			memory devices could be hotplugged. The corresponding
+			memory ranges will be set as ZONE_MOVABLE.
+			NOTE: Whatever node the kernel resides in will always
+			      be un-hotpluggable.
+
+	movablemem_map=nn[KMG]@ss[KMG]
+			[KNL,X86,IA-64,PPC] This parameter is similar to
+			memmap except it specifies the memory map of
+			ZONE_MOVABLE.
+			If user specifies memory ranges, the info in SRAT will
+			be ignored. And it works like the following:
+			- If more ranges are all within one node, then from
+			  lowest ss to the end of the node will be ZONE_MOVABLE.
+			- If a range is within a node, then from ss to the end
+			  of the node will be ZONE_MOVABLE.
+			- If a range covers two or more nodes, then from ss to
+			  the end of the 1st node will be ZONE_MOVABLE, and all
+			  the rest nodes will only have ZONE_MOVABLE.
+			If memmap is specified at the same time, the
+			movablemem_map will be limited within the memmap
+			areas. If kernelcore or movablecore is also specified,
+			movablemem_map will have higher priority to be
+			satisfied. So the administrator should be careful that
+			the amount of movablemem_map areas is not too large.
+			Otherwise kernel won't have enough memory to start.
+			NOTE: We don't stop users specifying the node the
+			      kernel resides in as hotpluggable so that this
+			      option can be used as a workaround of firmware
+                              bugs.
+
 	MTD_Partition=	[MTD]
 			Format: <name>,<region-number>,<size>,<offset>
 

+ 15 - 0
Documentation/vm/ksm.txt

@@ -58,6 +58,21 @@ sleep_millisecs  - how many milliseconds ksmd should sleep before next scan
                    e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs"
                    Default: 20 (chosen for demonstration purposes)
 
+merge_across_nodes - specifies if pages from different numa nodes can be merged.
+                   When set to 0, ksm merges only pages which physically
+                   reside in the memory area of same NUMA node. That brings
+                   lower latency to access of shared pages. Systems with more
+                   nodes, at significant NUMA distances, are likely to benefit
+                   from the lower latency of setting 0. Smaller systems, which
+                   need to minimize memory usage, are likely to benefit from
+                   the greater sharing of setting 1 (default). You may wish to
+                   compare how your system performs under each setting, before
+                   deciding on which to use. merge_across_nodes setting can be
+                   changed only when there are no ksm shared pages in system:
+                   set run 2 to unmerge pages first, then to 1 after changing
+                   merge_across_nodes, to remerge according to the new setting.
+                   Default: 1 (merging across nodes as in earlier releases)
+
 run              - set 0 to stop ksmd from running but keep merged pages,
                    set 1 to run ksmd e.g. "echo 1 > /sys/kernel/mm/ksm/run",
                    set 2 to stop ksmd and unmerge all pages currently merged,
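
Not part of the patch: a small userspace sketch of the switching sequence described above (unmerge with run=2, flip merge_across_nodes, then remerge with run=1). Paths are the standard sysfs files and the program must run as root; return values should be checked in real use.

#include <stdio.h>

static int ksm_write(const char *name, const char *val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", name);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	ksm_write("run", "2");                /* unmerge all KSM pages first */
	ksm_write("merge_across_nodes", "0"); /* then switch the policy      */
	ksm_write("run", "1");                /* and let ksmd remerge        */
	return 0;
}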

+ 3 - 0
arch/arm64/mm/mmu.c

@@ -434,4 +434,7 @@ int __meminit vmemmap_populate(struct page *start_page,
 	return 0;
 }
 #endif	/* CONFIG_ARM64_64K_PAGES */
+void vmemmap_free(struct page *memmap, unsigned long nr_pages)
+{
+}
 #endif	/* CONFIG_SPARSEMEM_VMEMMAP */

+ 1 - 1
arch/ia64/mm/contig.c

@@ -93,7 +93,7 @@ void show_mem(unsigned int filter)
 	printk(KERN_INFO "%d pages swap cached\n", total_cached);
 	printk(KERN_INFO "Total of %ld pages in page table cache\n",
 	       quicklist_total_size());
-	printk(KERN_INFO "%d free buffer pages\n", nr_free_buffer_pages());
+	printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages());
 }
 
 

+ 5 - 1
arch/ia64/mm/discontig.c

@@ -666,7 +666,7 @@ void show_mem(unsigned int filter)
 	printk(KERN_INFO "%d pages swap cached\n", total_cached);
 	printk(KERN_INFO "Total of %ld pages in page table cache\n",
 	       quicklist_total_size());
-	printk(KERN_INFO "%d free buffer pages\n", nr_free_buffer_pages());
+	printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages());
 }
 
 /**
@@ -822,4 +822,8 @@ int __meminit vmemmap_populate(struct page *start_page,
 {
 	return vmemmap_populate_basepages(start_page, size, node);
 }
+
+void vmemmap_free(struct page *memmap, unsigned long nr_pages)
+{
+}
 #endif

+ 18 - 0
arch/ia64/mm/init.c

@@ -688,6 +688,24 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
 	return ret;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	struct zone *zone;
+	int ret;
+
+	zone = page_zone(pfn_to_page(start_pfn));
+	ret = __remove_pages(zone, start_pfn, nr_pages);
+	if (ret)
+		pr_warn("%s: Problem encountered in __remove_pages() as"
+			" ret=%d\n", __func__,  ret);
+
+	return ret;
+}
+#endif
 #endif
 
 /*

+ 5 - 0
arch/powerpc/mm/init_64.c

@@ -297,5 +297,10 @@ int __meminit vmemmap_populate(struct page *start_page,
 
 	return 0;
 }
+
+void vmemmap_free(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 

+ 12 - 0
arch/powerpc/mm/mem.c

@@ -133,6 +133,18 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
 	return __add_pages(nid, zone, start_pfn, nr_pages);
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	struct zone *zone;
+
+	zone = page_zone(pfn_to_page(start_pfn));
+	return __remove_pages(zone, start_pfn, nr_pages);
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 /*

+ 12 - 0
arch/s390/mm/init.c

@@ -228,4 +228,16 @@ int arch_add_memory(int nid, u64 start, u64 size)
 		vmem_remove_mapping(start, size);
 	return rc;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+	/*
+	 * There is no hardware or firmware interface which could trigger a
+	 * hot memory remove on s390. So there is nothing that needs to be
+	 * implemented.
+	 */
+	return -EBUSY;
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */

+ 4 - 0
arch/s390/mm/vmem.c

@@ -268,6 +268,10 @@ out:
 	return ret;
 }
 
+void vmemmap_free(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 /*
  * Add memory segment to the segment list if it doesn't overlap with
  * an already present segment.

+ 17 - 0
arch/sh/mm/init.c

@@ -558,4 +558,21 @@ int memory_add_physaddr_to_nid(u64 addr)
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #endif
 
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	struct zone *zone;
+	int ret;
+
+	zone = page_zone(pfn_to_page(start_pfn));
+	ret = __remove_pages(zone, start_pfn, nr_pages);
+	if (unlikely(ret))
+		pr_warn("%s: Failed, __remove_pages() == %d\n", __func__,
+			ret);
+
+	return ret;
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */

+ 1 - 1
arch/sparc/mm/init_32.c

@@ -57,7 +57,7 @@ void show_mem(unsigned int filter)
 	printk("Mem-info:\n");
 	show_free_areas(filter);
 	printk("Free swap:       %6ldkB\n",
-	       nr_swap_pages << (PAGE_SHIFT-10));
+	       get_nr_swap_pages() << (PAGE_SHIFT-10));
 	printk("%ld pages of RAM\n", totalram_pages);
 	printk("%ld free pages\n", nr_free_pages());
 }

+ 5 - 0
arch/sparc/mm/init_64.c

@@ -2235,6 +2235,11 @@ void __meminit vmemmap_populate_print_last(void)
 		node_start = 0;
 	}
 }
+
+void vmemmap_free(struct page *memmap, unsigned long nr_pages)
+{
+}
+
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
 static void prot_init_common(unsigned long page_none,

+ 0 - 1
arch/tile/mm/elf.c

@@ -130,7 +130,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,
 	if (!retval) {
 		unsigned long addr = MEM_USER_INTRPT;
 		addr = mmap_region(NULL, addr, INTRPT_SIZE,
-				   MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE,
 				   VM_READ|VM_EXEC|
 				   VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0);
 		if (addr > (unsigned long) -PAGE_SIZE)

+ 8 - 0
arch/tile/mm/init.c

@@ -935,6 +935,14 @@ int remove_memory(u64 start, u64 size)
 {
 	return -EINVAL;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+	/* TODO */
+	return -EBUSY;
+}
+#endif
 #endif
 
 struct kmem_cache *pgd_cache;

+ 1 - 1
arch/tile/mm/pgtable.c

@@ -61,7 +61,7 @@ void show_mem(unsigned int filter)
 	       global_page_state(NR_PAGETABLE),
 	       global_page_state(NR_BOUNCE),
 	       global_page_state(NR_FILE_PAGES),
-	       nr_swap_pages);
+	       get_nr_swap_pages());
 
 	for_each_zone(zone) {
 		unsigned long flags, order, total = 0, largest_order = -1;

+ 2 - 2
arch/x86/include/asm/numa.h

@@ -57,8 +57,8 @@ static inline int numa_cpu_node(int cpu)
 #endif
 
 #ifdef CONFIG_NUMA
-extern void __cpuinit numa_set_node(int cpu, int node);
-extern void __cpuinit numa_clear_node(int cpu);
+extern void numa_set_node(int cpu, int node);
+extern void numa_clear_node(int cpu);
 extern void __init init_cpu_to_node(void);
 extern void __cpuinit numa_add_cpu(int cpu);
 extern void __cpuinit numa_remove_cpu(int cpu);

+ 1 - 0
arch/x86/include/asm/pgtable_types.h

@@ -351,6 +351,7 @@ static inline void update_page_count(int level, unsigned long pages) { }
  * as a pte too.
  */
 extern pte_t *lookup_address(unsigned long address, unsigned int *level);
+extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase);
 extern phys_addr_t slow_virt_to_phys(void *__address);
 
 #endif	/* !__ASSEMBLY__ */

+ 4 - 0
arch/x86/kernel/acpi/boot.c

@@ -696,6 +696,10 @@ EXPORT_SYMBOL(acpi_map_lsapic);
 
 int acpi_unmap_lsapic(int cpu)
 {
+#ifdef CONFIG_ACPI_NUMA
+	set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
+#endif
+
 	per_cpu(x86_cpu_to_apicid, cpu) = -1;
 	set_cpu_present(cpu, false);
 	num_processors--;

+ 9 - 4
arch/x86/kernel/setup.c

@@ -1056,6 +1056,15 @@ void __init setup_arch(char **cmdline_p)
 	setup_bios_corruption_check();
 #endif
 
+	/*
+	 * In the memory hotplug case, the kernel needs info from SRAT to
+	 * determine which memory is hotpluggable before allocating memory
+	 * using memblock.
+	 */
+	acpi_boot_table_init();
+	early_acpi_boot_init();
+	early_parse_srat();
+
 #ifdef CONFIG_X86_32
 	printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
 			(max_pfn_mapped<<PAGE_SHIFT) - 1);
@@ -1101,10 +1110,6 @@ void __init setup_arch(char **cmdline_p)
 	/*
 	 * Parse the ACPI tables for possible boot-time SMP configuration.
 	 */
-	acpi_boot_table_init();
-
-	early_acpi_boot_init();
-
 	initmem_init();
 	memblock_find_dma_reserve();
 

+ 12 - 0
arch/x86/mm/init_32.c

@@ -862,6 +862,18 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
 	return __add_pages(nid, zone, start_pfn, nr_pages);
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	struct zone *zone;
+
+	zone = page_zone(pfn_to_page(start_pfn));
+	return __remove_pages(zone, start_pfn, nr_pages);
+}
+#endif
 #endif
 
 /*

+ 397 - 0
arch/x86/mm/init_64.c

@@ -707,6 +707,343 @@ int arch_add_memory(int nid, u64 start, u64 size)
 }
 EXPORT_SYMBOL_GPL(arch_add_memory);
 
+#define PAGE_INUSE 0xFD
+
+static void __meminit free_pagetable(struct page *page, int order)
+{
+	struct zone *zone;
+	bool bootmem = false;
+	unsigned long magic;
+	unsigned int nr_pages = 1 << order;
+
+	/* bootmem page has reserved flag */
+	if (PageReserved(page)) {
+		__ClearPageReserved(page);
+		bootmem = true;
+
+		magic = (unsigned long)page->lru.next;
+		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
+			while (nr_pages--)
+				put_page_bootmem(page++);
+		} else
+			__free_pages_bootmem(page, order);
+	} else
+		free_pages((unsigned long)page_address(page), order);
+
+	/*
+	 * SECTION_INFO pages and MIX_SECTION_INFO pages
+	 * are all allocated by bootmem.
+	 */
+	if (bootmem) {
+		zone = page_zone(page);
+		zone_span_writelock(zone);
+		zone->present_pages += nr_pages;
+		zone_span_writeunlock(zone);
+		totalram_pages += nr_pages;
+	}
+}
+
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+	pte_t *pte;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		pte = pte_start + i;
+		if (pte_val(*pte))
+			return;
+	}
+
+	/* free a pte table */
+	free_pagetable(pmd_page(*pmd), 0);
+	spin_lock(&init_mm.page_table_lock);
+	pmd_clear(pmd);
+	spin_unlock(&init_mm.page_table_lock);
+}
+
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+	pmd_t *pmd;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd = pmd_start + i;
+		if (pmd_val(*pmd))
+			return;
+	}
+
+	/* free a pmd table */
+	free_pagetable(pud_page(*pud), 0);
+	spin_lock(&init_mm.page_table_lock);
+	pud_clear(pud);
+	spin_unlock(&init_mm.page_table_lock);
+}
+
+/* Return true if pgd is changed, otherwise return false. */
+static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd)
+{
+	pud_t *pud;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		pud = pud_start + i;
+		if (pud_val(*pud))
+			return false;
+	}
+
+	/* free a pud table */
+	free_pagetable(pgd_page(*pgd), 0);
+	spin_lock(&init_mm.page_table_lock);
+	pgd_clear(pgd);
+	spin_unlock(&init_mm.page_table_lock);
+
+	return true;
+}
+
+static void __meminit
+remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
+		 bool direct)
+{
+	unsigned long next, pages = 0;
+	pte_t *pte;
+	void *page_addr;
+	phys_addr_t phys_addr;
+
+	pte = pte_start + pte_index(addr);
+	for (; addr < end; addr = next, pte++) {
+		next = (addr + PAGE_SIZE) & PAGE_MASK;
+		if (next > end)
+			next = end;
+
+		if (!pte_present(*pte))
+			continue;
+
+		/*
+		 * We mapped [0,1G) memory as identity mapping when
+		 * initializing, in arch/x86/kernel/head_64.S. These
+		 * pagetables cannot be removed.
+		 */
+		phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
+		if (phys_addr < (phys_addr_t)0x40000000)
+			return;
+
+		if (IS_ALIGNED(addr, PAGE_SIZE) &&
+		    IS_ALIGNED(next, PAGE_SIZE)) {
+			/*
+			 * Do not free direct mapping pages since they were
+			 * freed when offlining, or simply not in use.
+			 */
+			if (!direct)
+				free_pagetable(pte_page(*pte), 0);
+
+			spin_lock(&init_mm.page_table_lock);
+			pte_clear(&init_mm, addr, pte);
+			spin_unlock(&init_mm.page_table_lock);
+
+			/* For non-direct mapping, pages means nothing. */
+			pages++;
+		} else {
+			/*
+			 * If we are here, we are freeing vmemmap pages since
+			 * direct mapped memory ranges to be freed are aligned.
+			 *
+			 * If we are not removing the whole page, it means
+			 * other page structs in this page are being used and
+		 * we cannot remove them. So fill the unused page_structs
+			 * with 0xFD, and remove the page when it is wholly
+			 * filled with 0xFD.
+			 */
+			memset((void *)addr, PAGE_INUSE, next - addr);
+
+			page_addr = page_address(pte_page(*pte));
+			if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
+				free_pagetable(pte_page(*pte), 0);
+
+				spin_lock(&init_mm.page_table_lock);
+				pte_clear(&init_mm, addr, pte);
+				spin_unlock(&init_mm.page_table_lock);
+			}
+		}
+	}
+
+	/* Call free_pte_table() in remove_pmd_table(). */
+	flush_tlb_all();
+	if (direct)
+		update_page_count(PG_LEVEL_4K, -pages);
+}
+
+static void __meminit
+remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
+		 bool direct)
+{
+	unsigned long next, pages = 0;
+	pte_t *pte_base;
+	pmd_t *pmd;
+	void *page_addr;
+
+	pmd = pmd_start + pmd_index(addr);
+	for (; addr < end; addr = next, pmd++) {
+		next = pmd_addr_end(addr, end);
+
+		if (!pmd_present(*pmd))
+			continue;
+
+		if (pmd_large(*pmd)) {
+			if (IS_ALIGNED(addr, PMD_SIZE) &&
+			    IS_ALIGNED(next, PMD_SIZE)) {
+				if (!direct)
+					free_pagetable(pmd_page(*pmd),
+						       get_order(PMD_SIZE));
+
+				spin_lock(&init_mm.page_table_lock);
+				pmd_clear(pmd);
+				spin_unlock(&init_mm.page_table_lock);
+				pages++;
+			} else {
+				/* If here, we are freeing vmemmap pages. */
+				memset((void *)addr, PAGE_INUSE, next - addr);
+
+				page_addr = page_address(pmd_page(*pmd));
+				if (!memchr_inv(page_addr, PAGE_INUSE,
+						PMD_SIZE)) {
+					free_pagetable(pmd_page(*pmd),
+						       get_order(PMD_SIZE));
+
+					spin_lock(&init_mm.page_table_lock);
+					pmd_clear(pmd);
+					spin_unlock(&init_mm.page_table_lock);
+				}
+			}
+
+			continue;
+		}
+
+		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
+		remove_pte_table(pte_base, addr, next, direct);
+		free_pte_table(pte_base, pmd);
+	}
+
+	/* Call free_pmd_table() in remove_pud_table(). */
+	if (direct)
+		update_page_count(PG_LEVEL_2M, -pages);
+}
+
+static void __meminit
+remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
+		 bool direct)
+{
+	unsigned long next, pages = 0;
+	pmd_t *pmd_base;
+	pud_t *pud;
+	void *page_addr;
+
+	pud = pud_start + pud_index(addr);
+	for (; addr < end; addr = next, pud++) {
+		next = pud_addr_end(addr, end);
+
+		if (!pud_present(*pud))
+			continue;
+
+		if (pud_large(*pud)) {
+			if (IS_ALIGNED(addr, PUD_SIZE) &&
+			    IS_ALIGNED(next, PUD_SIZE)) {
+				if (!direct)
+					free_pagetable(pud_page(*pud),
+						       get_order(PUD_SIZE));
+
+				spin_lock(&init_mm.page_table_lock);
+				pud_clear(pud);
+				spin_unlock(&init_mm.page_table_lock);
+				pages++;
+			} else {
+				/* If here, we are freeing vmemmap pages. */
+				memset((void *)addr, PAGE_INUSE, next - addr);
+
+				page_addr = page_address(pud_page(*pud));
+				if (!memchr_inv(page_addr, PAGE_INUSE,
+						PUD_SIZE)) {
+					free_pagetable(pud_page(*pud),
+						       get_order(PUD_SIZE));
+
+					spin_lock(&init_mm.page_table_lock);
+					pud_clear(pud);
+					spin_unlock(&init_mm.page_table_lock);
+				}
+			}
+
+			continue;
+		}
+
+		pmd_base = (pmd_t *)pud_page_vaddr(*pud);
+		remove_pmd_table(pmd_base, addr, next, direct);
+		free_pmd_table(pmd_base, pud);
+	}
+
+	if (direct)
+		update_page_count(PG_LEVEL_1G, -pages);
+}
+
+/* start and end are both virtual address. */
+static void __meminit
+remove_pagetable(unsigned long start, unsigned long end, bool direct)
+{
+	unsigned long next;
+	pgd_t *pgd;
+	pud_t *pud;
+	bool pgd_changed = false;
+
+	for (; start < end; start = next) {
+		next = pgd_addr_end(start, end);
+
+		pgd = pgd_offset_k(start);
+		if (!pgd_present(*pgd))
+			continue;
+
+		pud = (pud_t *)pgd_page_vaddr(*pgd);
+		remove_pud_table(pud, start, next, direct);
+		if (free_pud_table(pud, pgd))
+			pgd_changed = true;
+	}
+
+	if (pgd_changed)
+		sync_global_pgds(start, end - 1);
+
+	flush_tlb_all();
+}
+
+void __ref vmemmap_free(struct page *memmap, unsigned long nr_pages)
+{
+	unsigned long start = (unsigned long)memmap;
+	unsigned long end = (unsigned long)(memmap + nr_pages);
+
+	remove_pagetable(start, end, false);
+}
+
+static void __meminit
+kernel_physical_mapping_remove(unsigned long start, unsigned long end)
+{
+	start = (unsigned long)__va(start);
+	end = (unsigned long)__va(end);
+
+	remove_pagetable(start, end, true);
+}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int __ref arch_remove_memory(u64 start, u64 size)
+{
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	struct zone *zone;
+	int ret;
+
+	zone = page_zone(pfn_to_page(start_pfn));
+	kernel_physical_mapping_remove(start, start + size);
+	ret = __remove_pages(zone, start_pfn, nr_pages);
+	WARN_ON_ONCE(ret);
+
+	return ret;
+}
+#endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 static struct kcore_list kcore_vsyscall;
@@ -1019,6 +1356,66 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
 	return 0;
 }
 
+#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
+void register_page_bootmem_memmap(unsigned long section_nr,
+				  struct page *start_page, unsigned long size)
+{
+	unsigned long addr = (unsigned long)start_page;
+	unsigned long end = (unsigned long)(start_page + size);
+	unsigned long next;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	unsigned int nr_pages;
+	struct page *page;
+
+	for (; addr < end; addr = next) {
+		pte_t *pte = NULL;
+
+		pgd = pgd_offset_k(addr);
+		if (pgd_none(*pgd)) {
+			next = (addr + PAGE_SIZE) & PAGE_MASK;
+			continue;
+		}
+		get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);
+
+		pud = pud_offset(pgd, addr);
+		if (pud_none(*pud)) {
+			next = (addr + PAGE_SIZE) & PAGE_MASK;
+			continue;
+		}
+		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
+
+		if (!cpu_has_pse) {
+			next = (addr + PAGE_SIZE) & PAGE_MASK;
+			pmd = pmd_offset(pud, addr);
+			if (pmd_none(*pmd))
+				continue;
+			get_page_bootmem(section_nr, pmd_page(*pmd),
+					 MIX_SECTION_INFO);
+
+			pte = pte_offset_kernel(pmd, addr);
+			if (pte_none(*pte))
+				continue;
+			get_page_bootmem(section_nr, pte_page(*pte),
+					 SECTION_INFO);
+		} else {
+			next = pmd_addr_end(addr, end);
+
+			pmd = pmd_offset(pud, addr);
+			if (pmd_none(*pmd))
+				continue;
+
+			nr_pages = 1 << (get_order(PMD_SIZE));
+			page = pmd_page(*pmd);
+			while (nr_pages--)
+				get_page_bootmem(section_nr, page++,
+						 SECTION_INFO);
+		}
+	}
+}
+#endif
+
 void __meminit vmemmap_populate_print_last(void)
 {
 	if (p_start) {

+ 9 - 8
arch/x86/mm/numa.c

@@ -56,7 +56,7 @@ early_param("numa", numa_setup);
 /*
  * apicid, cpu, node mappings
  */
-s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
+s16 __apicid_to_node[MAX_LOCAL_APIC] = {
 	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
 };
 
@@ -78,7 +78,7 @@ EXPORT_SYMBOL(node_to_cpumask_map);
 DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
 EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
 
-void __cpuinit numa_set_node(int cpu, int node)
+void numa_set_node(int cpu, int node)
 {
 	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
 
@@ -101,7 +101,7 @@ void __cpuinit numa_set_node(int cpu, int node)
 		set_cpu_numa_node(cpu, node);
 }
 
-void __cpuinit numa_clear_node(int cpu)
+void numa_clear_node(int cpu)
 {
 	numa_set_node(cpu, NUMA_NO_NODE);
 }
@@ -213,10 +213,9 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
 	 * Allocate node data.  Try node-local memory and then any node.
 	 * Never allocate in DMA zone.
 	 */
-	nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+	nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
 	if (!nd_pa) {
-		pr_err("Cannot find %zu bytes in node %d\n",
-		       nd_size, nid);
+		pr_err("Cannot find %zu bytes in any node\n", nd_size);
 		return;
 	}
 	nd = __va(nd_pa);
@@ -561,10 +560,12 @@ static int __init numa_init(int (*init_func)(void))
 	for (i = 0; i < MAX_LOCAL_APIC; i++)
 		set_apicid_to_node(i, NUMA_NO_NODE);
 
-	nodes_clear(numa_nodes_parsed);
+	/*
+	 * Do not clear numa_nodes_parsed or zero numa_meminfo here, because
+	 * SRAT was parsed earlier in early_parse_srat().
+	 */
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
-	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
 	WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES));
 	numa_reset_distance();
 

+ 25 - 22
arch/x86/mm/pageattr.c

@@ -529,21 +529,13 @@ out_unlock:
 	return do_split;
 }
 
-static int split_large_page(pte_t *kpte, unsigned long address)
+int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
 {
 	unsigned long pfn, pfninc = 1;
 	unsigned int i, level;
-	pte_t *pbase, *tmp;
+	pte_t *tmp;
 	pgprot_t ref_prot;
-	struct page *base;
-
-	if (!debug_pagealloc)
-		spin_unlock(&cpa_lock);
-	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
-	if (!debug_pagealloc)
-		spin_lock(&cpa_lock);
-	if (!base)
-		return -ENOMEM;
+	struct page *base = virt_to_page(pbase);
 
 	spin_lock(&pgd_lock);
 	/*
@@ -551,10 +543,11 @@ static int split_large_page(pte_t *kpte, unsigned long address)
 	 * up for us already:
 	 */
 	tmp = lookup_address(address, &level);
-	if (tmp != kpte)
-		goto out_unlock;
+	if (tmp != kpte) {
+		spin_unlock(&pgd_lock);
+		return 1;
+	}
 
-	pbase = (pte_t *)page_address(base);
 	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
 	ref_prot = pte_pgprot(pte_clrhuge(*kpte));
 	/*
@@ -601,17 +594,27 @@ static int split_large_page(pte_t *kpte, unsigned long address)
 	 * going on.
 	 */
 	__flush_tlb_all();
+	spin_unlock(&pgd_lock);
 
-	base = NULL;
+	return 0;
+}
 
-out_unlock:
-	/*
-	 * If we dropped out via the lookup_address check under
-	 * pgd_lock then stick the page back into the pool:
-	 */
-	if (base)
+static int split_large_page(pte_t *kpte, unsigned long address)
+{
+	pte_t *pbase;
+	struct page *base;
+
+	if (!debug_pagealloc)
+		spin_unlock(&cpa_lock);
+	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+	if (!debug_pagealloc)
+		spin_lock(&cpa_lock);
+	if (!base)
+		return -ENOMEM;
+
+	pbase = (pte_t *)page_address(base);
+	if (__split_large_page(kpte, address, pbase))
 		__free_page(base);
-	spin_unlock(&pgd_lock);
 
 	return 0;
 }

+ 122 - 3
arch/x86/mm/srat.c

@@ -141,11 +141,126 @@ static inline int save_add_info(void) {return 1;}
 static inline int save_add_info(void) {return 0;}
 #endif
 
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+static void __init
+handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable)
+{
+	int overlap, i;
+	unsigned long start_pfn, end_pfn;
+
+	start_pfn = PFN_DOWN(start);
+	end_pfn = PFN_UP(end);
+
+	/*
+	 * For movablemem_map=acpi:
+	 *
+	 * SRAT:		|_____| |_____| |_________| |_________| ......
+	 * node id:                0       1         1           2
+	 * hotpluggable:	   n       y         y           n
+	 * movablemem_map:	        |_____| |_________|
+	 *
+	 * Using movablemem_map, we can prevent memblock from allocating memory
+	 * on ZONE_MOVABLE at boot time.
+	 *
+	 * Before parsing SRAT, memblock has already reserved some memory ranges
+	 * for other purposes, such as for kernel image. We cannot prevent
+	 * kernel from using this memory, so we need to exclude this memory
+	 * even if it is hotpluggable.
+	 * Furthermore, to ensure the kernel has enough memory to boot, we make
+	 * all the memory on the node which the kernel resides in
+	 * un-hotpluggable.
+	 */
+	if (hotpluggable && movablemem_map.acpi) {
+		/* Exclude ranges reserved by memblock. */
+		struct memblock_type *rgn = &memblock.reserved;
+
+		for (i = 0; i < rgn->cnt; i++) {
+			if (end <= rgn->regions[i].base ||
+			    start >= rgn->regions[i].base +
+			    rgn->regions[i].size)
+				continue;
+
+			/*
+			 * If the memory range overlaps the memory reserved by
+			 * memblock, then the kernel resides in this node.
+			 */
+			node_set(node, movablemem_map.numa_nodes_kernel);
+
+			goto out;
+		}
+
+		/*
+		 * If the kernel resides in this node, then the whole node
+		 * should not be hotpluggable.
+		 */
+		if (node_isset(node, movablemem_map.numa_nodes_kernel))
+			goto out;
+
+		insert_movablemem_map(start_pfn, end_pfn);
+
+		/*
+		 * numa_nodes_hotplug nodemask represents which nodes are put
+		 * into movablemem_map.map[].
+		 */
+		node_set(node, movablemem_map.numa_nodes_hotplug);
+		goto out;
+	}
+
+	/*
+	 * For movablemem_map=nn[KMG]@ss[KMG]:
+	 *
+	 * SRAT:		|_____| |_____| |_________| |_________| ......
+	 * node id:		   0       1         1           2
+	 * user specified:	          |__|                 |___|
+	 * movablemem_map:		  |___| |_________|    |______| ......
+	 *
+	 * Using movablemem_map, we can prevent memblock from allocating memory
+	 * on ZONE_MOVABLE at boot time.
+	 *
+	 * NOTE: In this case, SRAT info will be ignored.
+	 */
+	overlap = movablemem_map_overlap(start_pfn, end_pfn);
+	if (overlap >= 0) {
+		/*
+		 * If part of this range is in movablemem_map, we need to
+		 * add the range after it to extend the range to the end
+		 * of the node, because from the min address specified to
+		 * the end of the node will be ZONE_MOVABLE.
+		 */
+		start_pfn = max(start_pfn,
+			    movablemem_map.map[overlap].start_pfn);
+		insert_movablemem_map(start_pfn, end_pfn);
+
+		/*
+		 * Set the nodemask, so that if the address range on one node
+		 * is not contiguous, we can add the subsequent ranges on the
+		 * same node into movablemem_map.
+		 */
+		node_set(node, movablemem_map.numa_nodes_hotplug);
+	} else {
+		if (node_isset(node, movablemem_map.numa_nodes_hotplug))
+			/*
+			 * Insert the range if we already have movable ranges
+			 * on the same node.
+			 */
+			insert_movablemem_map(start_pfn, end_pfn);
+	}
+out:
+	return;
+}
+#else		/* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+static inline void
+handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable)
+{
+}
+#endif		/* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+
 /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
 int __init
 acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 {
 	u64 start, end;
+	u32 hotpluggable;
 	int node, pxm;
 
 	if (srat_disabled())
@@ -154,7 +269,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 		goto out_err_bad_srat;
 	if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
 		goto out_err;
-	if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
+	hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE;
+	if (hotpluggable && !save_add_info())
 		goto out_err;
 
 	start = ma->base_address;
@@ -174,9 +290,12 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 
 	node_set(node, numa_nodes_parsed);
 
-	printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n",
+	printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx] %s\n",
 	       node, pxm,
-	       (unsigned long long) start, (unsigned long long) end - 1);
+	       (unsigned long long) start, (unsigned long long) end - 1,
+	       hotpluggable ? "Hot Pluggable": "");
+
+	handle_movablemem(node, start, end, hotpluggable);
 
 	return 0;
 out_err_bad_srat:

+ 10 - 0
block/genhd.c

@@ -18,6 +18,7 @@
 #include <linux/mutex.h>
 #include <linux/idr.h>
 #include <linux/log2.h>
+#include <linux/pm_runtime.h>
 
 #include "blk.h"
 
@@ -534,6 +535,14 @@ static void register_disk(struct gendisk *disk)
 			return;
 		}
 	}
+
+	/*
+	 * avoid probable deadlock caused by allocating memory with
+	 * GFP_KERNEL in runtime_resume callback of its all ancestor
+	 * GFP_KERNEL in the runtime_resume callback of all its ancestor
+	 */
+	pm_runtime_set_memalloc_noio(ddev, true);
+
 	disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
 	disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
 
@@ -663,6 +672,7 @@ void del_gendisk(struct gendisk *disk)
 	disk->driverfs_dev = NULL;
 	if (!sysfs_deprecated)
 		sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
+	pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
 	device_del(disk_to_dev(disk));
 }
 EXPORT_SYMBOL(del_gendisk);

+ 6 - 2
drivers/acpi/acpi_memhotplug.c

@@ -280,9 +280,11 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 
 static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
 {
-	int result = 0;
+	int result = 0, nid;
 	struct acpi_memory_info *info, *n;
 
+	nid = acpi_get_node(mem_device->device->handle);
+
 	list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
 		if (info->failed)
 			/* The kernel does not use this memory block */
@@ -295,7 +297,9 @@ static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
 			 */
 			return -EBUSY;
 
-		result = remove_memory(info->start_addr, info->length);
+		if (nid < 0)
+			nid = memory_add_physaddr_to_nid(info->start_addr);
+		result = remove_memory(nid, info->start_addr, info->length);
 		if (result)
 			return result;
 

+ 13 - 10
drivers/acpi/numa.c

@@ -282,10 +282,10 @@ acpi_table_parse_srat(enum acpi_srat_type id,
 					    handler, max_entries);
 }
 
-int __init acpi_numa_init(void)
-{
-	int cnt = 0;
+static int srat_mem_cnt;
 
+void __init early_parse_srat(void)
+{
 	/*
 	 * Should not limit number with cpu num that is from NR_CPUS or nr_cpus=
 	 * SRAT cpu entries could have different order with that in MADT.
@@ -295,21 +295,24 @@ int __init acpi_numa_init(void)
 	/* SRAT: Static Resource Affinity Table */
 	if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
 		acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY,
-				     acpi_parse_x2apic_affinity, 0);
+				      acpi_parse_x2apic_affinity, 0);
 		acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
-				     acpi_parse_processor_affinity, 0);
-		cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
-					    acpi_parse_memory_affinity,
-					    NR_NODE_MEMBLKS);
+				      acpi_parse_processor_affinity, 0);
+		srat_mem_cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
+						     acpi_parse_memory_affinity,
+						     NR_NODE_MEMBLKS);
 	}
+}
 
+int __init acpi_numa_init(void)
+{
 	/* SLIT: System Locality Information Table */
 	acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit);
 
 	acpi_numa_arch_fixup();
 
-	if (cnt < 0)
-		return cnt;
+	if (srat_mem_cnt < 0)
+		return srat_mem_cnt;
 	else if (!parsed_numa_memblks)
 		return -ENOENT;
 	return 0;

+ 2 - 0
drivers/acpi/processor_driver.c

@@ -45,6 +45,7 @@
 #include <linux/cpuidle.h>
 #include <linux/slab.h>
 #include <linux/acpi.h>
+#include <linux/memory_hotplug.h>
 
 #include <asm/io.h>
 #include <asm/cpu.h>
@@ -641,6 +642,7 @@ static int acpi_processor_remove(struct acpi_device *device)
 
 	per_cpu(processors, pr->id) = NULL;
 	per_cpu(processor_device_array, pr->id) = NULL;
+	try_offline_node(cpu_to_node(pr->id));
 
 free:
 	free_cpumask_var(pr->throttling.shared_cpu_map);

+ 6 - 0
drivers/base/memory.c

@@ -693,6 +693,12 @@ int offline_memory_block(struct memory_block *mem)
 	return ret;
 }
 
+/* return true if the memory block is offlined, otherwise, return false */
+bool is_memblock_offlined(struct memory_block *mem)
+{
+	return mem->state == MEM_OFFLINE;
+}
+
 /*
  * Initialize the sysfs support for memory devices...
  */

+ 88 - 1
drivers/base/power/runtime.c

@@ -124,6 +124,76 @@ unsigned long pm_runtime_autosuspend_expiration(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(pm_runtime_autosuspend_expiration);
 
+static int dev_memalloc_noio(struct device *dev, void *data)
+{
+	return dev->power.memalloc_noio;
+}
+
+/*
+ * pm_runtime_set_memalloc_noio - Set a device's memalloc_noio flag.
+ * @dev: Device to handle.
+ * @enable: True for setting the flag and False for clearing the flag.
+ *
+ * Set the flag for all devices in the path from the device to the
+ * root device in the device tree if @enable is true, otherwise clear
+ * the flag for devices in the path whose siblings don't set the flag.
+ *
+ * The function should only be called by block device, or network
+ * device driver for solving the deadlock problem during runtime
+ * resume/suspend:
+ *
+ *     If memory allocation with GFP_KERNEL is called inside runtime
+ *     resume/suspend callback of any one of its ancestors(or the
+ *     block device itself), the deadlock may be triggered inside the
+ *     memory allocation since it might not complete until the block
+ *     device becomes active and the involved page I/O finishes. The
+ *     situation was first pointed out by Alan Stern. Network devices
+ *     are involved in the iSCSI kind of situation.
+ *
+ * The lock of dev_hotplug_mutex is held in the function for handling
+ * hotplug race because pm_runtime_set_memalloc_noio() may be called
+ * in async probe().
+ *
+ * The function should be called between device_add() and device_del()
+ * on the affected device(block/network device).
+ */
+void pm_runtime_set_memalloc_noio(struct device *dev, bool enable)
+{
+	static DEFINE_MUTEX(dev_hotplug_mutex);
+
+	mutex_lock(&dev_hotplug_mutex);
+	for (;;) {
+		bool enabled;
+
+		/* hold power lock since bitfield is not SMP-safe. */
+		spin_lock_irq(&dev->power.lock);
+		enabled = dev->power.memalloc_noio;
+		dev->power.memalloc_noio = enable;
+		spin_unlock_irq(&dev->power.lock);
+
+		/*
+		 * no need to enable ancestors any more if the device
+		 * has been enabled.
+		 */
+		if (enabled && enable)
+			break;
+
+		dev = dev->parent;
+
+		/*
+		 * clear flag of the parent device only if all the
+		 * children don't set the flag because ancestor's
+		 * flag was set by any one of the descendants.
+		 */
+		if (!dev || (!enable &&
+			     device_for_each_child(dev, NULL,
+						   dev_memalloc_noio)))
+			break;
+	}
+	mutex_unlock(&dev_hotplug_mutex);
+}
+EXPORT_SYMBOL_GPL(pm_runtime_set_memalloc_noio);
+
 /**
  * rpm_check_suspend_allowed - Test whether a device may be suspended.
  * @dev: Device to test.
@@ -278,7 +348,24 @@ static int rpm_callback(int (*cb)(struct device *), struct device *dev)
 	if (!cb)
 		return -ENOSYS;
 
-	retval = __rpm_callback(cb, dev);
+	if (dev->power.memalloc_noio) {
+		unsigned int noio_flag;
+
+		/*
+		 * Deadlock might be caused if memory allocation with
+		 * GFP_KERNEL happens inside runtime_suspend and
+		 * runtime_resume callbacks of one block device's
+		 * ancestor or the block device itself. Network
+		 * device might be thought as part of iSCSI block
+		 * device, so network device and its ancestor should
+		 * be marked as memalloc_noio too.
+		 */
+		noio_flag = memalloc_noio_save();
+		retval = __rpm_callback(cb, dev);
+		memalloc_noio_restore(noio_flag);
+	} else {
+		retval = __rpm_callback(cb, dev);
+	}
 
 	dev->power.runtime_error = retval;
 	return retval != -EACCES ? retval : -EIO;
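
A minimal sketch (not from this commit) of the call pattern the comment above prescribes, mirroring the genhd.c hunk earlier: a hypothetical block-device driver, with example_add_disk_dev()/example_del_disk_dev() as invented names, sets the flag after device_add() and clears it before device_del().

#include <linux/device.h>
#include <linux/pm_runtime.h>

static int example_add_disk_dev(struct device *dev)
{
	int ret;

	ret = device_add(dev);
	if (ret)
		return ret;

	/* runtime resume of this device may now be needed to complete page I/O */
	pm_runtime_set_memalloc_noio(dev, true);
	return 0;
}

static void example_del_disk_dev(struct device *dev)
{
	/* clear the flag while the device is still registered */
	pm_runtime_set_memalloc_noio(dev, false);
	device_del(dev);
}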

+ 183 - 13
drivers/firmware/memmap.c

@@ -21,6 +21,7 @@
 #include <linux/types.h>
 #include <linux/bootmem.h>
 #include <linux/slab.h>
+#include <linux/mm.h>
 
 /*
  * Data types ------------------------------------------------------------------
@@ -52,6 +53,9 @@ static ssize_t start_show(struct firmware_map_entry *entry, char *buf);
 static ssize_t end_show(struct firmware_map_entry *entry, char *buf);
 static ssize_t type_show(struct firmware_map_entry *entry, char *buf);
 
+static struct firmware_map_entry * __meminit
+firmware_map_find_entry(u64 start, u64 end, const char *type);
+
 /*
  * Static data -----------------------------------------------------------------
  */
@@ -79,7 +83,52 @@ static const struct sysfs_ops memmap_attr_ops = {
 	.show = memmap_attr_show,
 };
 
-static struct kobj_type memmap_ktype = {
+/* Firmware memory map entries. */
+static LIST_HEAD(map_entries);
+static DEFINE_SPINLOCK(map_entries_lock);
+
+/*
+ * For memory hotplug, there is no way to free memory map entries allocated
+ * by boot mem after the system is up. So when we hot-remove memory whose
+ * map entry is allocated by bootmem, we need to remember the storage and
+ * reuse it when the memory is hot-added again.
+ */
+static LIST_HEAD(map_entries_bootmem);
+static DEFINE_SPINLOCK(map_entries_bootmem_lock);
+
+
+static inline struct firmware_map_entry *
+to_memmap_entry(struct kobject *kobj)
+{
+	return container_of(kobj, struct firmware_map_entry, kobj);
+}
+
+static void __meminit release_firmware_map_entry(struct kobject *kobj)
+{
+	struct firmware_map_entry *entry = to_memmap_entry(kobj);
+
+	if (PageReserved(virt_to_page(entry))) {
+		/*
+		 * Remember the storage allocated by bootmem, and reuse it when
+		 * the memory is hot-added again. The entry will be added to
+		 * map_entries_bootmem here, and deleted from &map_entries in
+		 * firmware_map_remove_entry().
+		 */
+		if (firmware_map_find_entry(entry->start, entry->end,
+		    entry->type)) {
+			spin_lock(&map_entries_bootmem_lock);
+			list_add(&entry->list, &map_entries_bootmem);
+			spin_unlock(&map_entries_bootmem_lock);
+		}
+
+		return;
+	}
+
+	kfree(entry);
+}
+
+static struct kobj_type __refdata memmap_ktype = {
+	.release	= release_firmware_map_entry,
 	.sysfs_ops	= &memmap_attr_ops,
 	.default_attrs	= def_attrs,
 };
@@ -88,13 +137,6 @@ static struct kobj_type memmap_ktype = {
  * Registration functions ------------------------------------------------------
  */
 
-/*
- * Firmware memory map entries. No locking is needed because the
- * firmware_map_add() and firmware_map_add_early() functions are called
- * in firmware initialisation code in one single thread of execution.
- */
-static LIST_HEAD(map_entries);
-
 /**
  * firmware_map_add_entry() - Does the real work to add a firmware memmap entry.
  * @start: Start of the memory range.
@@ -118,11 +160,25 @@ static int firmware_map_add_entry(u64 start, u64 end,
 	INIT_LIST_HEAD(&entry->list);
 	kobject_init(&entry->kobj, &memmap_ktype);
 
+	spin_lock(&map_entries_lock);
 	list_add_tail(&entry->list, &map_entries);
+	spin_unlock(&map_entries_lock);
 
 	return 0;
 }
 
+/**
+ * firmware_map_remove_entry() - Does the real work to remove a firmware
+ * memmap entry.
+ * @entry: removed entry.
+ *
+ * The caller must hold map_entries_lock, and release it properly.
+ **/
+static inline void firmware_map_remove_entry(struct firmware_map_entry *entry)
+{
+	list_del(&entry->list);
+}
+
 /*
  * Add memmap entry on sysfs
  */
@@ -144,6 +200,78 @@ static int add_sysfs_fw_map_entry(struct firmware_map_entry *entry)
 	return 0;
 }
 
+/*
+ * Remove memmap entry on sysfs
+ */
+static inline void remove_sysfs_fw_map_entry(struct firmware_map_entry *entry)
+{
+	kobject_put(&entry->kobj);
+}
+
+/*
+ * firmware_map_find_entry_in_list() - Search memmap entry in a given list.
+ * @start: Start of the memory range.
+ * @end:   End of the memory range (exclusive).
+ * @type:  Type of the memory range.
+ * @list:  In which to find the entry.
+ *
+ * This function is to find the memmap entry of a given memory range in a
+ * given list. The caller must hold map_entries_lock, and must not release
+ * the lock until the processing of the returned entry has completed.
+ *
+ * Return: Pointer to the entry to be found on success, or NULL on failure.
+ */
+static struct firmware_map_entry * __meminit
+firmware_map_find_entry_in_list(u64 start, u64 end, const char *type,
+				struct list_head *list)
+{
+	struct firmware_map_entry *entry;
+
+	list_for_each_entry(entry, list, list)
+		if ((entry->start == start) && (entry->end == end) &&
+		    (!strcmp(entry->type, type))) {
+			return entry;
+		}
+
+	return NULL;
+}
+
+/*
+ * firmware_map_find_entry() - Search memmap entry in map_entries.
+ * @start: Start of the memory range.
+ * @end:   End of the memory range (exclusive).
+ * @type:  Type of the memory range.
+ *
+ * This function is to find the memmap entry of a given memory range.
+ * The caller must hold map_entries_lock, and must not release the lock
+ * until the processing of the returned entry has completed.
+ *
+ * Return: Pointer to the entry to be found on success, or NULL on failure.
+ */
+static struct firmware_map_entry * __meminit
+firmware_map_find_entry(u64 start, u64 end, const char *type)
+{
+	return firmware_map_find_entry_in_list(start, end, type, &map_entries);
+}
+
+/*
+ * firmware_map_find_entry_bootmem() - Search memmap entry in map_entries_bootmem.
+ * @start: Start of the memory range.
+ * @end:   End of the memory range (exclusive).
+ * @type:  Type of the memory range.
+ *
+ * This function is similar to firmware_map_find_entry except that it finds the
+ * given entry in map_entries_bootmem.
+ *
+ * Return: Pointer to the entry to be found on success, or NULL on failure.
+ */
+static struct firmware_map_entry * __meminit
+firmware_map_find_entry_bootmem(u64 start, u64 end, const char *type)
+{
+	return firmware_map_find_entry_in_list(start, end, type,
+					       &map_entries_bootmem);
+}
+
 /**
  * firmware_map_add_hotplug() - Adds a firmware mapping entry when we do
  * memory hotplug.
@@ -161,9 +289,19 @@ int __meminit firmware_map_add_hotplug(u64 start, u64 end, const char *type)
 {
 	struct firmware_map_entry *entry;
 
-	entry = kzalloc(sizeof(struct firmware_map_entry), GFP_ATOMIC);
-	if (!entry)
-		return -ENOMEM;
+	entry = firmware_map_find_entry_bootmem(start, end, type);
+	if (!entry) {
+		entry = kzalloc(sizeof(struct firmware_map_entry), GFP_ATOMIC);
+		if (!entry)
+			return -ENOMEM;
+	} else {
+		/* Reuse storage allocated by bootmem. */
+		spin_lock(&map_entries_bootmem_lock);
+		list_del(&entry->list);
+		spin_unlock(&map_entries_bootmem_lock);
+
+		memset(entry, 0, sizeof(*entry));
+	}
 
 	firmware_map_add_entry(start, end, type, entry);
 	/* create the memmap entry */
@@ -196,6 +334,36 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type)
 	return firmware_map_add_entry(start, end, type, entry);
 }
 
+/**
+ * firmware_map_remove() - remove a firmware mapping entry
+ * @start: Start of the memory range.
+ * @end:   End of the memory range.
+ * @type:  Type of the memory range.
+ *
+ * removes a firmware mapping entry.
+ *
+ * Returns 0 on success, or -EINVAL if no entry.
+ **/
+int __meminit firmware_map_remove(u64 start, u64 end, const char *type)
+{
+	struct firmware_map_entry *entry;
+
+	spin_lock(&map_entries_lock);
+	entry = firmware_map_find_entry(start, end - 1, type);
+	if (!entry) {
+		spin_unlock(&map_entries_lock);
+		return -EINVAL;
+	}
+
+	firmware_map_remove_entry(entry);
+	spin_unlock(&map_entries_lock);
+
+	/* remove the memmap entry */
+	remove_sysfs_fw_map_entry(entry);
+
+	return 0;
+}
+
 /*
  * Sysfs functions -------------------------------------------------------------
  */
@@ -217,8 +385,10 @@ static ssize_t type_show(struct firmware_map_entry *entry, char *buf)
 	return snprintf(buf, PAGE_SIZE, "%s\n", entry->type);
 }
 
-#define to_memmap_attr(_attr) container_of(_attr, struct memmap_attribute, attr)
-#define to_memmap_entry(obj) container_of(obj, struct firmware_map_entry, kobj)
+static inline struct memmap_attribute *to_memmap_attr(struct attribute *attr)
+{
+	return container_of(attr, struct memmap_attribute, attr);
+}
 
 static ssize_t memmap_attr_show(struct kobject *kobj,
 				struct attribute *attr, char *buf)

+ 7 - 7
drivers/md/persistent-data/dm-transaction-manager.c

@@ -25,8 +25,8 @@ struct shadow_info {
 /*
  * It would be nice if we scaled with the size of transaction.
  */
-#define HASH_SIZE 256
-#define HASH_MASK (HASH_SIZE - 1)
+#define DM_HASH_SIZE 256
+#define DM_HASH_MASK (DM_HASH_SIZE - 1)
 
 struct dm_transaction_manager {
 	int is_clone;
@@ -36,7 +36,7 @@ struct dm_transaction_manager {
 	struct dm_space_map *sm;
 
 	spinlock_t lock;
-	struct hlist_head buckets[HASH_SIZE];
+	struct hlist_head buckets[DM_HASH_SIZE];
 };
 
 /*----------------------------------------------------------------*/
@@ -44,7 +44,7 @@ struct dm_transaction_manager {
 static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b)
 {
 	int r = 0;
-	unsigned bucket = dm_hash_block(b, HASH_MASK);
+	unsigned bucket = dm_hash_block(b, DM_HASH_MASK);
 	struct shadow_info *si;
 	struct hlist_node *n;
 
@@ -71,7 +71,7 @@ static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b)
 	si = kmalloc(sizeof(*si), GFP_NOIO);
 	if (si) {
 		si->where = b;
-		bucket = dm_hash_block(b, HASH_MASK);
+		bucket = dm_hash_block(b, DM_HASH_MASK);
 		spin_lock(&tm->lock);
 		hlist_add_head(&si->hlist, tm->buckets + bucket);
 		spin_unlock(&tm->lock);
@@ -86,7 +86,7 @@ static void wipe_shadow_table(struct dm_transaction_manager *tm)
 	int i;
 
 	spin_lock(&tm->lock);
-	for (i = 0; i < HASH_SIZE; i++) {
+	for (i = 0; i < DM_HASH_SIZE; i++) {
 		bucket = tm->buckets + i;
 		hlist_for_each_entry_safe(si, n, tmp, bucket, hlist)
 			kfree(si);
@@ -115,7 +115,7 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm,
 	tm->sm = sm;
 
 	spin_lock_init(&tm->lock);
-	for (i = 0; i < HASH_SIZE; i++)
+	for (i = 0; i < DM_HASH_SIZE; i++)
 		INIT_HLIST_HEAD(tm->buckets + i);
 
 	return tm;

+ 1 - 1
drivers/staging/zcache/zbud.c

@@ -404,7 +404,7 @@ static inline struct page *zbud_unuse_zbudpage(struct zbudpage *zbudpage,
 	else
 		zbud_pers_pageframes--;
 	zbudpage_spin_unlock(zbudpage);
-	reset_page_mapcount(page);
+	page_mapcount_reset(page);
 	init_page_count(page);
 	page->index = 0;
 	return page;

+ 1 - 1
drivers/staging/zsmalloc/zsmalloc-main.c

@@ -472,7 +472,7 @@ static void reset_page(struct page *page)
 	set_page_private(page, 0);
 	page->mapping = NULL;
 	page->freelist = NULL;
-	reset_page_mapcount(page);
+	page_mapcount_reset(page);
 }
 
 static void free_zspage(struct page *first_page)

+ 13 - 0
drivers/usb/core/hub.c

@@ -5177,6 +5177,7 @@ int usb_reset_device(struct usb_device *udev)
 {
 	int ret;
 	int i;
+	unsigned int noio_flag;
 	struct usb_host_config *config = udev->actconfig;
 
 	if (udev->state == USB_STATE_NOTATTACHED ||
@@ -5186,6 +5187,17 @@ int usb_reset_device(struct usb_device *udev)
 		return -EINVAL;
 	}
 
+	/*
+	 * Don't allocate memory with GFP_KERNEL in current
+	 * context to avoid possible deadlock if usb mass
+	 * storage interface or usbnet interface (iSCSI case)
+	 * is included in the current configuration. The easiest
+	 * approach is to do it for every device reset,
+	 * because the device 'memalloc_noio' flag may have
+	 * not been set before resetting the usb device.
+	 */
+	noio_flag = memalloc_noio_save();
+
 	/* Prevent autosuspend during the reset */
 	usb_autoresume_device(udev);
 
@@ -5230,6 +5242,7 @@ int usb_reset_device(struct usb_device *udev)
 	}
 
 	usb_autosuspend_device(udev);
+	memalloc_noio_restore(noio_flag);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(usb_reset_device);

+ 5 - 2
fs/aio.c

@@ -101,7 +101,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 	struct aio_ring *ring;
 	struct aio_ring_info *info = &ctx->ring_info;
 	unsigned nr_events = ctx->max_reqs;
-	unsigned long size;
+	unsigned long size, populate;
 	int nr_pages;
 
 	/* Compensate for the ring buffer's head/tail overlap entry */
@@ -129,7 +129,8 @@ static int aio_setup_ring(struct kioctx *ctx)
 	down_write(&ctx->mm->mmap_sem);
 	info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, 
 					PROT_READ|PROT_WRITE,
-					MAP_ANONYMOUS|MAP_PRIVATE, 0);
+					MAP_ANONYMOUS|MAP_PRIVATE, 0,
+					&populate);
 	if (IS_ERR((void *)info->mmap_base)) {
 		up_write(&ctx->mm->mmap_sem);
 		info->mmap_size = 0;
@@ -147,6 +148,8 @@ static int aio_setup_ring(struct kioctx *ctx)
 		aio_free_ring(ctx);
 		return -EAGAIN;
 	}
+	if (populate)
+		mm_populate(info->mmap_base, populate);
 
 	ctx->user_id = info->mmap_base;
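
As a side note on the convention the hunk above adopts: do_mmap_pgoff() now reports, through the new 'populate' argument, how much of the mapping still needs to be faulted in, and the caller runs mm_populate() afterwards, outside mmap_sem (the mm.h hunk further down carries the prototype change). A rough userspace analogue of that "map first, populate later" split, purely illustrative and not the kernel API:

/* Illustrative sketch: create a mapping, then fault its pages in afterwards,
 * mirroring the do_mmap_pgoff()+mm_populate() split (not the kernel API). */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 16 * 4096;
	long page = sysconf(_SC_PAGESIZE);

	/* Step 1: create the mapping; nothing is faulted in yet. */
	unsigned char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Step 2: "populate" the range by touching each page, the way
	 * mm_populate() faults in a freshly created mapping. */
	for (size_t off = 0; off < len; off += page)
		p[off] = 0;

	printf("mapped and touched %zu bytes\n", len);
	munmap(p, len);
	return 0;
}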
 

+ 2 - 2
fs/buffer.c

@@ -3227,7 +3227,7 @@ static struct kmem_cache *bh_cachep __read_mostly;
  * Once the number of bh's in the machine exceeds this level, we start
  * stripping them in writeback.
  */
-static int max_buffer_heads;
+static unsigned long max_buffer_heads;
 
 int buffer_heads_over_limit;
 
@@ -3343,7 +3343,7 @@ EXPORT_SYMBOL(bh_submit_read);
 
 void __init buffer_init(void)
 {
-	int nrpages;
+	unsigned long nrpages;
 
 	bh_cachep = kmem_cache_create("buffer_head",
 			sizeof(struct buffer_head), 0,

+ 3 - 3
fs/nfsd/nfs4state.c

@@ -151,7 +151,7 @@ get_nfs4_file(struct nfs4_file *fi)
 }
 
 static int num_delegations;
-unsigned int max_delegations;
+unsigned long max_delegations;
 
 /*
  * Open owner state (share locks)
@@ -700,8 +700,8 @@ static int nfsd4_get_drc_mem(int slotsize, u32 num)
 	num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION);
 
 	spin_lock(&nfsd_drc_lock);
-	avail = min_t(int, NFSD_MAX_MEM_PER_SESSION,
-			nfsd_drc_max_mem - nfsd_drc_mem_used);
+	avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION,
+		    nfsd_drc_max_mem - nfsd_drc_mem_used);
 	num = min_t(int, num, avail / slotsize);
 	nfsd_drc_mem_used += num * slotsize;
 	spin_unlock(&nfsd_drc_lock);

+ 3 - 3
fs/nfsd/nfsd.h

@@ -56,8 +56,8 @@ extern struct svc_version	nfsd_version2, nfsd_version3,
 extern u32			nfsd_supported_minorversion;
 extern struct mutex		nfsd_mutex;
 extern spinlock_t		nfsd_drc_lock;
-extern unsigned int		nfsd_drc_max_mem;
-extern unsigned int		nfsd_drc_mem_used;
+extern unsigned long		nfsd_drc_max_mem;
+extern unsigned long		nfsd_drc_mem_used;
 
 extern const struct seq_operations nfs_exports_op;
 
@@ -106,7 +106,7 @@ static inline int nfsd_v4client(struct svc_rqst *rq)
  * NFSv4 State
  */
 #ifdef CONFIG_NFSD_V4
-extern unsigned int max_delegations;
+extern unsigned long max_delegations;
 void nfs4_state_init(void);
 int nfsd4_init_slabs(void);
 void nfsd4_free_slabs(void);

+ 3 - 3
fs/nfsd/nfssvc.c

@@ -59,8 +59,8 @@ DEFINE_MUTEX(nfsd_mutex);
  * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage.
  */
 spinlock_t	nfsd_drc_lock;
-unsigned int	nfsd_drc_max_mem;
-unsigned int	nfsd_drc_mem_used;
+unsigned long	nfsd_drc_max_mem;
+unsigned long	nfsd_drc_mem_used;
 
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
 static struct svc_stat	nfsd_acl_svcstats;
@@ -342,7 +342,7 @@ static void set_max_drc(void)
 					>> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE;
 	nfsd_drc_mem_used = 0;
 	spin_lock_init(&nfsd_drc_lock);
-	dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem);
+	dprintk("%s nfsd_drc_max_mem %lu \n", __func__, nfsd_drc_max_mem);
 }
 
 static int nfsd_get_default_max_blksize(void)

+ 3 - 3
fs/proc/meminfo.c

@@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		* sysctl_overcommit_ratio / 100) + total_swap_pages;
 
 	cached = global_page_state(NR_FILE_PAGES) -
-			total_swapcache_pages - i.bufferram;
+			total_swapcache_pages() - i.bufferram;
 	if (cached < 0)
 		cached = 0;
 
@@ -109,7 +109,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		K(i.freeram),
 		K(i.bufferram),
 		K(cached),
-		K(total_swapcache_pages),
+		K(total_swapcache_pages()),
 		K(pages[LRU_ACTIVE_ANON]   + pages[LRU_ACTIVE_FILE]),
 		K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
 		K(pages[LRU_ACTIVE_ANON]),
@@ -158,7 +158,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		vmi.used >> 10,
 		vmi.largest_chunk >> 10
 #ifdef CONFIG_MEMORY_FAILURE
-		,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
+		,atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
 #endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 		,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *

+ 8 - 0
include/linux/acpi.h

@@ -485,6 +485,14 @@ static inline bool acpi_driver_match_device(struct device *dev,
 
 #endif	/* !CONFIG_ACPI */
 
+#ifdef CONFIG_ACPI_NUMA
+void __init early_parse_srat(void);
+#else
+static inline void early_parse_srat(void)
+{
+}
+#endif
+
 #ifdef CONFIG_ACPI
 void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state,
 			       u32 pm1a_ctrl,  u32 pm1b_ctrl));

+ 1 - 0
include/linux/bootmem.h

@@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t *pgdat,
 			      unsigned long size);
 extern void free_bootmem(unsigned long physaddr, unsigned long size);
 extern void free_bootmem_late(unsigned long physaddr, unsigned long size);
+extern void __free_pages_bootmem(struct page *page, unsigned int order);
 
 /*
  * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,

+ 2 - 3
include/linux/compaction.h

@@ -23,7 +23,7 @@ extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *mask,
 			bool sync, bool *contended);
-extern int compact_pgdat(pg_data_t *pgdat, int order);
+extern void compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern unsigned long compaction_suitable(struct zone *zone, int order);
 
@@ -80,9 +80,8 @@ static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
 	return COMPACT_CONTINUE;
 }
 
-static inline int compact_pgdat(pg_data_t *pgdat, int order)
+static inline void compact_pgdat(pg_data_t *pgdat, int order)
 {
-	return COMPACT_CONTINUE;
 }
 
 static inline void reset_isolation_suitable(pg_data_t *pgdat)

+ 6 - 0
include/linux/firmware-map.h

@@ -25,6 +25,7 @@
 
 int firmware_map_add_early(u64 start, u64 end, const char *type);
 int firmware_map_add_hotplug(u64 start, u64 end, const char *type);
+int firmware_map_remove(u64 start, u64 end, const char *type);
 
 #else /* CONFIG_FIRMWARE_MEMMAP */
 
@@ -38,6 +39,11 @@ static inline int firmware_map_add_hotplug(u64 start, u64 end, const char *type)
 	return 0;
 }
 
+static inline int firmware_map_remove(u64 start, u64 end, const char *type)
+{
+	return 0;
+}
+
 #endif /* CONFIG_FIRMWARE_MEMMAP */
 
 #endif /* _LINUX_FIRMWARE_MAP_H */

+ 0 - 6
include/linux/highmem.h

@@ -219,12 +219,6 @@ static inline void zero_user(struct page *page,
 	zero_user_segments(page, start, start + size, 0, 0);
 }
 
-static inline void __deprecated memclear_highpage_flush(struct page *page,
-			unsigned int offset, unsigned int size)
-{
-	zero_user(page, offset, size);
-}
-
 #ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE
 
 static inline void copy_user_highpage(struct page *to, struct page *from,

+ 1 - 1
include/linux/huge_mm.h

@@ -113,7 +113,7 @@ extern void __split_huge_page_pmd(struct vm_area_struct *vma,
 	do {								\
 		pmd_t *____pmd = (__pmd);				\
 		anon_vma_lock_write(__anon_vma);			\
-		anon_vma_unlock(__anon_vma);				\
+		anon_vma_unlock_write(__anon_vma);			\
 		BUG_ON(pmd_trans_splitting(*____pmd) ||			\
 		       pmd_trans_huge(*____pmd));			\
 	} while (0)

+ 3 - 3
include/linux/hugetlb.h

@@ -43,9 +43,9 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int,
 #endif
 
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
-int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
-			struct page **, struct vm_area_struct **,
-			unsigned long *, int *, int, unsigned int flags);
+long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
+			 struct page **, struct vm_area_struct **,
+			 unsigned long *, unsigned long *, long, unsigned int);
 void unmap_hugepage_range(struct vm_area_struct *,
 			  unsigned long, unsigned long, struct page *);
 void __unmap_hugepage_range_final(struct mmu_gather *tlb,

+ 4 - 14
include/linux/ksm.h

@@ -16,9 +16,6 @@
 struct stable_node;
 struct mem_cgroup;
 
-struct page *ksm_does_need_to_copy(struct page *page,
-			struct vm_area_struct *vma, unsigned long address);
-
 #ifdef CONFIG_KSM
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags);
@@ -73,15 +70,8 @@ static inline void set_page_stable_node(struct page *page,
  * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE,
  * but what if the vma was unmerged while the page was swapped out?
  */
-static inline int ksm_might_need_to_copy(struct page *page,
-			struct vm_area_struct *vma, unsigned long address)
-{
-	struct anon_vma *anon_vma = page_anon_vma(page);
-
-	return anon_vma &&
-		(anon_vma->root != vma->anon_vma->root ||
-		 page->index != linear_page_index(vma, address));
-}
+struct page *ksm_might_need_to_copy(struct page *page,
+			struct vm_area_struct *vma, unsigned long address);
 
 int page_referenced_ksm(struct page *page,
 			struct mem_cgroup *memcg, unsigned long *vm_flags);
@@ -113,10 +103,10 @@ static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 	return 0;
 }
 
-static inline int ksm_might_need_to_copy(struct page *page,
+static inline struct page *ksm_might_need_to_copy(struct page *page,
 			struct vm_area_struct *vma, unsigned long address)
 {
-	return 0;
+	return page;
 }
 
 static inline int page_referenced_ksm(struct page *page,

+ 2 - 0
include/linux/memblock.h

@@ -42,6 +42,7 @@ struct memblock {
 
 extern struct memblock memblock;
 extern int memblock_debug;
+extern struct movablemem_map movablemem_map;
 
 #define memblock_dbg(fmt, ...) \
 	if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
@@ -60,6 +61,7 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size);
 void memblock_trim_memory(phys_addr_t align);
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+
 void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
 			  unsigned long *out_end_pfn, int *out_nid);
 

+ 0 - 7
include/linux/memcontrol.h

@@ -116,7 +116,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
  * For memory reclaim.
  */
 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec);
-int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec);
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
 void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
@@ -321,12 +320,6 @@ mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
 	return 1;
 }
 
-static inline int
-mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
-{
-	return 1;
-}
-
 static inline unsigned long
 mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {

+ 12 - 8
include/linux/memory_hotplug.h

@@ -96,6 +96,7 @@ extern void __online_page_free(struct page *page);
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
 extern bool is_pageblock_removable_nolock(struct page *page);
+extern int arch_remove_memory(u64 start, u64 size);
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
 /* reasonably generic interface to expand the physical pages in a zone  */
@@ -173,17 +174,16 @@ static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
 #endif /* CONFIG_NUMA */
 #endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
 
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
+#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
+extern void register_page_bootmem_info_node(struct pglist_data *pgdat);
+#else
 static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
 {
 }
-static inline void put_page_bootmem(struct page *page)
-{
-}
-#else
-extern void register_page_bootmem_info_node(struct pglist_data *pgdat);
-extern void put_page_bootmem(struct page *page);
 #endif
+extern void put_page_bootmem(struct page *page);
+extern void get_page_bootmem(unsigned long info, struct page *page,
+			     unsigned long type);
 
 /*
  * Lock for memory hotplug guarantees 1) all callbacks for memory hotplug
@@ -233,6 +233,7 @@ static inline void unlock_memory_hotplug(void) {}
 #ifdef CONFIG_MEMORY_HOTREMOVE
 
 extern int is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
+extern void try_offline_node(int nid);
 
 #else
 static inline int is_mem_section_removable(unsigned long pfn,
@@ -240,6 +241,8 @@ static inline int is_mem_section_removable(unsigned long pfn,
 {
 	return 0;
 }
+
+static inline void try_offline_node(int nid) {}
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
 extern int mem_online_node(int nid);
@@ -247,7 +250,8 @@ extern int add_memory(int nid, u64 start, u64 size);
 extern int arch_add_memory(int nid, u64 start, u64 size);
 extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
 extern int offline_memory_block(struct memory_block *mem);
-extern int remove_memory(u64 start, u64 size);
+extern bool is_memblock_offlined(struct memory_block *mem);
+extern int remove_memory(int nid, u64 start, u64 size);
 extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
 								int nr_pages);
 extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms);

+ 6 - 8
include/linux/migrate.h

@@ -40,11 +40,9 @@ extern void putback_movable_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
 extern int migrate_pages(struct list_head *l, new_page_t x,
-			unsigned long private, bool offlining,
-			enum migrate_mode mode, int reason);
+		unsigned long private, enum migrate_mode mode, int reason);
 extern int migrate_huge_page(struct page *, new_page_t x,
-			unsigned long private, bool offlining,
-			enum migrate_mode mode);
+		unsigned long private, enum migrate_mode mode);
 
 extern int fail_migrate_page(struct address_space *,
 			struct page *, struct page *);
@@ -62,11 +60,11 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 static inline void putback_lru_pages(struct list_head *l) {}
 static inline void putback_movable_pages(struct list_head *l) {}
 static inline int migrate_pages(struct list_head *l, new_page_t x,
-		unsigned long private, bool offlining,
-		enum migrate_mode mode, int reason) { return -ENOSYS; }
+		unsigned long private, enum migrate_mode mode, int reason)
+	{ return -ENOSYS; }
 static inline int migrate_huge_page(struct page *page, new_page_t x,
-		unsigned long private, bool offlining,
-		enum migrate_mode mode) { return -ENOSYS; }
+		unsigned long private, enum migrate_mode mode)
+	{ return -ENOSYS; }
 
 static inline int migrate_prep(void) { return -ENOSYS; }
 static inline int migrate_prep_local(void) { return -ENOSYS; }

+ 99 - 79
include/linux/mm.h

@@ -87,6 +87,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_PFNMAP	0x00000400	/* Page-ranges managed without "struct page", just pure PFN */
 #define VM_DENYWRITE	0x00000800	/* ETXTBSY on write attempts.. */
 
+#define VM_POPULATE     0x00001000
 #define VM_LOCKED	0x00002000
 #define VM_IO           0x00004000	/* Memory mapped I/O or similar */
 
@@ -366,7 +367,7 @@ static inline struct page *compound_head(struct page *page)
  * both from it and to it can be tracked, using atomic_inc_and_test
  * and atomic_add_negative(-1).
  */
-static inline void reset_page_mapcount(struct page *page)
+static inline void page_mapcount_reset(struct page *page)
 {
 	atomic_set(&(page)->_mapcount, -1);
 }
@@ -580,50 +581,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
  * sets it, so none of the operations on it need to be atomic.
  */
 
-
-/*
- * page->flags layout:
- *
- * There are three possibilities for how page->flags get
- * laid out.  The first is for the normal case, without
- * sparsemem.  The second is for sparsemem when there is
- * plenty of space for node and section.  The last is when
- * we have run out of space and have to fall back to an
- * alternate (slower) way of determining the node.
- *
- * No sparsemem or sparsemem vmemmap: |       NODE     | ZONE | ... | FLAGS |
- * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS |
- * classic sparse no space for node:  | SECTION |     ZONE    | ... | FLAGS |
- */
-#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
-#define SECTIONS_WIDTH		SECTIONS_SHIFT
-#else
-#define SECTIONS_WIDTH		0
-#endif
-
-#define ZONES_WIDTH		ZONES_SHIFT
-
-#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
-#define NODES_WIDTH		NODES_SHIFT
-#else
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-#error "Vmemmap: No space for nodes field in page flags"
-#endif
-#define NODES_WIDTH		0
-#endif
-
-/* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */
+/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NID] | ... | FLAGS | */
 #define SECTIONS_PGOFF		((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
 #define NODES_PGOFF		(SECTIONS_PGOFF - NODES_WIDTH)
 #define ZONES_PGOFF		(NODES_PGOFF - ZONES_WIDTH)
-
-/*
- * We are going to use the flags for the page to node mapping if its in
- * there.  This includes the case where there is no node, so it is implicit.
- */
-#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0)
-#define NODE_NOT_IN_PAGE_FLAGS
-#endif
+#define LAST_NID_PGOFF		(ZONES_PGOFF - LAST_NID_WIDTH)
 
 /*
  * Define the bit shifts to access each section.  For non-existent
@@ -633,6 +595,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 #define SECTIONS_PGSHIFT	(SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
 #define NODES_PGSHIFT		(NODES_PGOFF * (NODES_WIDTH != 0))
 #define ZONES_PGSHIFT		(ZONES_PGOFF * (ZONES_WIDTH != 0))
+#define LAST_NID_PGSHIFT	(LAST_NID_PGOFF * (LAST_NID_WIDTH != 0))
 
 /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
 #ifdef NODE_NOT_IN_PAGE_FLAGS
@@ -654,6 +617,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 #define ZONES_MASK		((1UL << ZONES_WIDTH) - 1)
 #define NODES_MASK		((1UL << NODES_WIDTH) - 1)
 #define SECTIONS_MASK		((1UL << SECTIONS_WIDTH) - 1)
+#define LAST_NID_MASK		((1UL << LAST_NID_WIDTH) - 1)
 #define ZONEID_MASK		((1UL << ZONEID_SHIFT) - 1)
 
 static inline enum zone_type page_zonenum(const struct page *page)
@@ -661,6 +625,10 @@ static inline enum zone_type page_zonenum(const struct page *page)
 	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
 }
 
+#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
+#define SECTION_IN_PAGE_FLAGS
+#endif
+
 /*
  * The identification function is only used by the buddy allocator for
  * determining if two pages could be buddies. We are not really
@@ -693,31 +661,48 @@ static inline int page_to_nid(const struct page *page)
 #endif
 
 #ifdef CONFIG_NUMA_BALANCING
-static inline int page_xchg_last_nid(struct page *page, int nid)
+#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
+static inline int page_nid_xchg_last(struct page *page, int nid)
 {
 	return xchg(&page->_last_nid, nid);
 }
 
-static inline int page_last_nid(struct page *page)
+static inline int page_nid_last(struct page *page)
 {
 	return page->_last_nid;
 }
-static inline void reset_page_last_nid(struct page *page)
+static inline void page_nid_reset_last(struct page *page)
 {
 	page->_last_nid = -1;
 }
 #else
-static inline int page_xchg_last_nid(struct page *page, int nid)
+static inline int page_nid_last(struct page *page)
+{
+	return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK;
+}
+
+extern int page_nid_xchg_last(struct page *page, int nid);
+
+static inline void page_nid_reset_last(struct page *page)
+{
+	int nid = (1 << LAST_NID_SHIFT) - 1;
+
+	page->flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
+	page->flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
+}
+#endif /* LAST_NID_NOT_IN_PAGE_FLAGS */
+#else
+static inline int page_nid_xchg_last(struct page *page, int nid)
 {
 	return page_to_nid(page);
 }
 
-static inline int page_last_nid(struct page *page)
+static inline int page_nid_last(struct page *page)
 {
 	return page_to_nid(page);
 }
 
-static inline void reset_page_last_nid(struct page *page)
+static inline void page_nid_reset_last(struct page *page)
 {
 }
 #endif
@@ -727,7 +712,7 @@ static inline struct zone *page_zone(const struct page *page)
 	return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
 }
 
-#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
+#ifdef SECTION_IN_PAGE_FLAGS
 static inline void set_page_section(struct page *page, unsigned long section)
 {
 	page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
@@ -757,7 +742,7 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
 {
 	set_page_zone(page, zone);
 	set_page_node(page, node);
-#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
+#ifdef SECTION_IN_PAGE_FLAGS
 	set_page_section(page, pfn_to_section_nr(pfn));
 #endif
 }
@@ -817,18 +802,7 @@ void page_address_init(void);
 #define PAGE_MAPPING_KSM	2
 #define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
 
-extern struct address_space swapper_space;
-static inline struct address_space *page_mapping(struct page *page)
-{
-	struct address_space *mapping = page->mapping;
-
-	VM_BUG_ON(PageSlab(page));
-	if (unlikely(PageSwapCache(page)))
-		mapping = &swapper_space;
-	else if ((unsigned long)mapping & PAGE_MAPPING_ANON)
-		mapping = NULL;
-	return mapping;
-}
+extern struct address_space *page_mapping(struct page *page);
 
 /* Neutral page->mapping pointer to address_space or anon_vma or other */
 static inline void *page_rmapping(struct page *page)
@@ -1035,18 +1009,18 @@ static inline int fixup_user_fault(struct task_struct *tsk,
 }
 #endif
 
-extern int make_pages_present(unsigned long addr, unsigned long end);
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
 		void *buf, int len, int write);
 
-int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		     unsigned long start, int len, unsigned int foll_flags,
-		     struct page **pages, struct vm_area_struct **vmas,
-		     int *nonblocking);
-int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-			unsigned long start, int nr_pages, int write, int force,
-			struct page **pages, struct vm_area_struct **vmas);
+long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		      unsigned long start, unsigned long nr_pages,
+		      unsigned int foll_flags, struct page **pages,
+		      struct vm_area_struct **vmas, int *nonblocking);
+long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		    unsigned long start, unsigned long nr_pages,
+		    int write, int force, struct page **pages,
+		    struct vm_area_struct **vmas);
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			struct page **pages);
 struct kvec;
@@ -1359,6 +1333,24 @@ extern void free_bootmem_with_active_regions(int nid,
 						unsigned long max_low_pfn);
 extern void sparse_memory_present_with_active_regions(int nid);
 
+#define MOVABLEMEM_MAP_MAX MAX_NUMNODES
+struct movablemem_entry {
+	unsigned long start_pfn;    /* start pfn of memory segment */
+	unsigned long end_pfn;      /* end pfn of memory segment (exclusive) */
+};
+
+struct movablemem_map {
+	bool acpi;	/* true if using SRAT info */
+	int nr_map;
+	struct movablemem_entry map[MOVABLEMEM_MAP_MAX];
+	nodemask_t numa_nodes_hotplug;	/* on which nodes we specify memory */
+	nodemask_t numa_nodes_kernel;	/* on which nodes kernel resides in */
+};
+
+extern void __init insert_movablemem_map(unsigned long start_pfn,
+					 unsigned long end_pfn);
+extern int __init movablemem_map_overlap(unsigned long start_pfn,
+					 unsigned long end_pfn);
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \
@@ -1395,6 +1387,9 @@ extern void setup_per_cpu_pageset(void);
 extern void zone_pcp_update(struct zone *zone);
 extern void zone_pcp_reset(struct zone *zone);
 
+/* page_alloc.c */
+extern int min_free_kbytes;
+
 /* nommu.c */
 extern atomic_long_t mmap_pages_allocated;
 extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);
@@ -1472,13 +1467,24 @@ extern int install_special_mapping(struct mm_struct *mm,
 extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
 
 extern unsigned long mmap_region(struct file *file, unsigned long addr,
-	unsigned long len, unsigned long flags,
-	vm_flags_t vm_flags, unsigned long pgoff);
-extern unsigned long do_mmap_pgoff(struct file *, unsigned long,
-        unsigned long, unsigned long,
-        unsigned long, unsigned long);
+	unsigned long len, vm_flags_t vm_flags, unsigned long pgoff);
+extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+	unsigned long len, unsigned long prot, unsigned long flags,
+	unsigned long pgoff, unsigned long *populate);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t);
 
+#ifdef CONFIG_MMU
+extern int __mm_populate(unsigned long addr, unsigned long len,
+			 int ignore_errors);
+static inline void mm_populate(unsigned long addr, unsigned long len)
+{
+	/* Ignore errors */
+	(void) __mm_populate(addr, len, 1);
+}
+#else
+static inline void mm_populate(unsigned long addr, unsigned long len) {}
+#endif
+
 /* These take the mm semaphore themselves */
 extern unsigned long vm_brk(unsigned long, unsigned long);
 extern int vm_munmap(unsigned long, size_t);
@@ -1623,8 +1629,17 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
 			unsigned long pfn);
 
-struct page *follow_page(struct vm_area_struct *, unsigned long address,
-			unsigned int foll_flags);
+struct page *follow_page_mask(struct vm_area_struct *vma,
+			      unsigned long address, unsigned int foll_flags,
+			      unsigned int *page_mask);
+
+static inline struct page *follow_page(struct vm_area_struct *vma,
+		unsigned long address, unsigned int foll_flags)
+{
+	unsigned int unused_page_mask;
+	return follow_page_mask(vma, address, foll_flags, &unused_page_mask);
+}
+
 #define FOLL_WRITE	0x01	/* check pte is writable */
 #define FOLL_TOUCH	0x02	/* mark page accessed */
 #define FOLL_GET	0x04	/* do get_page on page */
@@ -1636,6 +1651,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
 #define FOLL_SPLIT	0x80	/* don't return transhuge pages, split them */
 #define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */
 #define FOLL_NUMA	0x200	/* force NUMA hinting page fault */
+#define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
@@ -1707,7 +1723,11 @@ int vmemmap_populate_basepages(struct page *start_page,
 						unsigned long pages, int node);
 int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
 void vmemmap_populate_print_last(void);
-
+#ifdef CONFIG_MEMORY_HOTPLUG
+void vmemmap_free(struct page *memmap, unsigned long nr_pages);
+#endif
+void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
+				  unsigned long size);
 
 enum mf_flags {
 	MF_COUNT_INCREASED = 1 << 0,
@@ -1720,7 +1740,7 @@ extern int unpoison_memory(unsigned long pfn);
 extern int sysctl_memory_failure_early_kill;
 extern int sysctl_memory_failure_recovery;
 extern void shake_page(struct page *p, int access);
-extern atomic_long_t mce_bad_pages;
+extern atomic_long_t num_poisoned_pages;
 extern int soft_offline_page(struct page *page, int flags);
 
 extern void dump_page(struct page *page);

+ 5 - 4
include/linux/mm_types.h

@@ -12,6 +12,7 @@
 #include <linux/cpumask.h>
 #include <linux/page-debug-flags.h>
 #include <linux/uprobes.h>
+#include <linux/page-flags-layout.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
 
@@ -173,7 +174,7 @@ struct page {
 	void *shadow;
 #endif
 
-#ifdef CONFIG_NUMA_BALANCING
+#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
 	int _last_nid;
 #endif
 }
@@ -414,9 +415,9 @@ struct mm_struct {
 #endif
 #ifdef CONFIG_NUMA_BALANCING
 	/*
-	 * numa_next_scan is the next time when the PTEs will me marked
-	 * pte_numa to gather statistics and migrate pages to new nodes
-	 * if necessary
+	 * numa_next_scan is the next time that the PTEs will be marked
+	 * pte_numa. NUMA hinting faults will gather statistics and migrate
+	 * pages to new nodes if necessary.
 	 */
 	unsigned long numa_next_scan;
 

+ 3 - 1
include/linux/mman.h

@@ -79,6 +79,8 @@ calc_vm_flag_bits(unsigned long flags)
 {
 	return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
 	       _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
-	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    );
+	       ((flags & MAP_LOCKED) ? (VM_LOCKED | VM_POPULATE) : 0) |
+	       (((flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE) ?
+							VM_POPULATE : 0);
 }
 #endif /* _LINUX_MMAN_H */
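
Since the flag translation above is easy to misread, here is a worked model of it: MAP_LOCKED now also yields VM_POPULATE, while MAP_POPULATE only does so when MAP_NONBLOCK is absent. The numeric values below are invented stand-ins rather than the real uapi constants; only the logic mirrors the hunk.

/* Worked model of the new calc_vm_flag_bits() population logic.
 * Flag values are illustrative, not the real MAP_ and VM_ constants. */
#include <stdio.h>

#define MAP_LOCKED	0x01
#define MAP_POPULATE	0x02
#define MAP_NONBLOCK	0x04

#define VM_LOCKED	0x10
#define VM_POPULATE	0x20

static unsigned long calc_vm_flags(unsigned long flags)
{
	return ((flags & MAP_LOCKED) ? (VM_LOCKED | VM_POPULATE) : 0) |
	       (((flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE) ?
							VM_POPULATE : 0);
}

int main(void)
{
	printf("MAP_LOCKED                -> %#lx\n", calc_vm_flags(MAP_LOCKED));
	printf("MAP_POPULATE              -> %#lx\n", calc_vm_flags(MAP_POPULATE));
	printf("MAP_POPULATE|MAP_NONBLOCK -> %#lx\n",
	       calc_vm_flags(MAP_POPULATE | MAP_NONBLOCK));
	return 0;
}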

+ 33 - 25
include/linux/mmzone.h

@@ -15,7 +15,7 @@
 #include <linux/seqlock.h>
 #include <linux/nodemask.h>
 #include <linux/pageblock-flags.h>
-#include <generated/bounds.h>
+#include <linux/page-flags-layout.h>
 #include <linux/atomic.h>
 #include <asm/page.h>
 
@@ -57,7 +57,9 @@ enum {
 	 */
 	MIGRATE_CMA,
 #endif
+#ifdef CONFIG_MEMORY_ISOLATION
 	MIGRATE_ISOLATE,	/* can't allocate from here */
+#endif
 	MIGRATE_TYPES
 };
 
@@ -308,24 +310,6 @@ enum zone_type {
 
 #ifndef __GENERATING_BOUNDS_H
 
-/*
- * When a memory allocation must conform to specific limitations (such
- * as being suitable for DMA) the caller will pass in hints to the
- * allocator in the gfp_mask, in the zone modifier bits.  These bits
- * are used to select a priority ordered list of memory zones which
- * match the requested limits. See gfp_zone() in include/linux/gfp.h
- */
-
-#if MAX_NR_ZONES < 2
-#define ZONES_SHIFT 0
-#elif MAX_NR_ZONES <= 2
-#define ZONES_SHIFT 1
-#elif MAX_NR_ZONES <= 4
-#define ZONES_SHIFT 2
-#else
-#error ZONES_SHIFT -- too many zones configured adjust calculation
-#endif
-
 struct zone {
 	/* Fields commonly accessed by the page allocator */
 
@@ -543,6 +527,26 @@ static inline int zone_is_oom_locked(const struct zone *zone)
 	return test_bit(ZONE_OOM_LOCKED, &zone->flags);
 }
 
+static inline unsigned long zone_end_pfn(const struct zone *zone)
+{
+	return zone->zone_start_pfn + zone->spanned_pages;
+}
+
+static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
+{
+	return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
+}
+
+static inline bool zone_is_initialized(struct zone *zone)
+{
+	return !!zone->wait_table;
+}
+
+static inline bool zone_is_empty(struct zone *zone)
+{
+	return zone->spanned_pages == 0;
+}
+
 /*
  * The "priority" of VM scanning is how much of the queues we will scan in one
  * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
@@ -752,11 +756,17 @@ typedef struct pglist_data {
 #define nid_page_nr(nid, pagenr) 	pgdat_page_nr(NODE_DATA(nid),(pagenr))
 
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
+#define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))
 
-#define node_end_pfn(nid) ({\
-	pg_data_t *__pgdat = NODE_DATA(nid);\
-	__pgdat->node_start_pfn + __pgdat->node_spanned_pages;\
-})
+static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
+{
+	return pgdat->node_start_pfn + pgdat->node_spanned_pages;
+}
+
+static inline bool pgdat_is_empty(pg_data_t *pgdat)
+{
+	return !pgdat->node_start_pfn && !pgdat->node_spanned_pages;
+}
 
 #include <linux/memory_hotplug.h>
 
@@ -1053,8 +1063,6 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn)
  * PA_SECTION_SHIFT		physical address to/from section number
  * PFN_SECTION_SHIFT		pfn to/from section number
  */
-#define SECTIONS_SHIFT		(MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
-
 #define PA_SECTION_SHIFT	(SECTION_SIZE_BITS)
 #define PFN_SECTION_SHIFT	(SECTION_SIZE_BITS - PAGE_SHIFT)
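
The zone_end_pfn()/zone_spans_pfn() helpers introduced above replace open-coded "zone_start_pfn + spanned_pages" arithmetic at the call sites (the compaction.c hunks below are one example). A stand-alone model, with the struct reduced to the two fields these helpers actually touch and made-up pfn values:

/* Minimal model of zone_end_pfn()/zone_spans_pfn(); "struct zone" here is
 * trimmed to the fields the helpers use. */
#include <stdbool.h>
#include <stdio.h>

struct zone {
	unsigned long zone_start_pfn;
	unsigned long spanned_pages;
};

static unsigned long zone_end_pfn(const struct zone *zone)
{
	return zone->zone_start_pfn + zone->spanned_pages;
}

static bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
{
	return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
}

int main(void)
{
	struct zone z = { .zone_start_pfn = 0x1000, .spanned_pages = 0x800 };

	printf("end pfn:      %#lx\n", zone_end_pfn(&z));            /* 0x1800 */
	printf("0x17ff spans: %d\n", zone_spans_pfn(&z, 0x17ff));    /* 1 */
	printf("0x1800 spans: %d\n", zone_spans_pfn(&z, 0x1800));    /* 0, end is exclusive */
	return 0;
}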
 

+ 88 - 0
include/linux/page-flags-layout.h

@@ -0,0 +1,88 @@
+#ifndef PAGE_FLAGS_LAYOUT_H
+#define PAGE_FLAGS_LAYOUT_H
+
+#include <linux/numa.h>
+#include <generated/bounds.h>
+
+/*
+ * When a memory allocation must conform to specific limitations (such
+ * as being suitable for DMA) the caller will pass in hints to the
+ * allocator in the gfp_mask, in the zone modifier bits.  These bits
+ * are used to select a priority ordered list of memory zones which
+ * match the requested limits. See gfp_zone() in include/linux/gfp.h
+ */
+#if MAX_NR_ZONES < 2
+#define ZONES_SHIFT 0
+#elif MAX_NR_ZONES <= 2
+#define ZONES_SHIFT 1
+#elif MAX_NR_ZONES <= 4
+#define ZONES_SHIFT 2
+#else
+#error ZONES_SHIFT -- too many zones configured adjust calculation
+#endif
+
+#ifdef CONFIG_SPARSEMEM
+#include <asm/sparsemem.h>
+
+/* SECTION_SHIFT	#bits space required to store a section # */
+#define SECTIONS_SHIFT	(MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
+
+#endif /* CONFIG_SPARSEMEM */
+
+/*
+ * page->flags layout:
+ *
+ * There are five possibilities for how page->flags get laid out.  The first
+ * pair is for the normal case without sparsemem. The second pair is for
+ * sparsemem when there is plenty of space for node and section information.
+ * The last is when there is insufficient space in page->flags and a separate
+ * lookup is necessary.
+ *
+ * No sparsemem or sparsemem vmemmap: |       NODE     | ZONE |          ... | FLAGS |
+ *         " plus space for last_nid: |       NODE     | ZONE | LAST_NID ... | FLAGS |
+ * classic sparse with space for node:| SECTION | NODE | ZONE |          ... | FLAGS |
+ *         " plus space for last_nid: | SECTION | NODE | ZONE | LAST_NID ... | FLAGS |
+ * classic sparse no space for node:  | SECTION |     ZONE    | ... | FLAGS |
+ */
+#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
+#define SECTIONS_WIDTH		SECTIONS_SHIFT
+#else
+#define SECTIONS_WIDTH		0
+#endif
+
+#define ZONES_WIDTH		ZONES_SHIFT
+
+#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+#define NODES_WIDTH		NODES_SHIFT
+#else
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+#error "Vmemmap: No space for nodes field in page flags"
+#endif
+#define NODES_WIDTH		0
+#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+#define LAST_NID_SHIFT NODES_SHIFT
+#else
+#define LAST_NID_SHIFT 0
+#endif
+
+#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+#define LAST_NID_WIDTH LAST_NID_SHIFT
+#else
+#define LAST_NID_WIDTH 0
+#endif
+
+/*
+ * We are going to use the flags for the page to node mapping if it's in
+ * there.  This includes the case where there is no node, so it is implicit.
+ */
+#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0)
+#define NODE_NOT_IN_PAGE_FLAGS
+#endif
+
+#if defined(CONFIG_NUMA_BALANCING) && LAST_NID_WIDTH == 0
+#define LAST_NID_NOT_IN_PAGE_FLAGS
+#endif
+
+#endif /* _LINUX_PAGE_FLAGS_LAYOUT */
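
As a worked illustration of the layout this header computes, the sketch below packs a section, node and zone number into the top of an unsigned long and recovers them with the same PGOFF/MASK shifting idiom mm.h uses. The field widths are invented for the demo and do not correspond to any real configuration.

/* Toy model of the | SECTION | NODE | ZONE | ... | FLAGS | packing.
 * Widths are arbitrary example values, not real kernel configuration. */
#include <stdio.h>

#define BITS_PER_LONG	(8 * (int)sizeof(unsigned long))
#define SECTIONS_WIDTH	8
#define NODES_WIDTH	6
#define ZONES_WIDTH	2

#define SECTIONS_PGOFF	(BITS_PER_LONG - SECTIONS_WIDTH)
#define NODES_PGOFF	(SECTIONS_PGOFF - NODES_WIDTH)
#define ZONES_PGOFF	(NODES_PGOFF - ZONES_WIDTH)

#define SECTIONS_MASK	((1UL << SECTIONS_WIDTH) - 1)
#define NODES_MASK	((1UL << NODES_WIDTH) - 1)
#define ZONES_MASK	((1UL << ZONES_WIDTH) - 1)

static unsigned long pack(unsigned long section, unsigned long node,
			  unsigned long zone, unsigned long low_flags)
{
	return (section << SECTIONS_PGOFF) | (node << NODES_PGOFF) |
	       (zone << ZONES_PGOFF) | low_flags;
}

int main(void)
{
	unsigned long flags = pack(42, 5, 3, 0x1);

	printf("section=%lu node=%lu zone=%lu\n",
	       (flags >> SECTIONS_PGOFF) & SECTIONS_MASK,
	       (flags >> NODES_PGOFF) & NODES_MASK,
	       (flags >> ZONES_PGOFF) & ZONES_MASK);
	return 0;
}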

+ 19 - 0
include/linux/page-isolation.h

@@ -1,6 +1,25 @@
 #ifndef __LINUX_PAGEISOLATION_H
 #define __LINUX_PAGEISOLATION_H
 
+#ifdef CONFIG_MEMORY_ISOLATION
+static inline bool is_migrate_isolate_page(struct page *page)
+{
+	return get_pageblock_migratetype(page) == MIGRATE_ISOLATE;
+}
+static inline bool is_migrate_isolate(int migratetype)
+{
+	return migratetype == MIGRATE_ISOLATE;
+}
+#else
+static inline bool is_migrate_isolate_page(struct page *page)
+{
+	return false;
+}
+static inline bool is_migrate_isolate(int migratetype)
+{
+	return false;
+}
+#endif
 
 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
 			 bool skip_hwpoisoned_pages);
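
The #ifdef pattern above, real predicates under CONFIG_MEMORY_ISOLATION and constant-false stubs otherwise, is what lets callers such as the compaction changes below test is_migrate_isolate() without carrying an #ifdef of their own. A minimal stand-alone model of the idiom, with EXAMPLE_ISOLATION standing in for the Kconfig symbol:

/* Model of the config-gated stub idiom. EXAMPLE_ISOLATION stands in for
 * CONFIG_MEMORY_ISOLATION; with it undefined, the helper is a compile-time
 * "false" and the caller's branch is dead code the compiler can drop. */
#include <stdbool.h>
#include <stdio.h>

enum { EXAMPLE_MOVABLE, EXAMPLE_RECLAIMABLE, EXAMPLE_ISOLATE };

/* #define EXAMPLE_ISOLATION 1 */

#ifdef EXAMPLE_ISOLATION
static inline bool is_isolate(int migratetype)
{
	return migratetype == EXAMPLE_ISOLATE;
}
#else
static inline bool is_isolate(int migratetype)
{
	(void)migratetype;
	return false;
}
#endif

int main(void)
{
	for (int mt = EXAMPLE_MOVABLE; mt <= EXAMPLE_ISOLATE; mt++)
		printf("migratetype %d isolated: %d\n", mt, is_isolate(mt));
	return 0;
}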

+ 1 - 0
include/linux/pm.h

@@ -537,6 +537,7 @@ struct dev_pm_info {
 	unsigned int		irq_safe:1;
 	unsigned int		use_autosuspend:1;
 	unsigned int		timer_autosuspends:1;
+	unsigned int		memalloc_noio:1;
 	enum rpm_request	request;
 	enum rpm_status		runtime_status;
 	int			runtime_error;

+ 3 - 0
include/linux/pm_runtime.h

@@ -47,6 +47,7 @@ extern void pm_runtime_set_autosuspend_delay(struct device *dev, int delay);
 extern unsigned long pm_runtime_autosuspend_expiration(struct device *dev);
 extern void pm_runtime_update_max_time_suspended(struct device *dev,
 						 s64 delta_ns);
+extern void pm_runtime_set_memalloc_noio(struct device *dev, bool enable);
 
 static inline bool pm_children_suspended(struct device *dev)
 {
@@ -156,6 +157,8 @@ static inline void pm_runtime_set_autosuspend_delay(struct device *dev,
 						int delay) {}
 static inline unsigned long pm_runtime_autosuspend_expiration(
 				struct device *dev) { return 0; }
+static inline void pm_runtime_set_memalloc_noio(struct device *dev,
+						bool enable){}
 
 #endif /* !CONFIG_PM_RUNTIME */
 

+ 1 - 1
include/linux/rmap.h

@@ -123,7 +123,7 @@ static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
 	down_write(&anon_vma->root->rwsem);
 }
 
-static inline void anon_vma_unlock(struct anon_vma *anon_vma)
+static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
 {
 	up_write(&anon_vma->root->rwsem);
 }

+ 22 - 0
include/linux/sched.h

@@ -51,6 +51,7 @@ struct sched_param {
 #include <linux/cred.h>
 #include <linux/llist.h>
 #include <linux/uidgid.h>
+#include <linux/gfp.h>
 
 #include <asm/processor.h>
 
@@ -1791,6 +1792,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
 #define PF_FROZEN	0x00010000	/* frozen for system suspend */
 #define PF_FSTRANS	0x00020000	/* inside a filesystem transaction */
 #define PF_KSWAPD	0x00040000	/* I am kswapd */
+#define PF_MEMALLOC_NOIO 0x00080000	/* Allocating memory without IO involved */
 #define PF_LESS_THROTTLE 0x00100000	/* Throttle me less: I clean memory */
 #define PF_KTHREAD	0x00200000	/* I am a kernel thread */
 #define PF_RANDOMIZE	0x00400000	/* randomize virtual address space */
@@ -1828,6 +1830,26 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
 #define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
 #define used_math() tsk_used_math(current)
 
+/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags */
+static inline gfp_t memalloc_noio_flags(gfp_t flags)
+{
+	if (unlikely(current->flags & PF_MEMALLOC_NOIO))
+		flags &= ~__GFP_IO;
+	return flags;
+}
+
+static inline unsigned int memalloc_noio_save(void)
+{
+	unsigned int flags = current->flags & PF_MEMALLOC_NOIO;
+	current->flags |= PF_MEMALLOC_NOIO;
+	return flags;
+}
+
+static inline void memalloc_noio_restore(unsigned int flags)
+{
+	current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
+}
+
 /*
  * task->jobctl flags
  */
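
The three helpers above implement a small save/set/restore protocol on current->flags: save() sets PF_MEMALLOC_NOIO and returns the previous value of the bit, restore() puts that value back, and memalloc_noio_flags() strips __GFP_IO while the bit is set, so sections nest correctly. A userspace model of that protocol follows; "task_flags" stands in for current->flags, PF_MEMALLOC_NOIO is the value from the hunk, and the __GFP_IO value is just an illustrative bit.

/* Userspace model of the PF_MEMALLOC_NOIO save/restore protocol above.
 * task_flags stands in for current->flags; __GFP_IO here is a stand-in bit. */
#include <assert.h>
#include <stdio.h>

#define PF_MEMALLOC_NOIO 0x00080000u
#define __GFP_IO	 0x40u

static unsigned int task_flags;

static unsigned int noio_save(void)
{
	unsigned int old = task_flags & PF_MEMALLOC_NOIO;

	task_flags |= PF_MEMALLOC_NOIO;
	return old;
}

static void noio_restore(unsigned int old)
{
	task_flags = (task_flags & ~PF_MEMALLOC_NOIO) | old;
}

static unsigned int noio_mask(unsigned int gfp)
{
	return (task_flags & PF_MEMALLOC_NOIO) ? gfp & ~__GFP_IO : gfp;
}

int main(void)
{
	unsigned int outer = noio_save();	/* enter a no-IO section */
	unsigned int inner = noio_save();	/* nested no-IO section */

	assert(noio_mask(__GFP_IO) == 0);	/* __GFP_IO stripped while set */
	noio_restore(inner);
	assert(task_flags & PF_MEMALLOC_NOIO);	/* outer section still no-IO */
	noio_restore(outer);
	assert(noio_mask(__GFP_IO) == __GFP_IO); /* back to normal */
	printf("nested save/restore behaves as expected\n");
	return 0;
}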

+ 36 - 13
include/linux/swap.h

@@ -8,7 +8,7 @@
 #include <linux/memcontrol.h>
 #include <linux/sched.h>
 #include <linux/node.h>
-
+#include <linux/fs.h>
 #include <linux/atomic.h>
 #include <asm/page.h>
 
@@ -156,7 +156,7 @@ enum {
 	SWP_SCANNING	= (1 << 8),	/* refcount in scan_swap_map */
 };
 
-#define SWAP_CLUSTER_MAX 32
+#define SWAP_CLUSTER_MAX 32UL
 #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
 
 /*
@@ -202,6 +202,18 @@ struct swap_info_struct {
 	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
 	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
 #endif
+	spinlock_t lock;		/*
+					 * protects map-scan related fields:
+					 * swap_map, lowest_bit, highest_bit,
+					 * inuse_pages, cluster_next,
+					 * cluster_nr, lowest_alloc and
+					 * highest_alloc. Other fields are only
+					 * changed at swapon/swapoff, so they
+					 * are protected by swap_lock. Changing
+					 * flags requires holding both this
+					 * lock and swap_lock; when both are
+					 * needed, take swap_lock first.
+					 */
 };
 
 struct swap_list_t {
@@ -209,15 +221,12 @@ struct swap_list_t {
 	int next;	/* swapfile to be used next */
 };
 
-/* Swap 50% full? Release swapcache more aggressively.. */
-#define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)
-
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
 extern unsigned long totalreserve_pages;
 extern unsigned long dirty_balance_reserve;
-extern unsigned int nr_free_buffer_pages(void);
-extern unsigned int nr_free_pagecache_pages(void);
+extern unsigned long nr_free_buffer_pages(void);
+extern unsigned long nr_free_pagecache_pages(void);
 
 /* Definition of global_page_state not available yet */
 #define nr_free_pages() global_page_state(NR_FREE_PAGES)
@@ -266,7 +275,7 @@ extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
-extern long vm_total_pages;
+extern unsigned long vm_total_pages;
 
 #ifdef CONFIG_NUMA
 extern int zone_reclaim_mode;
@@ -330,8 +339,9 @@ int generic_swapfile_activate(struct swap_info_struct *, struct file *,
 		sector_t *);
 
 /* linux/mm/swap_state.c */
-extern struct address_space swapper_space;
-#define total_swapcache_pages  swapper_space.nrpages
+extern struct address_space swapper_spaces[];
+#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)])
+extern unsigned long total_swapcache_pages(void);
 extern void show_swap_cache_info(void);
 extern int add_to_swap(struct page *);
 extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
@@ -346,8 +356,20 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t,
 			struct vm_area_struct *vma, unsigned long addr);
 
 /* linux/mm/swapfile.c */
-extern long nr_swap_pages;
+extern atomic_long_t nr_swap_pages;
 extern long total_swap_pages;
+
+/* Swap 50% full? Release swapcache more aggressively.. */
+static inline bool vm_swap_full(void)
+{
+	return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages;
+}
+
+static inline long get_nr_swap_pages(void)
+{
+	return atomic_long_read(&nr_swap_pages);
+}
+
 extern void si_swapinfo(struct sysinfo *);
 extern swp_entry_t get_swap_page(void);
 extern swp_entry_t get_swap_page_of_type(int);
@@ -380,9 +402,10 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 
 #else /* CONFIG_SWAP */
 
-#define nr_swap_pages				0L
+#define get_nr_swap_pages()			0L
 #define total_swap_pages			0L
-#define total_swapcache_pages			0UL
+#define total_swapcache_pages()			0UL
+#define vm_swap_full()				0
 
 #define si_swapinfo(val) \
 	do { (val)->freeswap = (val)->totalswap = 0; } while (0)
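
With nr_swap_pages converted to an atomic_long_t above, the old vm_swap_full() macro becomes an inline built on get_nr_swap_pages(). A compilable model of those accessors, using C11 atomics in place of the kernel's atomic_long_t and made-up page counts:

/* Model of the get_nr_swap_pages()/vm_swap_full() accessors, with C11
 * atomics standing in for atomic_long_t. Counts below are invented. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_long nr_swap_pages;
static long total_swap_pages;

static long get_nr_swap_pages(void)
{
	return atomic_load(&nr_swap_pages);
}

/* Swap more than 50% full? Release swapcache more aggressively. */
static bool vm_swap_full(void)
{
	return get_nr_swap_pages() * 2 < total_swap_pages;
}

int main(void)
{
	total_swap_pages = 1000;
	atomic_store(&nr_swap_pages, 600);	/* 400 pages in use */
	printf("vm_swap_full: %d\n", vm_swap_full());	/* 0 */

	atomic_fetch_sub(&nr_swap_pages, 200);	/* now 600 in use */
	printf("vm_swap_full: %d\n", vm_swap_full());	/* 1 */
	return 0;
}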

+ 0 - 1
include/linux/vm_event_item.h

@@ -36,7 +36,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #endif
 		PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL,
 		KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY,
-		KSWAPD_SKIP_CONGESTION_WAIT,
 		PAGEOUTRUN, ALLOCSTALL, PGROTATED,
 #ifdef CONFIG_NUMA_BALANCING
 		NUMA_PTE_UPDATES,

+ 1 - 1
include/linux/vmstat.h

@@ -85,7 +85,7 @@ static inline void vm_events_fold_cpu(int cpu)
 #define count_vm_numa_events(x, y) count_vm_events(x, y)
 #else
 #define count_vm_numa_event(x) do {} while (0)
-#define count_vm_numa_events(x, y) do {} while (0)
+#define count_vm_numa_events(x, y) do { (void)(y); } while (0)
 #endif /* CONFIG_NUMA_BALANCING */
 
 #define __count_zone_vm_events(item, zone, delta) \

+ 7 - 5
ipc/shm.c

@@ -967,11 +967,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
 	unsigned long flags;
 	unsigned long prot;
 	int acc_mode;
-	unsigned long user_addr;
 	struct ipc_namespace *ns;
 	struct shm_file_data *sfd;
 	struct path path;
 	fmode_t f_mode;
+	unsigned long populate = 0;
 
 	err = -EINVAL;
 	if (shmid < 0)
@@ -1070,13 +1070,15 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
 			goto invalid;
 	}
 		
-	user_addr = do_mmap_pgoff(file, addr, size, prot, flags, 0);
-	*raddr = user_addr;
+	addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate);
+	*raddr = addr;
 	err = 0;
-	if (IS_ERR_VALUE(user_addr))
-		err = (long)user_addr;
+	if (IS_ERR_VALUE(addr))
+		err = (long)addr;
 invalid:
 	up_write(&current->mm->mmap_sem);
+	if (populate)
+		mm_populate(addr, populate);
 
 out_fput:
 	fput(file);

+ 19 - 9
kernel/sched/core.c

@@ -1132,18 +1132,28 @@ EXPORT_SYMBOL_GPL(kick_process);
  */
 static int select_fallback_rq(int cpu, struct task_struct *p)
 {
-	const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+	int nid = cpu_to_node(cpu);
+	const struct cpumask *nodemask = NULL;
 	enum { cpuset, possible, fail } state = cpuset;
 	int dest_cpu;
 
-	/* Look for allowed, online CPU in same node. */
-	for_each_cpu(dest_cpu, nodemask) {
-		if (!cpu_online(dest_cpu))
-			continue;
-		if (!cpu_active(dest_cpu))
-			continue;
-		if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
-			return dest_cpu;
+	/*
+	 * If the node that the cpu is on has been offlined, cpu_to_node()
+	 * will return -1. There is no cpu on the node, and we should
+	 * select the cpu on the other node.
+	 */
+	if (nid != -1) {
+		nodemask = cpumask_of_node(nid);
+
+		/* Look for allowed, online CPU in same node. */
+		for_each_cpu(dest_cpu, nodemask) {
+			if (!cpu_online(dest_cpu))
+				continue;
+			if (!cpu_active(dest_cpu))
+				continue;
+			if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+				return dest_cpu;
+		}
 	}
 
 	for (;;) {

+ 0 - 1
kernel/sysctl.c

@@ -105,7 +105,6 @@ extern char core_pattern[];
 extern unsigned int core_pipe_limit;
 #endif
 extern int pid_max;
-extern int min_free_kbytes;
 extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;

+ 9 - 1
mm/Kconfig

@@ -162,10 +162,16 @@ config MOVABLE_NODE
 	  Say Y here if you want to hotplug a whole node.
 	  Say N here if you want kernel to use memory on all nodes evenly.
 
+#
+# Should only be set on architectures that have completely implemented the
+# memory hotplug feature. If you are not sure, don't touch it.
+#
+config HAVE_BOOTMEM_INFO_NODE
+	def_bool n
+
 # eventually, we can have this option just 'select SPARSEMEM'
 config MEMORY_HOTPLUG
 	bool "Allow for memory hot-add"
-	select MEMORY_ISOLATION
 	depends on SPARSEMEM || X86_64_ACPI_NUMA
 	depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
 	depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
@@ -176,6 +182,8 @@ config MEMORY_HOTPLUG_SPARSE
 
 config MEMORY_HOTREMOVE
 	bool "Allow for memory hot remove"
+	select MEMORY_ISOLATION
+	select HAVE_BOOTMEM_INFO_NODE if X86_64
 	depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
 	depends on MIGRATION
 

+ 18 - 17
mm/compaction.c

@@ -15,6 +15,7 @@
 #include <linux/sysctl.h>
 #include <linux/sysfs.h>
 #include <linux/balloon_compaction.h>
+#include <linux/page-isolation.h>
 #include "internal.h"
 
 #ifdef CONFIG_COMPACTION
@@ -85,7 +86,7 @@ static inline bool isolation_suitable(struct compact_control *cc,
 static void __reset_isolation_suitable(struct zone *zone)
 {
 	unsigned long start_pfn = zone->zone_start_pfn;
-	unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+	unsigned long end_pfn = zone_end_pfn(zone);
 	unsigned long pfn;
 
 	zone->compact_cached_migrate_pfn = start_pfn;
@@ -215,7 +216,10 @@ static bool suitable_migration_target(struct page *page)
 	int migratetype = get_pageblock_migratetype(page);
 
 	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
-	if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
+	if (migratetype == MIGRATE_RESERVE)
+		return false;
+
+	if (is_migrate_isolate(migratetype))
 		return false;
 
 	/* If the page is a large free page, then allow migration */
@@ -611,8 +615,7 @@ check_compact_cluster:
 		continue;
 
 next_pageblock:
-		low_pfn += pageblock_nr_pages;
-		low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
+		low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
 		last_pageblock_nr = pageblock_nr;
 	}
 
@@ -644,7 +647,7 @@ static void isolate_freepages(struct zone *zone,
 				struct compact_control *cc)
 {
 	struct page *page;
-	unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn;
+	unsigned long high_pfn, low_pfn, pfn, z_end_pfn, end_pfn;
 	int nr_freepages = cc->nr_freepages;
 	struct list_head *freelist = &cc->freepages;
 
@@ -663,7 +666,7 @@ static void isolate_freepages(struct zone *zone,
 	 */
 	high_pfn = min(low_pfn, pfn);
 
-	zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+	z_end_pfn = zone_end_pfn(zone);
 
 	/*
 	 * Isolate free pages until enough are available to migrate the
@@ -706,7 +709,7 @@ static void isolate_freepages(struct zone *zone,
 		 * only scans within a pageblock
 		 */
 		end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
-		end_pfn = min(end_pfn, zone_end_pfn);
+		end_pfn = min(end_pfn, z_end_pfn);
 		isolated = isolate_freepages_block(cc, pfn, end_pfn,
 						   freelist, false);
 		nr_freepages += isolated;
@@ -795,7 +798,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 	low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
 
 	/* Only scan within a pageblock boundary */
-	end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
+	end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
 
 	/* Do not cross the free scanner or scan within a memory hole */
 	if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
@@ -920,7 +923,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 {
 	int ret;
 	unsigned long start_pfn = zone->zone_start_pfn;
-	unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+	unsigned long end_pfn = zone_end_pfn(zone);
 
 	ret = compaction_suitable(zone, cc->order);
 	switch (ret) {
@@ -977,7 +980,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
 		nr_migrate = cc->nr_migratepages;
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
-				(unsigned long)cc, false,
+				(unsigned long)cc,
 				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
 				MR_COMPACTION);
 		update_nr_listpages(cc);
@@ -1086,7 +1089,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 
 
 /* Compact all zones within a node */
-static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
+static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
 {
 	int zoneid;
 	struct zone *zone;
@@ -1119,28 +1122,26 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
 		VM_BUG_ON(!list_empty(&cc->freepages));
 		VM_BUG_ON(!list_empty(&cc->migratepages));
 	}
-
-	return 0;
 }
 
-int compact_pgdat(pg_data_t *pgdat, int order)
+void compact_pgdat(pg_data_t *pgdat, int order)
 {
 	struct compact_control cc = {
 		.order = order,
 		.sync = false,
 	};
 
-	return __compact_pgdat(pgdat, &cc);
+	__compact_pgdat(pgdat, &cc);
 }
 
-static int compact_node(int nid)
+static void compact_node(int nid)
 {
 	struct compact_control cc = {
 		.order = -1,
 		.sync = true,
 	};
 
-	return __compact_pgdat(NODE_DATA(nid), &cc);
+	__compact_pgdat(NODE_DATA(nid), &cc);
 }
 
 /* Compact all nodes in the system */

+ 16 - 2
mm/fadvise.c

@@ -17,6 +17,7 @@
 #include <linux/fadvise.h>
 #include <linux/writeback.h>
 #include <linux/syscalls.h>
+#include <linux/swap.h>
 
 #include <asm/unistd.h>
 
@@ -120,9 +121,22 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 		start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
 		end_index = (endbyte >> PAGE_CACHE_SHIFT);
 
-		if (end_index >= start_index)
-			invalidate_mapping_pages(mapping, start_index,
+		if (end_index >= start_index) {
+			unsigned long count = invalidate_mapping_pages(mapping,
+						start_index, end_index);
+
+			/*
+			 * If fewer pages were invalidated than expected then
+			 * it is possible that some of the pages were on
+			 * a per-cpu pagevec for a remote CPU. Drain all
+			 * pagevecs and try again.
+			 */
+			if (count < (end_index - start_index + 1)) {
+				lru_add_drain_all();
+				invalidate_mapping_pages(mapping, start_index,
 						end_index);
+			}
+		}
 		break;
 	default:
 		ret = -EINVAL;

+ 27 - 24
mm/fremap.c

@@ -129,6 +129,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 	struct vm_area_struct *vma;
 	int err = -EINVAL;
 	int has_write_lock = 0;
+	vm_flags_t vm_flags;
 
 	if (prot)
 		return err;
@@ -160,15 +161,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 	/*
 	 * Make sure the vma is shared, that it supports prefaulting,
 	 * and that the remapped range is valid and fully within
-	 * the single existing vma.  vm_private_data is used as a
-	 * swapout cursor in a VM_NONLINEAR vma.
+	 * the single existing vma.
 	 */
 	if (!vma || !(vma->vm_flags & VM_SHARED))
 		goto out;
 
-	if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
-		goto out;
-
 	if (!vma->vm_ops || !vma->vm_ops->remap_pages)
 		goto out;
 
@@ -177,6 +174,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 
 	/* Must set VM_NONLINEAR before any pages are populated. */
 	if (!(vma->vm_flags & VM_NONLINEAR)) {
+		/*
+		 * vm_private_data is used as a swapout cursor
+		 * in a VM_NONLINEAR vma.
+		 */
+		if (vma->vm_private_data)
+			goto out;
+
 		/* Don't need a nonlinear mapping, exit success */
 		if (pgoff == linear_page_index(vma, start)) {
 			err = 0;
@@ -184,6 +188,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 		}
 
 		if (!has_write_lock) {
+get_write_lock:
 			up_read(&mm->mmap_sem);
 			down_write(&mm->mmap_sem);
 			has_write_lock = 1;
@@ -199,9 +204,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 			unsigned long addr;
 			struct file *file = get_file(vma->vm_file);
 
-			flags &= MAP_NONBLOCK;
-			addr = mmap_region(file, start, size,
-					flags, vma->vm_flags, pgoff);
+			vm_flags = vma->vm_flags;
+			if (!(flags & MAP_NONBLOCK))
+				vm_flags |= VM_POPULATE;
+			addr = mmap_region(file, start, size, vm_flags, pgoff);
 			fput(file);
 			if (IS_ERR_VALUE(addr)) {
 				err = addr;
@@ -220,32 +226,26 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 		mutex_unlock(&mapping->i_mmap_mutex);
 	}
 
+	if (!(flags & MAP_NONBLOCK) && !(vma->vm_flags & VM_POPULATE)) {
+		if (!has_write_lock)
+			goto get_write_lock;
+		vma->vm_flags |= VM_POPULATE;
+	}
+
 	if (vma->vm_flags & VM_LOCKED) {
 		/*
 		 * drop PG_Mlocked flag for over-mapped range
 		 */
-		vm_flags_t saved_flags = vma->vm_flags;
+		if (!has_write_lock)
+			goto get_write_lock;
+		vm_flags = vma->vm_flags;
 		munlock_vma_pages_range(vma, start, start + size);
-		vma->vm_flags = saved_flags;
+		vma->vm_flags = vm_flags;
 	}
 
 	mmu_notifier_invalidate_range_start(mm, start, start + size);
 	err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
 	mmu_notifier_invalidate_range_end(mm, start, start + size);
-	if (!err && !(flags & MAP_NONBLOCK)) {
-		if (vma->vm_flags & VM_LOCKED) {
-			/*
-			 * might be mapping previously unmapped range of file
-			 */
-			mlock_vma_pages_range(vma, start, start + size);
-		} else {
-			if (unlikely(has_write_lock)) {
-				downgrade_write(&mm->mmap_sem);
-				has_write_lock = 0;
-			}
-			make_pages_present(start, start+size);
-		}
-	}
 
 	/*
 	 * We can't clear VM_NONLINEAR because we'd have to do
@@ -254,10 +254,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 	 */
 
 out:
+	vm_flags = vma->vm_flags;
 	if (likely(!has_write_lock))
 		up_read(&mm->mmap_sem);
 	else
 		up_write(&mm->mmap_sem);
+	if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK)))
+		mm_populate(start, size);
 
 	return err;
 }

+ 24 - 71
mm/huge_memory.c

@@ -20,6 +20,7 @@
 #include <linux/mman.h>
 #include <linux/pagemap.h>
 #include <linux/migrate.h>
+#include <linux/hashtable.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -62,12 +63,11 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
 static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
 
 static int khugepaged(void *none);
-static int mm_slots_hash_init(void);
 static int khugepaged_slab_init(void);
-static void khugepaged_slab_free(void);
 
-#define MM_SLOTS_HASH_HEADS 1024
-static struct hlist_head *mm_slots_hash __read_mostly;
+#define MM_SLOTS_HASH_BITS 10
+static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
+
 static struct kmem_cache *mm_slot_cache __read_mostly;
 
 /**
@@ -105,7 +105,6 @@ static int set_recommended_min_free_kbytes(void)
 	struct zone *zone;
 	int nr_zones = 0;
 	unsigned long recommended_min;
-	extern int min_free_kbytes;
 
 	if (!khugepaged_enabled())
 		return 0;
@@ -634,12 +633,6 @@ static int __init hugepage_init(void)
 	if (err)
 		goto out;
 
-	err = mm_slots_hash_init();
-	if (err) {
-		khugepaged_slab_free();
-		goto out;
-	}
-
 	register_shrinker(&huge_zero_page_shrinker);
 
 	/*
@@ -1302,7 +1295,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	int target_nid;
 	int current_nid = -1;
 	bool migrated;
-	bool page_locked = false;
 
 	spin_lock(&mm->page_table_lock);
 	if (unlikely(!pmd_same(pmd, *pmdp)))
@@ -1324,7 +1316,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	/* Acquire the page lock to serialise THP migrations */
 	spin_unlock(&mm->page_table_lock);
 	lock_page(page);
-	page_locked = true;
 
 	/* Confirm the PTE did not while locked */
 	spin_lock(&mm->page_table_lock);
@@ -1337,34 +1328,26 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	/* Migrate the THP to the requested node */
 	migrated = migrate_misplaced_transhuge_page(mm, vma,
-				pmdp, pmd, addr,
-				page, target_nid);
-	if (migrated)
-		current_nid = target_nid;
-	else {
-		spin_lock(&mm->page_table_lock);
-		if (unlikely(!pmd_same(pmd, *pmdp))) {
-			unlock_page(page);
-			goto out_unlock;
-		}
-		goto clear_pmdnuma;
-	}
+				pmdp, pmd, addr, page, target_nid);
+	if (!migrated)
+		goto check_same;
 
-	task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
+	task_numa_fault(target_nid, HPAGE_PMD_NR, true);
 	return 0;
 
+check_same:
+	spin_lock(&mm->page_table_lock);
+	if (unlikely(!pmd_same(pmd, *pmdp)))
+		goto out_unlock;
 clear_pmdnuma:
 	pmd = pmd_mknonnuma(pmd);
 	set_pmd_at(mm, haddr, pmdp, pmd);
 	VM_BUG_ON(pmd_numa(*pmdp));
 	update_mmu_cache_pmd(vma, addr, pmdp);
-	if (page_locked)
-		unlock_page(page);
-
 out_unlock:
 	spin_unlock(&mm->page_table_lock);
 	if (current_nid != -1)
-		task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
+		task_numa_fault(current_nid, HPAGE_PMD_NR, false);
 	return 0;
 }
 
@@ -1656,7 +1639,7 @@ static void __split_huge_page_refcount(struct page *page)
 		page_tail->mapping = page->mapping;
 
 		page_tail->index = page->index + i;
-		page_xchg_last_nid(page_tail, page_last_nid(page));
+		page_nid_xchg_last(page_tail, page_nid_last(page));
 
 		BUG_ON(!PageAnon(page_tail));
 		BUG_ON(!PageUptodate(page_tail));
@@ -1846,7 +1829,7 @@ int split_huge_page(struct page *page)
 
 	BUG_ON(PageCompound(page));
 out_unlock:
-	anon_vma_unlock(anon_vma);
+	anon_vma_unlock_write(anon_vma);
 	put_anon_vma(anon_vma);
 out:
 	return ret;
@@ -1908,12 +1891,6 @@ static int __init khugepaged_slab_init(void)
 	return 0;
 }
 
-static void __init khugepaged_slab_free(void)
-{
-	kmem_cache_destroy(mm_slot_cache);
-	mm_slot_cache = NULL;
-}
-
 static inline struct mm_slot *alloc_mm_slot(void)
 {
 	if (!mm_slot_cache)	/* initialization failed */
@@ -1926,47 +1903,23 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
 	kmem_cache_free(mm_slot_cache, mm_slot);
 }
 
-static int __init mm_slots_hash_init(void)
-{
-	mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
-				GFP_KERNEL);
-	if (!mm_slots_hash)
-		return -ENOMEM;
-	return 0;
-}
-
-#if 0
-static void __init mm_slots_hash_free(void)
-{
-	kfree(mm_slots_hash);
-	mm_slots_hash = NULL;
-}
-#endif
-
 static struct mm_slot *get_mm_slot(struct mm_struct *mm)
 {
 	struct mm_slot *mm_slot;
-	struct hlist_head *bucket;
 	struct hlist_node *node;
 
-	bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
-				% MM_SLOTS_HASH_HEADS];
-	hlist_for_each_entry(mm_slot, node, bucket, hash) {
+	hash_for_each_possible(mm_slots_hash, mm_slot, node, hash, (unsigned long)mm)
 		if (mm == mm_slot->mm)
 			return mm_slot;
-	}
+
 	return NULL;
 }
 
 static void insert_to_mm_slots_hash(struct mm_struct *mm,
 				    struct mm_slot *mm_slot)
 {
-	struct hlist_head *bucket;
-
-	bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
-				% MM_SLOTS_HASH_HEADS];
 	mm_slot->mm = mm;
-	hlist_add_head(&mm_slot->hash, bucket);
+	hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
 }
 
 static inline int khugepaged_test_exit(struct mm_struct *mm)
@@ -2035,7 +1988,7 @@ void __khugepaged_exit(struct mm_struct *mm)
 	spin_lock(&khugepaged_mm_lock);
 	mm_slot = get_mm_slot(mm);
 	if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
-		hlist_del(&mm_slot->hash);
+		hash_del(&mm_slot->hash);
 		list_del(&mm_slot->mm_node);
 		free = 1;
 	}
@@ -2368,7 +2321,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 		BUG_ON(!pmd_none(*pmd));
 		set_pmd_at(mm, address, pmd, _pmd);
 		spin_unlock(&mm->page_table_lock);
-		anon_vma_unlock(vma->anon_vma);
+		anon_vma_unlock_write(vma->anon_vma);
 		goto out;
 	}
 
@@ -2376,7 +2329,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	 * All pages are isolated and locked so anon_vma rmap
 	 * can't run anymore.
 	 */
-	anon_vma_unlock(vma->anon_vma);
+	anon_vma_unlock_write(vma->anon_vma);
 
 	__collapse_huge_page_copy(pte, new_page, vma, address, ptl);
 	pte_unmap(pte);
@@ -2423,7 +2376,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	struct page *page;
 	unsigned long _address;
 	spinlock_t *ptl;
-	int node = -1;
+	int node = NUMA_NO_NODE;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
@@ -2453,7 +2406,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		 * be more sophisticated and look at more pages,
 		 * but isn't for now.
 		 */
-		if (node == -1)
+		if (node == NUMA_NO_NODE)
 			node = page_to_nid(page);
 		VM_BUG_ON(PageCompound(page));
 		if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
@@ -2484,7 +2437,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
 
 	if (khugepaged_test_exit(mm)) {
 		/* free mm_slot */
-		hlist_del(&mm_slot->hash);
+		hash_del(&mm_slot->hash);
 		list_del(&mm_slot->mm_node);
 
 		/*
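
Not part of the patch: khugepaged's open-coded mm_slots_hash bucket array is replaced by the generic helpers from <linux/hashtable.h>, which also removes the allocation-failure path (the table is now statically defined), and the magic -1 node value gives way to NUMA_NO_NODE. A minimal kernel-style sketch of the same idiom, with made-up struct foo/foo_lookup() names and the same hlist_node-cursor form of hash_for_each_possible() that this kernel version uses:

#include <linux/hashtable.h>

#define FOO_HASH_BITS 10
static DEFINE_HASHTABLE(foo_hash, FOO_HASH_BITS);	/* 1 << 10 buckets */

struct foo {
	unsigned long key;
	struct hlist_node hash;
};

static void foo_insert(struct foo *f, unsigned long key)
{
	f->key = key;
	hash_add(foo_hash, &f->hash, key);	/* key is hashed internally */
}

static struct foo *foo_lookup(unsigned long key)
{
	struct foo *f;
	struct hlist_node *node;

	hash_for_each_possible(foo_hash, f, node, hash, key)
		if (f->key == key)
			return f;
	return NULL;
}

static void foo_remove(struct foo *f)
{
	hash_del(&f->hash);	/* unlink from the table */
}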

+ 15 - 19
mm/hugetlb.c

@@ -1293,8 +1293,7 @@ static void __init report_hugepages(void)
 
 	for_each_hstate(h) {
 		char buf[32];
-		printk(KERN_INFO "HugeTLB registered %s page size, "
-				 "pre-allocated %ld pages\n",
+		pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
 			memfmt(buf, huge_page_size(h)),
 			h->free_huge_pages);
 	}
@@ -1702,8 +1701,7 @@ static void __init hugetlb_sysfs_init(void)
 		err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
 					 hstate_kobjs, &hstate_attr_group);
 		if (err)
-			printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
-								h->name);
+			pr_err("Hugetlb: Unable to add hstate %s", h->name);
 	}
 }
 
@@ -1826,9 +1824,8 @@ void hugetlb_register_node(struct node *node)
 						nhs->hstate_kobjs,
 						&per_node_hstate_attr_group);
 		if (err) {
-			printk(KERN_ERR "Hugetlb: Unable to add hstate %s"
-					" for node %d\n",
-						h->name, node->dev.id);
+			pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
+				h->name, node->dev.id);
 			hugetlb_unregister_node(node);
 			break;
 		}
@@ -1924,7 +1921,7 @@ void __init hugetlb_add_hstate(unsigned order)
 	unsigned long i;
 
 	if (size_to_hstate(PAGE_SIZE << order)) {
-		printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
+		pr_warning("hugepagesz= specified twice, ignoring\n");
 		return;
 	}
 	BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
@@ -1960,8 +1957,8 @@ static int __init hugetlb_nrpages_setup(char *s)
 		mhp = &parsed_hstate->max_huge_pages;
 
 	if (mhp == last_mhp) {
-		printk(KERN_WARNING "hugepages= specified twice without "
-			"interleaving hugepagesz=, ignoring\n");
+		pr_warning("hugepages= specified twice without "
+			   "interleaving hugepagesz=, ignoring\n");
 		return 1;
 	}
 
@@ -2692,9 +2689,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * COW. Warn that such a situation has occurred as it may not be obvious
 	 */
 	if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
-		printk(KERN_WARNING
-			"PID %d killed due to inadequate hugepage pool\n",
-			current->pid);
+		pr_warning("PID %d killed due to inadequate hugepage pool\n",
+			   current->pid);
 		return ret;
 	}
 
@@ -2924,14 +2920,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
-int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
-			struct page **pages, struct vm_area_struct **vmas,
-			unsigned long *position, int *length, int i,
-			unsigned int flags)
+long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
+			 struct page **pages, struct vm_area_struct **vmas,
+			 unsigned long *position, unsigned long *nr_pages,
+			 long i, unsigned int flags)
 {
 	unsigned long pfn_offset;
 	unsigned long vaddr = *position;
-	int remainder = *length;
+	unsigned long remainder = *nr_pages;
 	struct hstate *h = hstate_vma(vma);
 
 	spin_lock(&mm->page_table_lock);
@@ -3001,7 +2997,7 @@ same_page:
 		}
 	}
 	spin_unlock(&mm->page_table_lock);
-	*length = remainder;
+	*nr_pages = remainder;
 	*position = vaddr;
 
 	return i ? i : -EFAULT;
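
Not part of the patch: two independent cleanups here. The printk(KERN_*) calls become pr_info()/pr_err()/pr_warning(), and follow_hugetlb_page() moves its page counts from int to long/unsigned long so a huge get_user_pages() request cannot overflow the counter. For reference, the pr_* helpers are only thin wrappers; simplified from include/linux/printk.h they look roughly like:

#ifndef pr_fmt
#define pr_fmt(fmt) fmt			/* files may prefix their own tag */
#endif
#define pr_err(fmt, ...)     printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
#define pr_warning(fmt, ...) printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
#define pr_info(fmt, ...)    printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)

so pr_info("HugeTLB ...") is the same printk(KERN_INFO ...) it replaces, just shorter and easier to grep.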

+ 2 - 2
mm/internal.h

@@ -162,8 +162,8 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
 		struct vm_area_struct *prev, struct rb_node *rb_parent);
 
 #ifdef CONFIG_MMU
-extern long mlock_vma_pages_range(struct vm_area_struct *vma,
-			unsigned long start, unsigned long end);
+extern long __mlock_vma_pages_range(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end, int *nonblocking);
 extern void munlock_vma_pages_range(struct vm_area_struct *vma,
 			unsigned long start, unsigned long end);
 static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
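
Not part of the patch: mlock_vma_pages_range() disappears from the mm-internal API; callers now use __mlock_vma_pages_range(), whose new nonblocking argument lets the fault path report that it had to drop mmap_sem. A hedged sketch of the calling convention (populate_range() is made up here; the real caller is the mm_populate() machinery in mm/mlock.c):

#include <linux/mm.h>

static long populate_range(struct mm_struct *mm, unsigned long start,
			   unsigned long end)
{
	struct vm_area_struct *vma;
	int locked = 1;
	long ret = 0;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, start);
	if (vma && vma->vm_start <= start)
		ret = __mlock_vma_pages_range(vma, start, end, &locked);
	if (locked)			/* cleared if gup dropped mmap_sem */
		up_read(&mm->mmap_sem);
	return ret;
}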

+ 2 - 3
mm/kmemleak.c

@@ -1300,9 +1300,8 @@ static void kmemleak_scan(void)
 	 */
 	lock_memory_hotplug();
 	for_each_online_node(i) {
-		pg_data_t *pgdat = NODE_DATA(i);
-		unsigned long start_pfn = pgdat->node_start_pfn;
-		unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
+		unsigned long start_pfn = node_start_pfn(i);
+		unsigned long end_pfn = node_end_pfn(i);
 		unsigned long pfn;
 
 		for (pfn = start_pfn; pfn < end_pfn; pfn++) {
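
Not part of the patch: the scan loop switches to the node_start_pfn()/node_end_pfn() helpers instead of reaching into pg_data_t by hand. Simplified, those helpers amount to the following (names prefixed here to mark this as a sketch, not the real definitions in include/linux/mmzone.h):

#include <linux/mmzone.h>

static inline unsigned long sketch_node_start_pfn(int nid)
{
	return NODE_DATA(nid)->node_start_pfn;
}

static inline unsigned long sketch_node_end_pfn(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);

	return pgdat->node_start_pfn + pgdat->node_spanned_pages;
}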

File diff suppressed because it is too large
+ 525 - 124
mm/ksm.c


+ 101 - 4
mm/madvise.c

@@ -16,6 +16,9 @@
 #include <linux/ksm.h>
 #include <linux/fs.h>
 #include <linux/file.h>
+#include <linux/blkdev.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
 
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -131,6 +134,84 @@ out:
 	return error;
 }
 
+#ifdef CONFIG_SWAP
+static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
+	unsigned long end, struct mm_walk *walk)
+{
+	pte_t *orig_pte;
+	struct vm_area_struct *vma = walk->private;
+	unsigned long index;
+
+	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+		return 0;
+
+	for (index = start; index != end; index += PAGE_SIZE) {
+		pte_t pte;
+		swp_entry_t entry;
+		struct page *page;
+		spinlock_t *ptl;
+
+		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
+		pte_unmap_unlock(orig_pte, ptl);
+
+		if (pte_present(pte) || pte_none(pte) || pte_file(pte))
+			continue;
+		entry = pte_to_swp_entry(pte);
+		if (unlikely(non_swap_entry(entry)))
+			continue;
+
+		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
+								vma, index);
+		if (page)
+			page_cache_release(page);
+	}
+
+	return 0;
+}
+
+static void force_swapin_readahead(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end)
+{
+	struct mm_walk walk = {
+		.mm = vma->vm_mm,
+		.pmd_entry = swapin_walk_pmd_entry,
+		.private = vma,
+	};
+
+	walk_page_range(start, end, &walk);
+
+	lru_add_drain();	/* Push any new pages onto the LRU now */
+}
+
+static void force_shm_swapin_readahead(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end,
+		struct address_space *mapping)
+{
+	pgoff_t index;
+	struct page *page;
+	swp_entry_t swap;
+
+	for (; start < end; start += PAGE_SIZE) {
+		index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+		page = find_get_page(mapping, index);
+		if (!radix_tree_exceptional_entry(page)) {
+			if (page)
+				page_cache_release(page);
+			continue;
+		}
+		swap = radix_to_swp_entry(page);
+		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
+								NULL, 0);
+		if (page)
+			page_cache_release(page);
+	}
+
+	lru_add_drain();	/* Push any new pages onto the LRU now */
+}
+#endif		/* CONFIG_SWAP */
+
 /*
  * Schedule all required I/O operations.  Do not wait for completion.
  */
@@ -140,6 +221,18 @@ static long madvise_willneed(struct vm_area_struct * vma,
 {
 	struct file *file = vma->vm_file;
 
+#ifdef CONFIG_SWAP
+	if (!file || mapping_cap_swap_backed(file->f_mapping)) {
+		*prev = vma;
+		if (!file)
+			force_swapin_readahead(vma, start, end);
+		else
+			force_shm_swapin_readahead(vma, start, end,
+						file->f_mapping);
+		return 0;
+	}
+#endif
+
 	if (!file)
 		return -EBADF;
 
@@ -371,6 +464,7 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	int error = -EINVAL;
 	int write;
 	size_t len;
+	struct blk_plug plug;
 
 #ifdef CONFIG_MEMORY_FAILURE
 	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
@@ -410,18 +504,19 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	if (vma && start > vma->vm_start)
 		prev = vma;
 
+	blk_start_plug(&plug);
 	for (;;) {
 		/* Still start < end. */
 		error = -ENOMEM;
 		if (!vma)
-			goto out;
+			goto out_plug;
 
 		/* Here start < (end|vma->vm_end). */
 		if (start < vma->vm_start) {
 			unmapped_error = -ENOMEM;
 			start = vma->vm_start;
 			if (start >= end)
-				goto out;
+				goto out_plug;
 		}
 
 		/* Here vma->vm_start <= start < (end|vma->vm_end) */
@@ -432,18 +527,20 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
 		error = madvise_vma(vma, &prev, start, tmp, behavior);
 		if (error)
-			goto out;
+			goto out_plug;
 		start = tmp;
 		if (prev && start < prev->vm_end)
 			start = prev->vm_end;
 		error = unmapped_error;
 		if (start >= end)
-			goto out;
+			goto out_plug;
 		if (prev)
 			vma = prev->vm_next;
 		else	/* madvise_remove dropped mmap_sem */
 			vma = find_vma(current->mm, start);
 	}
+out_plug:
+	blk_finish_plug(&plug);
 out:
 	if (write)
 		up_write(&current->mm->mmap_sem);
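
Not part of the patch: MADV_WILLNEED used to return -EBADF for mappings without a file. With CONFIG_SWAP it now walks anonymous ranges (and the radix tree for shmem/tmpfs) and starts asynchronous swap-in readahead via read_swap_cache_async(); the whole madvise() pass is additionally bracketed by blk_start_plug()/blk_finish_plug() so the readahead I/O is submitted in batches. A hedged userspace sketch of the newly useful case:

/*
 * Illustrative only: prefetch a previously swapped-out anonymous region
 * before touching it. The size and the "memory pressure" step are assumed.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define LEN (64UL << 20)	/* 64 MB, illustrative */

int main(void)
{
	unsigned long sum = 0;
	char *buf = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memset(buf, 0x5a, LEN);	/* dirty it so it is eligible for swap */

	/* ... assume memory pressure pushes parts of buf out to swap ... */

	if (madvise(buf, LEN, MADV_WILLNEED) < 0)	/* async readahead */
		perror("madvise");

	for (unsigned long i = 0; i < LEN; i += 4096)	/* now mostly hits */
		sum += buf[i];
	printf("%lu\n", sum);

	munmap(buf, LEN);
	return 0;
}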

+ 50 - 0
mm/memblock.c

@@ -92,9 +92,58 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
  *
  * Find @size free area aligned to @align in the specified range and node.
  *
+ * If we have CONFIG_HAVE_MEMBLOCK_NODE_MAP defined, we need to check if the
+ * memory we found is not in hotpluggable ranges.
+ *
  * RETURNS:
  * Found address on success, %0 on failure.
  */
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
+					phys_addr_t end, phys_addr_t size,
+					phys_addr_t align, int nid)
+{
+	phys_addr_t this_start, this_end, cand;
+	u64 i;
+	int curr = movablemem_map.nr_map - 1;
+
+	/* pump up @end */
+	if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
+		end = memblock.current_limit;
+
+	/* avoid allocating the first page */
+	start = max_t(phys_addr_t, start, PAGE_SIZE);
+	end = max(start, end);
+
+	for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
+		this_start = clamp(this_start, start, end);
+		this_end = clamp(this_end, start, end);
+
+restart:
+		if (this_end <= this_start || this_end < size)
+			continue;
+
+		for (; curr >= 0; curr--) {
+			if ((movablemem_map.map[curr].start_pfn << PAGE_SHIFT)
+			    < this_end)
+				break;
+		}
+
+		cand = round_down(this_end - size, align);
+		if (curr >= 0 &&
+		    cand < movablemem_map.map[curr].end_pfn << PAGE_SHIFT) {
+			this_end = movablemem_map.map[curr].start_pfn
+				   << PAGE_SHIFT;
+			goto restart;
+		}
+
+		if (cand >= this_start)
+			return cand;
+	}
+
+	return 0;
+}
+#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
 					phys_addr_t end, phys_addr_t size,
 					phys_addr_t align, int nid)
@@ -123,6 +172,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
 	}
 	return 0;
 }
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 /**
  * memblock_find_in_range - find free area in given range
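
Not part of the patch: when CONFIG_HAVE_MEMBLOCK_NODE_MAP is set, the top-down free-area search now refuses candidates that overlap ranges recorded in movablemem_map (filled from the corresponding boot option); it clamps the search window below that range's start_pfn and retries, so boot-time allocations stay out of memory meant to remain hotpluggable. The signature is unchanged; a hedged caller sketch (pick_node_buffer() is made up):

#include <linux/memblock.h>

static phys_addr_t __init pick_node_buffer(int nid, phys_addr_t size)
{
	phys_addr_t base;

	base = memblock_find_in_range_node(0, MEMBLOCK_ALLOC_ACCESSIBLE,
					   size, PAGE_SIZE, nid);
	if (!base)	/* 0 means no suitable range was found */
		pr_warn("node %d: no range outside movablemem_map\n", nid);
	return base;	/* caller still has to memblock_reserve() it */
}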

+ 268 - 205
mm/memcontrol.c

@@ -120,6 +120,14 @@ static const char * const mem_cgroup_events_names[] = {
 	"pgmajfault",
 };
 
+static const char * const mem_cgroup_lru_names[] = {
+	"inactive_anon",
+	"active_anon",
+	"inactive_file",
+	"active_file",
+	"unevictable",
+};
+
 /*
  * Per memcg event counter is incremented at every pagein/pageout. With THP,
  * it will be incremated by the number of pages. This counter is used for
@@ -172,7 +180,7 @@ struct mem_cgroup_per_node {
 };
 
 struct mem_cgroup_lru_info {
-	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
+	struct mem_cgroup_per_node *nodeinfo[0];
 };
 
 /*
@@ -275,17 +283,6 @@ struct mem_cgroup {
 	 * the counter to account for kernel memory usage.
 	 */
 	struct res_counter kmem;
-	/*
-	 * Per cgroup active and inactive list, similar to the
-	 * per zone LRU lists.
-	 */
-	struct mem_cgroup_lru_info info;
-	int last_scanned_node;
-#if MAX_NUMNODES > 1
-	nodemask_t	scan_nodes;
-	atomic_t	numainfo_events;
-	atomic_t	numainfo_updating;
-#endif
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
@@ -349,8 +346,29 @@ struct mem_cgroup {
         /* Index in the kmem_cache->memcg_params->memcg_caches array */
 	int kmemcg_id;
 #endif
+
+	int last_scanned_node;
+#if MAX_NUMNODES > 1
+	nodemask_t	scan_nodes;
+	atomic_t	numainfo_events;
+	atomic_t	numainfo_updating;
+#endif
+	/*
+	 * Per cgroup active and inactive list, similar to the
+	 * per zone LRU lists.
+	 *
+	 * WARNING: This has to be the last element of the struct. Don't
+	 * add new fields after this point.
+	 */
+	struct mem_cgroup_lru_info info;
 };
 
+static size_t memcg_size(void)
+{
+	return sizeof(struct mem_cgroup) +
+		nr_node_ids * sizeof(struct mem_cgroup_per_node);
+}
+
 /* internal only representation about the status of kmem accounting. */
 enum {
 	KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
@@ -398,8 +416,8 @@ static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
 
 /* Stuffs for move charges at task migration. */
 /*
- * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
- * left-shifted bitmap of these types.
+ * Types of charges to be moved. "move_charge_at_immitgrate" and
+ * "immigrate_flags" are treated as a left-shifted bitmap of these types.
  */
 enum move_type {
 	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
@@ -412,6 +430,7 @@ static struct move_charge_struct {
 	spinlock_t	  lock; /* for from, to */
 	struct mem_cgroup *from;
 	struct mem_cgroup *to;
+	unsigned long immigrate_flags;
 	unsigned long precharge;
 	unsigned long moved_charge;
 	unsigned long moved_swap;
@@ -424,14 +443,12 @@ static struct move_charge_struct {
 
 static bool move_anon(void)
 {
-	return test_bit(MOVE_CHARGE_TYPE_ANON,
-					&mc.to->move_charge_at_immigrate);
+	return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
 }
 
 static bool move_file(void)
 {
-	return test_bit(MOVE_CHARGE_TYPE_FILE,
-					&mc.to->move_charge_at_immigrate);
+	return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
 }
 
 /*
@@ -471,6 +488,13 @@ enum res_type {
 #define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
 #define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
 
+/*
+ * The memcg_create_mutex will be held whenever a new cgroup is created.
+ * As a consequence, any change that needs to protect against new child cgroups
+ * appearing has to hold it as well.
+ */
+static DEFINE_MUTEX(memcg_create_mutex);
+
 static void mem_cgroup_get(struct mem_cgroup *memcg);
 static void mem_cgroup_put(struct mem_cgroup *memcg);
 
@@ -627,6 +651,7 @@ static void drain_all_stock_async(struct mem_cgroup *memcg);
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
 {
+	VM_BUG_ON((unsigned)nid >= nr_node_ids);
 	return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
 }
 
@@ -1371,17 +1396,6 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
 	return inactive * inactive_ratio < active;
 }
 
-int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
-{
-	unsigned long active;
-	unsigned long inactive;
-
-	inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
-	active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
-
-	return (active > inactive);
-}
-
 #define mem_cgroup_from_res_counter(counter, member)	\
 	container_of(counter, struct mem_cgroup, member)
 
@@ -1524,8 +1538,9 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
 	spin_unlock_irqrestore(&memcg->move_lock, *flags);
 }
 
+#define K(x) ((x) << (PAGE_SHIFT-10))
 /**
- * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
+ * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
  * @memcg: The memory cgroup that went over limit
  * @p: Task that is going to be killed
  *
@@ -1543,8 +1558,10 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 	 */
 	static char memcg_name[PATH_MAX];
 	int ret;
+	struct mem_cgroup *iter;
+	unsigned int i;
 
-	if (!memcg || !p)
+	if (!p)
 		return;
 
 	rcu_read_lock();
@@ -1563,7 +1580,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 	}
 	rcu_read_unlock();
 
-	printk(KERN_INFO "Task in %s killed", memcg_name);
+	pr_info("Task in %s killed", memcg_name);
 
 	rcu_read_lock();
 	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
@@ -1576,22 +1593,45 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 	/*
 	 * Continues from above, so we don't need an KERN_ level
 	 */
-	printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
+	pr_cont(" as a result of limit of %s\n", memcg_name);
 done:
 
-	printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
+	pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
 		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
 		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
 		res_counter_read_u64(&memcg->res, RES_FAILCNT));
-	printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
-		"failcnt %llu\n",
+	pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
 		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
 		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
 		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
-	printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n",
+	pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
 		res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
 		res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
 		res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
+
+	for_each_mem_cgroup_tree(iter, memcg) {
+		pr_info("Memory cgroup stats");
+
+		rcu_read_lock();
+		ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
+		if (!ret)
+			pr_cont(" for %s", memcg_name);
+		rcu_read_unlock();
+		pr_cont(":");
+
+		for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
+			if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
+				continue;
+			pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
+				K(mem_cgroup_read_stat(iter, i)));
+		}
+
+		for (i = 0; i < NR_LRU_LISTS; i++)
+			pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
+				K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
+
+		pr_cont("\n");
+	}
 }
 
 /*
@@ -2256,6 +2296,17 @@ static void drain_local_stock(struct work_struct *dummy)
 	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 }
 
+static void __init memcg_stock_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct memcg_stock_pcp *stock =
+					&per_cpu(memcg_stock, cpu);
+		INIT_WORK(&stock->work, drain_local_stock);
+	}
+}
+
 /*
  * Cache charges(val) which is from res_counter, to local per_cpu area.
  * This will be consumed by consume_stock() function, later.
@@ -4391,8 +4442,8 @@ void mem_cgroup_print_bad_page(struct page *page)
 
 	pc = lookup_page_cgroup_used(page);
 	if (pc) {
-		printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
-		       pc, pc->flags, pc->mem_cgroup);
+		pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
+			 pc, pc->flags, pc->mem_cgroup);
 	}
 }
 #endif
@@ -4718,6 +4769,33 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
 	} while (usage > 0);
 }
 
+/*
+ * This mainly exists for tests during the setting of use_hierarchy.
+ * Since this is the very setting we are changing, the current hierarchy value
+ * is meaningless
+ */
+static inline bool __memcg_has_children(struct mem_cgroup *memcg)
+{
+	struct cgroup *pos;
+
+	/* bounce at first found */
+	cgroup_for_each_child(pos, memcg->css.cgroup)
+		return true;
+	return false;
+}
+
+/*
+ * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed
+ * to be already dead (as in mem_cgroup_force_empty, for instance).  This is
+ * different from mem_cgroup_count_children(), in the sense that we don't really care how
+ * many children we have; we only need to know if we have any.  It also counts
+ * any memcg without hierarchy as infertile.
+ */
+static inline bool memcg_has_children(struct mem_cgroup *memcg)
+{
+	return memcg->use_hierarchy && __memcg_has_children(memcg);
+}
+
 /*
  * Reclaims as many pages from the given memcg as possible and moves
  * the rest to the parent.
@@ -4788,7 +4866,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
 	if (parent)
 		parent_memcg = mem_cgroup_from_cont(parent);
 
-	cgroup_lock();
+	mutex_lock(&memcg_create_mutex);
 
 	if (memcg->use_hierarchy == val)
 		goto out;
@@ -4803,7 +4881,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
 	 */
 	if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
 				(val == 1 || val == 0)) {
-		if (list_empty(&cont->children))
+		if (!__memcg_has_children(memcg))
 			memcg->use_hierarchy = val;
 		else
 			retval = -EBUSY;
@@ -4811,7 +4889,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
 		retval = -EINVAL;
 
 out:
-	cgroup_unlock();
+	mutex_unlock(&memcg_create_mutex);
 
 	return retval;
 }
@@ -4896,8 +4974,6 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
 {
 	int ret = -EINVAL;
 #ifdef CONFIG_MEMCG_KMEM
-	bool must_inc_static_branch = false;
-
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 	/*
 	 * For simplicity, we won't allow this to be disabled.  It also can't
@@ -4910,18 +4986,11 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
 	 *
 	 * After it first became limited, changes in the value of the limit are
 	 * of course permitted.
-	 *
-	 * Taking the cgroup_lock is really offensive, but it is so far the only
-	 * way to guarantee that no children will appear. There are plenty of
-	 * other offenders, and they should all go away. Fine grained locking
-	 * is probably the way to go here. When we are fully hierarchical, we
-	 * can also get rid of the use_hierarchy check.
 	 */
-	cgroup_lock();
+	mutex_lock(&memcg_create_mutex);
 	mutex_lock(&set_limit_mutex);
 	if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
-		if (cgroup_task_count(cont) || (memcg->use_hierarchy &&
-						!list_empty(&cont->children))) {
+		if (cgroup_task_count(cont) || memcg_has_children(memcg)) {
 			ret = -EBUSY;
 			goto out;
 		}
@@ -4933,7 +5002,13 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
 			res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
 			goto out;
 		}
-		must_inc_static_branch = true;
+		static_key_slow_inc(&memcg_kmem_enabled_key);
+		/*
+		 * setting the active bit after the inc will guarantee no one
+		 * starts accounting before all call sites are patched
+		 */
+		memcg_kmem_set_active(memcg);
+
 		/*
 		 * kmem charges can outlive the cgroup. In the case of slab
 		 * pages, for instance, a page contain objects from various
@@ -4945,32 +5020,12 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
 		ret = res_counter_set_limit(&memcg->kmem, val);
 out:
 	mutex_unlock(&set_limit_mutex);
-	cgroup_unlock();
-
-	/*
-	 * We are by now familiar with the fact that we can't inc the static
-	 * branch inside cgroup_lock. See disarm functions for details. A
-	 * worker here is overkill, but also wrong: After the limit is set, we
-	 * must start accounting right away. Since this operation can't fail,
-	 * we can safely defer it to here - no rollback will be needed.
-	 *
-	 * The boolean used to control this is also safe, because
-	 * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be
-	 * able to set it to true;
-	 */
-	if (must_inc_static_branch) {
-		static_key_slow_inc(&memcg_kmem_enabled_key);
-		/*
-		 * setting the active bit after the inc will guarantee no one
-		 * starts accounting before all call sites are patched
-		 */
-		memcg_kmem_set_active(memcg);
-	}
-
+	mutex_unlock(&memcg_create_mutex);
 #endif
 	return ret;
 }
 
+#ifdef CONFIG_MEMCG_KMEM
 static int memcg_propagate_kmem(struct mem_cgroup *memcg)
 {
 	int ret = 0;
@@ -4979,7 +5034,6 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
 		goto out;
 
 	memcg->kmem_account_flags = parent->kmem_account_flags;
-#ifdef CONFIG_MEMCG_KMEM
 	/*
 	 * When that happen, we need to disable the static branch only on those
 	 * memcgs that enabled it. To achieve this, we would be forced to
@@ -5005,10 +5059,10 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
 	mutex_lock(&set_limit_mutex);
 	ret = memcg_update_cache_sizes(memcg);
 	mutex_unlock(&set_limit_mutex);
-#endif
 out:
 	return ret;
 }
+#endif /* CONFIG_MEMCG_KMEM */
 
 /*
  * The user of this function is...
@@ -5148,15 +5202,14 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
 
 	if (val >= (1 << NR_MOVE_TYPE))
 		return -EINVAL;
+
 	/*
-	 * We check this value several times in both in can_attach() and
-	 * attach(), so we need cgroup lock to prevent this value from being
-	 * inconsistent.
+	 * No kind of locking is needed in here, because ->can_attach() will
+	 * check this value once in the beginning of the process, and then carry
+	 * on with stale data. This means that changes to this value will only
+	 * affect task migrations starting after the change.
 	 */
-	cgroup_lock();
 	memcg->move_charge_at_immigrate = val;
-	cgroup_unlock();
-
 	return 0;
 }
 #else
@@ -5214,14 +5267,6 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
 }
 #endif /* CONFIG_NUMA */
 
-static const char * const mem_cgroup_lru_names[] = {
-	"inactive_anon",
-	"active_anon",
-	"inactive_file",
-	"active_file",
-	"unevictable",
-};
-
 static inline void mem_cgroup_lru_names_not_uptodate(void)
 {
 	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
@@ -5335,18 +5380,17 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
 
 	parent = mem_cgroup_from_cont(cgrp->parent);
 
-	cgroup_lock();
+	mutex_lock(&memcg_create_mutex);
 
 	/* If under hierarchy, only empty-root can set this value */
-	if ((parent->use_hierarchy) ||
-	    (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
-		cgroup_unlock();
+	if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
+		mutex_unlock(&memcg_create_mutex);
 		return -EINVAL;
 	}
 
 	memcg->swappiness = val;
 
-	cgroup_unlock();
+	mutex_unlock(&memcg_create_mutex);
 
 	return 0;
 }
@@ -5672,17 +5716,16 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
 
 	parent = mem_cgroup_from_cont(cgrp->parent);
 
-	cgroup_lock();
+	mutex_lock(&memcg_create_mutex);
 	/* oom-kill-disable is a flag for subhierarchy. */
-	if ((parent->use_hierarchy) ||
-	    (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
-		cgroup_unlock();
+	if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
+		mutex_unlock(&memcg_create_mutex);
 		return -EINVAL;
 	}
 	memcg->oom_kill_disable = val;
 	if (!val)
 		memcg_oom_recover(memcg);
-	cgroup_unlock();
+	mutex_unlock(&memcg_create_mutex);
 	return 0;
 }
 
@@ -5797,33 +5840,6 @@ static struct cftype mem_cgroup_files[] = {
 		.read_seq_string = memcg_numa_stat_show,
 	},
 #endif
-#ifdef CONFIG_MEMCG_SWAP
-	{
-		.name = "memsw.usage_in_bytes",
-		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
-		.read = mem_cgroup_read,
-		.register_event = mem_cgroup_usage_register_event,
-		.unregister_event = mem_cgroup_usage_unregister_event,
-	},
-	{
-		.name = "memsw.max_usage_in_bytes",
-		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
-		.trigger = mem_cgroup_reset,
-		.read = mem_cgroup_read,
-	},
-	{
-		.name = "memsw.limit_in_bytes",
-		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
-		.write_string = mem_cgroup_write,
-		.read = mem_cgroup_read,
-	},
-	{
-		.name = "memsw.failcnt",
-		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
-		.trigger = mem_cgroup_reset,
-		.read = mem_cgroup_read,
-	},
-#endif
 #ifdef CONFIG_MEMCG_KMEM
 	{
 		.name = "kmem.limit_in_bytes",
@@ -5858,6 +5874,36 @@ static struct cftype mem_cgroup_files[] = {
 	{ },	/* terminate */
 };
 
+#ifdef CONFIG_MEMCG_SWAP
+static struct cftype memsw_cgroup_files[] = {
+	{
+		.name = "memsw.usage_in_bytes",
+		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
+		.read = mem_cgroup_read,
+		.register_event = mem_cgroup_usage_register_event,
+		.unregister_event = mem_cgroup_usage_unregister_event,
+	},
+	{
+		.name = "memsw.max_usage_in_bytes",
+		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
+		.trigger = mem_cgroup_reset,
+		.read = mem_cgroup_read,
+	},
+	{
+		.name = "memsw.limit_in_bytes",
+		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
+		.write_string = mem_cgroup_write,
+		.read = mem_cgroup_read,
+	},
+	{
+		.name = "memsw.failcnt",
+		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
+		.trigger = mem_cgroup_reset,
+		.read = mem_cgroup_read,
+	},
+	{ },	/* terminate */
+};
+#endif
 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 {
 	struct mem_cgroup_per_node *pn;
@@ -5896,9 +5942,9 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 static struct mem_cgroup *mem_cgroup_alloc(void)
 {
 	struct mem_cgroup *memcg;
-	int size = sizeof(struct mem_cgroup);
+	size_t size = memcg_size();
 
-	/* Can be very big if MAX_NUMNODES is very big */
+	/* Can be very big if nr_node_ids is very big */
 	if (size < PAGE_SIZE)
 		memcg = kzalloc(size, GFP_KERNEL);
 	else
@@ -5935,7 +5981,7 @@ out_free:
 static void __mem_cgroup_free(struct mem_cgroup *memcg)
 {
 	int node;
-	int size = sizeof(struct mem_cgroup);
+	size_t size = memcg_size();
 
 	mem_cgroup_remove_from_trees(memcg);
 	free_css_id(&mem_cgroup_subsys, &memcg->css);
@@ -6017,19 +6063,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(parent_mem_cgroup);
 
-#ifdef CONFIG_MEMCG_SWAP
-static void __init enable_swap_cgroup(void)
-{
-	if (!mem_cgroup_disabled() && really_do_swap_account)
-		do_swap_account = 1;
-}
-#else
-static void __init enable_swap_cgroup(void)
-{
-}
-#endif
-
-static int mem_cgroup_soft_limit_tree_init(void)
+static void __init mem_cgroup_soft_limit_tree_init(void)
 {
 	struct mem_cgroup_tree_per_node *rtpn;
 	struct mem_cgroup_tree_per_zone *rtpz;
@@ -6040,8 +6074,7 @@ static int mem_cgroup_soft_limit_tree_init(void)
 		if (!node_state(node, N_NORMAL_MEMORY))
 			tmp = -1;
 		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
-		if (!rtpn)
-			goto err_cleanup;
+		BUG_ON(!rtpn);
 
 		soft_limit_tree.rb_tree_per_node[node] = rtpn;
 
@@ -6051,23 +6084,12 @@ static int mem_cgroup_soft_limit_tree_init(void)
 			spin_lock_init(&rtpz->lock);
 		}
 	}
-	return 0;
-
-err_cleanup:
-	for_each_node(node) {
-		if (!soft_limit_tree.rb_tree_per_node[node])
-			break;
-		kfree(soft_limit_tree.rb_tree_per_node[node]);
-		soft_limit_tree.rb_tree_per_node[node] = NULL;
-	}
-	return 1;
-
 }
 
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup *cont)
 {
-	struct mem_cgroup *memcg, *parent;
+	struct mem_cgroup *memcg;
 	long error = -ENOMEM;
 	int node;
 
@@ -6081,24 +6103,44 @@ mem_cgroup_css_alloc(struct cgroup *cont)
 
 	/* root ? */
 	if (cont->parent == NULL) {
-		int cpu;
-		enable_swap_cgroup();
-		parent = NULL;
-		if (mem_cgroup_soft_limit_tree_init())
-			goto free_out;
 		root_mem_cgroup = memcg;
-		for_each_possible_cpu(cpu) {
-			struct memcg_stock_pcp *stock =
-						&per_cpu(memcg_stock, cpu);
-			INIT_WORK(&stock->work, drain_local_stock);
-		}
-	} else {
-		parent = mem_cgroup_from_cont(cont->parent);
-		memcg->use_hierarchy = parent->use_hierarchy;
-		memcg->oom_kill_disable = parent->oom_kill_disable;
+		res_counter_init(&memcg->res, NULL);
+		res_counter_init(&memcg->memsw, NULL);
+		res_counter_init(&memcg->kmem, NULL);
 	}
 
-	if (parent && parent->use_hierarchy) {
+	memcg->last_scanned_node = MAX_NUMNODES;
+	INIT_LIST_HEAD(&memcg->oom_notify);
+	atomic_set(&memcg->refcnt, 1);
+	memcg->move_charge_at_immigrate = 0;
+	mutex_init(&memcg->thresholds_lock);
+	spin_lock_init(&memcg->move_lock);
+
+	return &memcg->css;
+
+free_out:
+	__mem_cgroup_free(memcg);
+	return ERR_PTR(error);
+}
+
+static int
+mem_cgroup_css_online(struct cgroup *cont)
+{
+	struct mem_cgroup *memcg, *parent;
+	int error = 0;
+
+	if (!cont->parent)
+		return 0;
+
+	mutex_lock(&memcg_create_mutex);
+	memcg = mem_cgroup_from_cont(cont);
+	parent = mem_cgroup_from_cont(cont->parent);
+
+	memcg->use_hierarchy = parent->use_hierarchy;
+	memcg->oom_kill_disable = parent->oom_kill_disable;
+	memcg->swappiness = mem_cgroup_swappiness(parent);
+
+	if (parent->use_hierarchy) {
 		res_counter_init(&memcg->res, &parent->res);
 		res_counter_init(&memcg->memsw, &parent->memsw);
 		res_counter_init(&memcg->kmem, &parent->kmem);
@@ -6119,20 +6161,12 @@ mem_cgroup_css_alloc(struct cgroup *cont)
 		 * much sense so let cgroup subsystem know about this
 		 * unfortunate state in our controller.
 		 */
-		if (parent && parent != root_mem_cgroup)
+		if (parent != root_mem_cgroup)
 			mem_cgroup_subsys.broken_hierarchy = true;
 	}
-	memcg->last_scanned_node = MAX_NUMNODES;
-	INIT_LIST_HEAD(&memcg->oom_notify);
-
-	if (parent)
-		memcg->swappiness = mem_cgroup_swappiness(parent);
-	atomic_set(&memcg->refcnt, 1);
-	memcg->move_charge_at_immigrate = 0;
-	mutex_init(&memcg->thresholds_lock);
-	spin_lock_init(&memcg->move_lock);
 
 	error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
+	mutex_unlock(&memcg_create_mutex);
 	if (error) {
 		/*
 		 * We call put now because our (and parent's) refcnts
@@ -6140,12 +6174,10 @@ mem_cgroup_css_alloc(struct cgroup *cont)
 		 * call __mem_cgroup_free, so return directly
 		 */
 		mem_cgroup_put(memcg);
-		return ERR_PTR(error);
+		if (parent->use_hierarchy)
+			mem_cgroup_put(parent);
 	}
-	return &memcg->css;
-free_out:
-	__mem_cgroup_free(memcg);
-	return ERR_PTR(error);
+	return error;
 }
 
 static void mem_cgroup_css_offline(struct cgroup *cont)
@@ -6281,7 +6313,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
 	 * Because lookup_swap_cache() updates some statistics counter,
 	 * we call find_get_page() with swapper_space directly.
 	 */
-	page = find_get_page(&swapper_space, ent.val);
+	page = find_get_page(swap_address_space(ent), ent.val);
 	if (do_swap_account)
 		entry->val = ent.val;
 
@@ -6322,7 +6354,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
 		swp_entry_t swap = radix_to_swp_entry(page);
 		if (do_swap_account)
 			*entry = swap;
-		page = find_get_page(&swapper_space, swap.val);
+		page = find_get_page(swap_address_space(swap), swap.val);
 	}
 #endif
 	return page;
@@ -6532,8 +6564,15 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup,
 	struct task_struct *p = cgroup_taskset_first(tset);
 	int ret = 0;
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
+	unsigned long move_charge_at_immigrate;
 
-	if (memcg->move_charge_at_immigrate) {
+	/*
+	 * We are now committed to this value whatever it is. Changes in this
+	 * tunable will only affect upcoming migrations, not the current one.
+	 * So we need to save it, and keep it going.
+	 */
+	move_charge_at_immigrate  = memcg->move_charge_at_immigrate;
+	if (move_charge_at_immigrate) {
 		struct mm_struct *mm;
 		struct mem_cgroup *from = mem_cgroup_from_task(p);
 
@@ -6553,6 +6592,7 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup,
 			spin_lock(&mc.lock);
 			mc.from = from;
 			mc.to = memcg;
+			mc.immigrate_flags = move_charge_at_immigrate;
 			spin_unlock(&mc.lock);
 			/* We set mc.moving_task later */
 
@@ -6747,6 +6787,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
 	.name = "memory",
 	.subsys_id = mem_cgroup_subsys_id,
 	.css_alloc = mem_cgroup_css_alloc,
+	.css_online = mem_cgroup_css_online,
 	.css_offline = mem_cgroup_css_offline,
 	.css_free = mem_cgroup_css_free,
 	.can_attach = mem_cgroup_can_attach,
@@ -6757,19 +6798,6 @@ struct cgroup_subsys mem_cgroup_subsys = {
 	.use_id = 1,
 };
 
-/*
- * The rest of init is performed during ->css_alloc() for root css which
- * happens before initcalls.  hotcpu_notifier() can't be done together as
- * it would introduce circular locking by adding cgroup_lock -> cpu hotplug
- * dependency.  Do it from a subsys_initcall().
- */
-static int __init mem_cgroup_init(void)
-{
-	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
-	return 0;
-}
-subsys_initcall(mem_cgroup_init);
-
 #ifdef CONFIG_MEMCG_SWAP
 static int __init enable_swap_account(char *s)
 {
@@ -6782,4 +6810,39 @@ static int __init enable_swap_account(char *s)
 }
 __setup("swapaccount=", enable_swap_account);
 
+static void __init memsw_file_init(void)
+{
+	WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files));
+}
+
+static void __init enable_swap_cgroup(void)
+{
+	if (!mem_cgroup_disabled() && really_do_swap_account) {
+		do_swap_account = 1;
+		memsw_file_init();
+	}
+}
+
+#else
+static void __init enable_swap_cgroup(void)
+{
+}
 #endif
+
+/*
+ * subsys_initcall() for memory controller.
+ *
+ * Some parts like hotcpu_notifier() have to be initialized from this context
+ * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
+ * everything that doesn't depend on a specific mem_cgroup structure should
+ * be initialized from here.
+ */
+static int __init mem_cgroup_init(void)
+{
+	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
+	enable_swap_cgroup();
+	mem_cgroup_soft_limit_tree_init();
+	memcg_stock_init();
+	return 0;
+}
+subsys_initcall(mem_cgroup_init);
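
Not part of the patch: the memcontrol changes group into a few themes. struct mem_cgroup now ends in a flexible nodeinfo[] array and is sized at runtime from nr_node_ids via memcg_size(); the cgroup_lock()-based exclusion moves to a local memcg_create_mutex; css_alloc() is split so parent-dependent setup happens in the new css_online() callback; the memsw.* files are registered from mem_cgroup_init() with cgroup_add_cftypes() only when swap accounting is actually enabled; and the OOM report now dumps per-memcg statistics. A hedged sketch of the variable-size allocation idiom, with toy types standing in for the real memcg structures:

#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/nodemask.h>

struct toy_per_node { int stat; };

struct toy_cgroup {
	int something;
	/* must stay last, like mem_cgroup_lru_info in the real struct */
	struct toy_per_node nodeinfo[0];
};

static size_t toy_size(void)
{
	return sizeof(struct toy_cgroup) +
	       nr_node_ids * sizeof(struct toy_per_node);
}

static struct toy_cgroup *toy_alloc(void)
{
	size_t size = toy_size();

	/* small: kzalloc; spanning pages: vzalloc (mirrors mem_cgroup_alloc) */
	if (size < PAGE_SIZE)
		return kzalloc(size, GFP_KERNEL);
	return vzalloc(size);
}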

+ 124 - 78
mm/memory-failure.c

@@ -61,7 +61,7 @@ int sysctl_memory_failure_early_kill __read_mostly = 0;
 
 int sysctl_memory_failure_recovery __read_mostly = 1;
 
-atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
+atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
 
 #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
 
@@ -784,12 +784,12 @@ static struct page_state {
 	{ sc|dirty,	sc|dirty,	"dirty swapcache",	me_swapcache_dirty },
 	{ sc|dirty,	sc,		"clean swapcache",	me_swapcache_clean },
 
-	{ unevict|dirty, unevict|dirty,	"dirty unevictable LRU", me_pagecache_dirty },
-	{ unevict,	unevict,	"clean unevictable LRU", me_pagecache_clean },
-
 	{ mlock|dirty,	mlock|dirty,	"dirty mlocked LRU",	me_pagecache_dirty },
 	{ mlock,	mlock,		"clean mlocked LRU",	me_pagecache_clean },
 
+	{ unevict|dirty, unevict|dirty,	"dirty unevictable LRU", me_pagecache_dirty },
+	{ unevict,	unevict,	"clean unevictable LRU", me_pagecache_clean },
+
 	{ lru|dirty,	lru|dirty,	"dirty LRU",	me_pagecache_dirty },
 	{ lru|dirty,	lru,		"clean LRU",	me_pagecache_clean },
 
@@ -1021,6 +1021,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 	struct page *hpage;
 	int res;
 	unsigned int nr_pages;
+	unsigned long page_flags;
 
 	if (!sysctl_memory_failure_recovery)
 		panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -1039,8 +1040,18 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 		return 0;
 	}
 
-	nr_pages = 1 << compound_trans_order(hpage);
-	atomic_long_add(nr_pages, &mce_bad_pages);
+	/*
+	 * Currently errors on hugetlbfs pages are measured in hugepage units,
+	 * so nr_pages should be 1 << compound_order.  OTOH when errors are on
+	 * transparent hugepages, they are supposed to be split and error
+	 * measurement is done in normal page units.  So nr_pages should be one
+	 * in this case.
+	 */
+	if (PageHuge(p))
+		nr_pages = 1 << compound_order(hpage);
+	else /* normal page or thp */
+		nr_pages = 1;
+	atomic_long_add(nr_pages, &num_poisoned_pages);
 
 	/*
 	 * We need/can do nothing about count=0 pages.
@@ -1070,7 +1081,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 			if (!PageHWPoison(hpage)
 			    || (hwpoison_filter(p) && TestClearPageHWPoison(p))
 			    || (p != hpage && TestSetPageHWPoison(hpage))) {
-				atomic_long_sub(nr_pages, &mce_bad_pages);
+				atomic_long_sub(nr_pages, &num_poisoned_pages);
 				return 0;
 			}
 			set_page_hwpoison_huge_page(hpage);
@@ -1118,6 +1129,15 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 	 */
 	lock_page(hpage);
 
+	/*
+	 * We use page flags to determine what action should be taken, but
+	 * the flags can be modified by the error containment action.  One
+	 * example is an mlocked page, where PG_mlocked is cleared by
+	 * page_remove_rmap() in try_to_unmap_one(). So to determine page status
+	 * correctly, we save a copy of the page flags at this time.
+	 */
+	page_flags = p->flags;
+
 	/*
 	 * unpoison always clear PG_hwpoison inside page lock
 	 */
@@ -1128,7 +1148,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 	}
 	if (hwpoison_filter(p)) {
 		if (TestClearPageHWPoison(p))
-			atomic_long_sub(nr_pages, &mce_bad_pages);
+			atomic_long_sub(nr_pages, &num_poisoned_pages);
 		unlock_page(hpage);
 		put_page(hpage);
 		return 0;
@@ -1176,12 +1196,19 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 	}
 
 	res = -EBUSY;
-	for (ps = error_states;; ps++) {
-		if ((p->flags & ps->mask) == ps->res) {
-			res = page_action(ps, p, pfn);
+	/*
+	 * The first check uses the current page flags which may not have any
+	 * relevant information. The second check with the saved page flags is
+	 * carried out only if the first check can't determine the page status.
+	 */
+	for (ps = error_states;; ps++)
+		if ((p->flags & ps->mask) == ps->res)
 			break;
-		}
-	}
+	if (!ps->mask)
+		for (ps = error_states;; ps++)
+			if ((page_flags & ps->mask) == ps->res)
+				break;
+	res = page_action(ps, p, pfn);
 out:
 	unlock_page(hpage);
 	return res;
@@ -1323,7 +1350,7 @@ int unpoison_memory(unsigned long pfn)
 			return 0;
 		}
 		if (TestClearPageHWPoison(p))
-			atomic_long_sub(nr_pages, &mce_bad_pages);
+			atomic_long_sub(nr_pages, &num_poisoned_pages);
 		pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
 		return 0;
 	}
@@ -1337,7 +1364,7 @@ int unpoison_memory(unsigned long pfn)
 	 */
 	if (TestClearPageHWPoison(page)) {
 		pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
-		atomic_long_sub(nr_pages, &mce_bad_pages);
+		atomic_long_sub(nr_pages, &num_poisoned_pages);
 		freeit = 1;
 		if (PageHuge(page))
 			clear_page_hwpoison_huge_page(page);
@@ -1368,7 +1395,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x)
  * that is not free, and 1 for any other page type.
  * For 1 the page is returned with increased page count, otherwise not.
  */
-static int get_any_page(struct page *p, unsigned long pfn, int flags)
+static int __get_any_page(struct page *p, unsigned long pfn, int flags)
 {
 	int ret;
 
@@ -1393,11 +1420,9 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
 	if (!get_page_unless_zero(compound_head(p))) {
 		if (PageHuge(p)) {
 			pr_info("%s: %#lx free huge page\n", __func__, pfn);
-			ret = dequeue_hwpoisoned_huge_page(compound_head(p));
+			ret = 0;
 		} else if (is_free_buddy_page(p)) {
 			pr_info("%s: %#lx free buddy page\n", __func__, pfn);
-			/* Set hwpoison bit while page is still isolated */
-			SetPageHWPoison(p);
 			ret = 0;
 		} else {
 			pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
@@ -1413,43 +1438,68 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
 	return ret;
 }
 
+static int get_any_page(struct page *page, unsigned long pfn, int flags)
+{
+	int ret = __get_any_page(page, pfn, flags);
+
+	if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
+		/*
+		 * Try to free it.
+		 */
+		put_page(page);
+		shake_page(page, 1);
+
+		/*
+		 * Did it turn free?
+		 */
+		ret = __get_any_page(page, pfn, 0);
+		if (!PageLRU(page)) {
+			pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
+				pfn, page->flags);
+			return -EIO;
+		}
+	}
+	return ret;
+}
+
 static int soft_offline_huge_page(struct page *page, int flags)
 {
 	int ret;
 	unsigned long pfn = page_to_pfn(page);
 	struct page *hpage = compound_head(page);
 
-	ret = get_any_page(page, pfn, flags);
-	if (ret < 0)
-		return ret;
-	if (ret == 0)
-		goto done;
-
+	/*
+	 * This double-check of PageHWPoison is to avoid the race with
+	 * memory_failure(). See also comment in __soft_offline_page().
+	 */
+	lock_page(hpage);
 	if (PageHWPoison(hpage)) {
+		unlock_page(hpage);
 		put_page(hpage);
 		pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
 		return -EBUSY;
 	}
+	unlock_page(hpage);
 
 	/* Keep page count to indicate a given hugepage is isolated. */
-	ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false,
+	ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL,
 				MIGRATE_SYNC);
 	put_page(hpage);
 	if (ret) {
 		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
 			pfn, ret, page->flags);
-		return ret;
-	}
-done:
-	if (!PageHWPoison(hpage))
+	} else {
+		set_page_hwpoison_huge_page(hpage);
+		dequeue_hwpoisoned_huge_page(hpage);
 		atomic_long_add(1 << compound_trans_order(hpage),
-				&mce_bad_pages);
-	set_page_hwpoison_huge_page(hpage);
-	dequeue_hwpoisoned_huge_page(hpage);
+				&num_poisoned_pages);
+	}
 	/* keep elevated page count for bad page */
 	return ret;
 }
 
+static int __soft_offline_page(struct page *page, int flags);
+
 /**
  * soft_offline_page - Soft offline a page.
  * @page: page to offline
@@ -1478,9 +1528,11 @@ int soft_offline_page(struct page *page, int flags)
 	unsigned long pfn = page_to_pfn(page);
 	struct page *hpage = compound_trans_head(page);
 
-	if (PageHuge(page))
-		return soft_offline_huge_page(page, flags);
-	if (PageTransHuge(hpage)) {
+	if (PageHWPoison(page)) {
+		pr_info("soft offline: %#lx page already poisoned\n", pfn);
+		return -EBUSY;
+	}
+	if (!PageHuge(page) && PageTransHuge(hpage)) {
 		if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
 			pr_info("soft offline: %#lx: failed to split THP\n",
 				pfn);
@@ -1491,47 +1543,45 @@ int soft_offline_page(struct page *page, int flags)
 	ret = get_any_page(page, pfn, flags);
 	if (ret < 0)
 		return ret;
-	if (ret == 0)
-		goto done;
-
-	/*
-	 * Page cache page we can handle?
-	 */
-	if (!PageLRU(page)) {
-		/*
-		 * Try to free it.
-		 */
-		put_page(page);
-		shake_page(page, 1);
-
-		/*
-		 * Did it turn free?
-		 */
-		ret = get_any_page(page, pfn, 0);
-		if (ret < 0)
-			return ret;
-		if (ret == 0)
-			goto done;
-	}
-	if (!PageLRU(page)) {
-		pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
-			pfn, page->flags);
-		return -EIO;
+	if (ret) { /* for in-use pages */
+		if (PageHuge(page))
+			ret = soft_offline_huge_page(page, flags);
+		else
+			ret = __soft_offline_page(page, flags);
+	} else { /* for free pages */
+		if (PageHuge(page)) {
+			set_page_hwpoison_huge_page(hpage);
+			dequeue_hwpoisoned_huge_page(hpage);
+			atomic_long_add(1 << compound_trans_order(hpage),
+					&num_poisoned_pages);
+		} else {
+			SetPageHWPoison(page);
+			atomic_long_inc(&num_poisoned_pages);
+		}
 	}
+	/* keep elevated page count for bad page */
+	return ret;
+}
 
-	lock_page(page);
-	wait_on_page_writeback(page);
+static int __soft_offline_page(struct page *page, int flags)
+{
+	int ret;
+	unsigned long pfn = page_to_pfn(page);
 
 	/*
-	 * Synchronized using the page lock with memory_failure()
+	 * Check PageHWPoison again inside page lock because PageHWPoison
+	 * is set by memory_failure() outside page lock. Note that
+	 * memory_failure() also double-checks PageHWPoison inside page lock,
+	 * so there's no race between soft_offline_page() and memory_failure().
 	 */
+	lock_page(page);
+	wait_on_page_writeback(page);
 	if (PageHWPoison(page)) {
 		unlock_page(page);
 		put_page(page);
 		pr_info("soft offline: %#lx page already poisoned\n", pfn);
 		return -EBUSY;
 	}
-
 	/*
 	 * Try to invalidate first. This should work for
 	 * non dirty unmapped page cache pages.
@@ -1544,9 +1594,10 @@ int soft_offline_page(struct page *page, int flags)
 	 */
 	if (ret == 1) {
 		put_page(page);
-		ret = 0;
 		pr_info("soft_offline: %#lx: invalidated\n", pfn);
-		goto done;
+		SetPageHWPoison(page);
+		atomic_long_inc(&num_poisoned_pages);
+		return 0;
 	}
 
 	/*
@@ -1563,28 +1614,23 @@ int soft_offline_page(struct page *page, int flags)
 	if (!ret) {
 		LIST_HEAD(pagelist);
 		inc_zone_page_state(page, NR_ISOLATED_ANON +
-					    page_is_file_cache(page));
+					page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
 		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
-							false, MIGRATE_SYNC,
-							MR_MEMORY_FAILURE);
+					MIGRATE_SYNC, MR_MEMORY_FAILURE);
 		if (ret) {
 			putback_lru_pages(&pagelist);
 			pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
 				pfn, ret, page->flags);
 			if (ret > 0)
 				ret = -EIO;
+		} else {
+			SetPageHWPoison(page);
+			atomic_long_inc(&num_poisoned_pages);
 		}
 	} else {
 		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
 			pfn, ret, page_count(page), page->flags);
 	}
-	if (ret)
-		return ret;
-
-done:
-	atomic_long_add(1, &mce_bad_pages);
-	SetPageHWPoison(page);
-	/* keep elevated page count for bad page */
 	return ret;
 }
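
Not part of the patch: besides renaming mce_bad_pages to the more accurate num_poisoned_pages (errors can come from sources other than MCE), these hunks make hugetlbfs errors count in hugepage units while THP errors count as single pages, and split soft_offline_page() so the PageHWPoison checks happen under the page lock, closing a race with memory_failure(). The soft-offline path is normally exercised from userspace through madvise(); a hedged sketch, assuming CONFIG_MEMORY_FAILURE and, in practice, CAP_SYS_ADMIN:

/*
 * Illustrative only: migrate the data of one anonymous page away and
 * poison the old page frame. The constant is defined locally in case the
 * libc headers predate it (value from asm-generic/mman-common.h).
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101
#endif

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	p[0] = 1;	/* make sure a page frame actually backs the mapping */

	if (madvise(p, psz, MADV_SOFT_OFFLINE) < 0)
		perror("madvise(MADV_SOFT_OFFLINE)");
	else
		printf("soft-offlined; HardwareCorrupted in /proc/meminfo grows\n");

	return 0;
}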

+ 68 - 57
mm/memory.c

@@ -69,6 +69,10 @@
 
 #include "internal.h"
 
+#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
+#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid.
+#endif
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
@@ -1458,10 +1462,11 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 EXPORT_SYMBOL_GPL(zap_vma_ptes);
 
 /**
- * follow_page - look up a page descriptor from a user-virtual address
+ * follow_page_mask - look up a page descriptor from a user-virtual address
  * @vma: vm_area_struct mapping @address
  * @address: virtual address to look up
  * @flags: flags modifying lookup behaviour
+ * @page_mask: on output, *page_mask is set according to the size of the page
  *
  * @flags can have FOLL_ flags set, defined in <linux/mm.h>
  *
@@ -1469,8 +1474,9 @@ EXPORT_SYMBOL_GPL(zap_vma_ptes);
  * an error pointer if there is a mapping to something not represented
  * by a page descriptor (see also vm_normal_page()).
  */
-struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
-			unsigned int flags)
+struct page *follow_page_mask(struct vm_area_struct *vma,
+			      unsigned long address, unsigned int flags,
+			      unsigned int *page_mask)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -1480,6 +1486,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 	struct page *page;
 	struct mm_struct *mm = vma->vm_mm;
 
+	*page_mask = 0;
+
 	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
 	if (!IS_ERR(page)) {
 		BUG_ON(flags & FOLL_GET);
@@ -1526,6 +1534,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 				page = follow_trans_huge_pmd(vma, address,
 							     pmd, flags);
 				spin_unlock(&mm->page_table_lock);
+				*page_mask = HPAGE_PMD_NR - 1;
 				goto out;
 			}
 		} else
@@ -1539,8 +1548,24 @@ split_fallthrough:
 	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 
 	pte = *ptep;
-	if (!pte_present(pte))
-		goto no_page;
+	if (!pte_present(pte)) {
+		swp_entry_t entry;
+		/*
+		 * KSM's break_ksm() relies upon recognizing a ksm page
+		 * even while it is being migrated, so for that case we
+		 * need migration_entry_wait().
+		 */
+		if (likely(!(flags & FOLL_MIGRATION)))
+			goto no_page;
+		if (pte_none(pte) || pte_file(pte))
+			goto no_page;
+		entry = pte_to_swp_entry(pte);
+		if (!is_migration_entry(entry))
+			goto no_page;
+		pte_unmap_unlock(ptep, ptl);
+		migration_entry_wait(mm, pmd, address);
+		goto split_fallthrough;
+	}
 	if ((flags & FOLL_NUMA) && pte_numa(pte))
 		goto no_page;
 	if ((flags & FOLL_WRITE) && !pte_write(pte))
@@ -1673,15 +1698,16 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
  * instead of __get_user_pages. __get_user_pages should be used only if
  * you need some special @gup_flags.
  */
-int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		     unsigned long start, int nr_pages, unsigned int gup_flags,
-		     struct page **pages, struct vm_area_struct **vmas,
-		     int *nonblocking)
+long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		unsigned long start, unsigned long nr_pages,
+		unsigned int gup_flags, struct page **pages,
+		struct vm_area_struct **vmas, int *nonblocking)
 {
-	int i;
+	long i;
 	unsigned long vm_flags;
+	unsigned int page_mask;
 
-	if (nr_pages <= 0)
+	if (!nr_pages)
 		return 0;
 
 	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
@@ -1757,6 +1783,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				get_page(page);
 			}
 			pte_unmap(pte);
+			page_mask = 0;
 			goto next_page;
 		}
 
@@ -1774,6 +1801,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		do {
 			struct page *page;
 			unsigned int foll_flags = gup_flags;
+			unsigned int page_increm;
 
 			/*
 			 * If we have a pending SIGKILL, don't keep faulting
@@ -1783,7 +1811,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				return i ? i : -ERESTARTSYS;
 
 			cond_resched();
-			while (!(page = follow_page(vma, start, foll_flags))) {
+			while (!(page = follow_page_mask(vma, start,
+						foll_flags, &page_mask))) {
 				int ret;
 				unsigned int fault_flags = 0;
 
@@ -1857,13 +1886,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 
 				flush_anon_page(vma, page, start);
 				flush_dcache_page(page);
+				page_mask = 0;
 			}
 next_page:
-			if (vmas)
+			if (vmas) {
 				vmas[i] = vma;
-			i++;
-			start += PAGE_SIZE;
-			nr_pages--;
+				page_mask = 0;
+			}
+			page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
+			if (page_increm > nr_pages)
+				page_increm = nr_pages;
+			i += page_increm;
+			start += page_increm * PAGE_SIZE;
+			nr_pages -= page_increm;
 		} while (nr_pages && start < vma->vm_end);
 	} while (nr_pages);
 	return i;
@@ -1977,9 +2012,9 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
  *
  * See also get_user_pages_fast, for performance critical applications.
  */
-int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		unsigned long start, int nr_pages, int write, int force,
-		struct page **pages, struct vm_area_struct **vmas)
+long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		unsigned long start, unsigned long nr_pages, int write,
+		int force, struct page **pages, struct vm_area_struct **vmas)
 {
 	int flags = FOLL_TOUCH;
 
@@ -2919,7 +2954,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned int flags, pte_t orig_pte)
 {
 	spinlock_t *ptl;
-	struct page *page, *swapcache = NULL;
+	struct page *page, *swapcache;
 	swp_entry_t entry;
 	pte_t pte;
 	int locked;
@@ -2970,9 +3005,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 */
 		ret = VM_FAULT_HWPOISON;
 		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+		swapcache = page;
 		goto out_release;
 	}
 
+	swapcache = page;
 	locked = lock_page_or_retry(page, mm, flags);
 
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2990,16 +3027,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
 		goto out_page;
 
-	if (ksm_might_need_to_copy(page, vma, address)) {
-		swapcache = page;
-		page = ksm_does_need_to_copy(page, vma, address);
-
-		if (unlikely(!page)) {
-			ret = VM_FAULT_OOM;
-			page = swapcache;
-			swapcache = NULL;
-			goto out_page;
-		}
+	page = ksm_might_need_to_copy(page, vma, address);
+	if (unlikely(!page)) {
+		ret = VM_FAULT_OOM;
+		page = swapcache;
+		goto out_page;
 	}
 
 	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
@@ -3044,7 +3076,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 	flush_icache_page(vma, page);
 	set_pte_at(mm, address, page_table, pte);
-	do_page_add_anon_rmap(page, vma, address, exclusive);
+	if (page == swapcache)
+		do_page_add_anon_rmap(page, vma, address, exclusive);
+	else /* ksm created a completely new copy */
+		page_add_new_anon_rmap(page, vma, address);
 	/* It's better to call commit-charge after rmap is established */
 	mem_cgroup_commit_charge_swapin(page, ptr);
 
@@ -3052,7 +3087,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
 		try_to_free_swap(page);
 	unlock_page(page);
-	if (swapcache) {
+	if (page != swapcache) {
 		/*
 		 * Hold the lock to avoid the swap entry to be reused
 		 * until we take the PT lock for the pte_same() check
@@ -3085,7 +3120,7 @@ out_page:
 	unlock_page(page);
 out_release:
 	page_cache_release(page);
-	if (swapcache) {
+	if (page != swapcache) {
 		unlock_page(swapcache);
 		page_cache_release(swapcache);
 	}
@@ -3821,30 +3856,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 }
 #endif /* __PAGETABLE_PMD_FOLDED */
 
-int make_pages_present(unsigned long addr, unsigned long end)
-{
-	int ret, len, write;
-	struct vm_area_struct * vma;
-
-	vma = find_vma(current->mm, addr);
-	if (!vma)
-		return -ENOMEM;
-	/*
-	 * We want to touch writable mappings with a write fault in order
-	 * to break COW, except for shared mappings because these don't COW
-	 * and we would not want to dirty them for nothing.
-	 */
-	write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
-	BUG_ON(addr >= end);
-	BUG_ON(end > vma->vm_end);
-	len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
-	ret = get_user_pages(current, current->mm, addr,
-			len, write, 0, NULL, NULL);
-	if (ret < 0)
-		return ret;
-	return ret == len ? 0 : -EFAULT;
-}
-
 #if !defined(__HAVE_ARCH_GATE_AREA)
 
 #if defined(AT_SYSINFO_EHDR)
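
The page_mask plumbing above is what lets __get_user_pages() step over a whole transparent huge page in one loop iteration instead of 512 separate ones: follow_page_mask() reports HPAGE_PMD_NR - 1 for a THP, and the next_page arithmetic turns that into the number of subpages remaining from the current address. A minimal userspace sketch of that arithmetic, assuming 4 KiB base pages and a 512-page THP (the macro values and example address are illustrative, not taken from the patch):

    #include <stdio.h>

    #define PAGE_SHIFT   12
    #define PAGE_SIZE    (1UL << PAGE_SHIFT)
    #define HPAGE_PMD_NR 512                /* 2 MiB THP / 4 KiB base pages */

    int main(void)
    {
        /* pretend gup starts 5 subpages into a 2 MiB THP */
        unsigned long start = 0x200000UL + 5 * PAGE_SIZE;
        unsigned int page_mask = HPAGE_PMD_NR - 1;  /* as set by follow_page_mask() */
        unsigned long page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);

        /* 512 - 5 = 507 subpages remain in this huge page, so i, start and
         * nr_pages can all be advanced by 507 in a single loop iteration */
        printf("page_increm = %lu\n", page_increm);
        return 0;
    }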

+ 494 - 59
mm/memory_hotplug.c

@@ -29,6 +29,7 @@
 #include <linux/suspend.h>
 #include <linux/mm_inline.h>
 #include <linux/firmware-map.h>
+#include <linux/stop_machine.h>
 
 #include <asm/tlbflush.h>
 
@@ -91,9 +92,8 @@ static void release_memory_resource(struct resource *res)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
-static void get_page_bootmem(unsigned long info,  struct page *page,
-			     unsigned long type)
+void get_page_bootmem(unsigned long info,  struct page *page,
+		      unsigned long type)
 {
 	page->lru.next = (struct list_head *) type;
 	SetPagePrivate(page);
@@ -124,10 +124,13 @@ void __ref put_page_bootmem(struct page *page)
 		mutex_lock(&ppb_lock);
 		__free_pages_bootmem(page, 0);
 		mutex_unlock(&ppb_lock);
+		totalram_pages++;
 	}
 
 }
 
+#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
+#ifndef CONFIG_SPARSEMEM_VMEMMAP
 static void register_page_bootmem_info_section(unsigned long start_pfn)
 {
 	unsigned long *usemap, mapsize, section_nr, i;
@@ -161,6 +164,32 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
 		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
 
 }
+#else /* CONFIG_SPARSEMEM_VMEMMAP */
+static void register_page_bootmem_info_section(unsigned long start_pfn)
+{
+	unsigned long *usemap, mapsize, section_nr, i;
+	struct mem_section *ms;
+	struct page *page, *memmap;
+
+	if (!pfn_valid(start_pfn))
+		return;
+
+	section_nr = pfn_to_section_nr(start_pfn);
+	ms = __nr_to_section(section_nr);
+
+	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+
+	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
+
+	usemap = __nr_to_section(section_nr)->pageblock_flags;
+	page = virt_to_page(usemap);
+
+	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+
+	for (i = 0; i < mapsize; i++, page++)
+		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
+}
+#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
 
 void register_page_bootmem_info_node(struct pglist_data *pgdat)
 {
@@ -189,7 +218,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
 	}
 
 	pfn = pgdat->node_start_pfn;
-	end_pfn = pfn + pgdat->node_spanned_pages;
+	end_pfn = pgdat_end_pfn(pgdat);
 
 	/* register_section info */
 	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
@@ -203,7 +232,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
 			register_page_bootmem_info_section(pfn);
 	}
 }
-#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
+#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
 
 static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
 			   unsigned long end_pfn)
@@ -253,6 +282,17 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
 		set_page_links(pfn_to_page(pfn), zid, nid, pfn);
 }
 
+/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
+ * alloc_bootmem_node_nopanic() */
+static int __ref ensure_zone_is_initialized(struct zone *zone,
+			unsigned long start_pfn, unsigned long num_pages)
+{
+	if (!zone_is_initialized(zone))
+		return init_currently_empty_zone(zone, start_pfn, num_pages,
+						 MEMMAP_HOTPLUG);
+	return 0;
+}
+
 static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
 		unsigned long start_pfn, unsigned long end_pfn)
 {
@@ -260,17 +300,14 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
 	unsigned long flags;
 	unsigned long z1_start_pfn;
 
-	if (!z1->wait_table) {
-		ret = init_currently_empty_zone(z1, start_pfn,
-			end_pfn - start_pfn, MEMMAP_HOTPLUG);
-		if (ret)
-			return ret;
-	}
+	ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
+	if (ret)
+		return ret;
 
 	pgdat_resize_lock(z1->zone_pgdat, &flags);
 
 	/* can't move pfns which are higher than @z2 */
-	if (end_pfn > z2->zone_start_pfn + z2->spanned_pages)
+	if (end_pfn > zone_end_pfn(z2))
 		goto out_fail;
 	/* the part to move out must be at the leftmost of @z2 */
 	if (start_pfn > z2->zone_start_pfn)
@@ -286,7 +323,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
 		z1_start_pfn = start_pfn;
 
 	resize_zone(z1, z1_start_pfn, end_pfn);
-	resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages);
+	resize_zone(z2, end_pfn, zone_end_pfn(z2));
 
 	pgdat_resize_unlock(z1->zone_pgdat, &flags);
 
@@ -305,12 +342,9 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
 	unsigned long flags;
 	unsigned long z2_end_pfn;
 
-	if (!z2->wait_table) {
-		ret = init_currently_empty_zone(z2, start_pfn,
-			end_pfn - start_pfn, MEMMAP_HOTPLUG);
-		if (ret)
-			return ret;
-	}
+	ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
+	if (ret)
+		return ret;
 
 	pgdat_resize_lock(z1->zone_pgdat, &flags);
 
@@ -318,15 +352,15 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
 	if (z1->zone_start_pfn > start_pfn)
 		goto out_fail;
 	/* the part to move out must be at the rightmost of @z1 */
-	if (z1->zone_start_pfn + z1->spanned_pages >  end_pfn)
+	if (zone_end_pfn(z1) >  end_pfn)
 		goto out_fail;
 	/* must include/overlap */
-	if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages)
+	if (start_pfn >= zone_end_pfn(z1))
 		goto out_fail;
 
 	/* use end_pfn for z2's end_pfn if z2 is empty */
 	if (z2->spanned_pages)
-		z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages;
+		z2_end_pfn = zone_end_pfn(z2);
 	else
 		z2_end_pfn = end_pfn;
 
@@ -363,16 +397,13 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 	int nid = pgdat->node_id;
 	int zone_type;
 	unsigned long flags;
+	int ret;
 
 	zone_type = zone - pgdat->node_zones;
-	if (!zone->wait_table) {
-		int ret;
+	ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
+	if (ret)
+		return ret;
 
-		ret = init_currently_empty_zone(zone, phys_start_pfn,
-						nr_pages, MEMMAP_HOTPLUG);
-		if (ret)
-			return ret;
-	}
 	pgdat_resize_lock(zone->zone_pgdat, &flags);
 	grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
 	grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
@@ -405,20 +436,211 @@ static int __meminit __add_section(int nid, struct zone *zone,
 	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
 }
 
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static int __remove_section(struct zone *zone, struct mem_section *ms)
+/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
+static int find_smallest_section_pfn(int nid, struct zone *zone,
+				     unsigned long start_pfn,
+				     unsigned long end_pfn)
+{
+	struct mem_section *ms;
+
+	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
+		ms = __pfn_to_section(start_pfn);
+
+		if (unlikely(!valid_section(ms)))
+			continue;
+
+		if (unlikely(pfn_to_nid(start_pfn) != nid))
+			continue;
+
+		if (zone && zone != page_zone(pfn_to_page(start_pfn)))
+			continue;
+
+		return start_pfn;
+	}
+
+	return 0;
+}
+
+/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
+static int find_biggest_section_pfn(int nid, struct zone *zone,
+				    unsigned long start_pfn,
+				    unsigned long end_pfn)
+{
+	struct mem_section *ms;
+	unsigned long pfn;
+
+	/* pfn is the end pfn of a memory section. */
+	pfn = end_pfn - 1;
+	for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
+		ms = __pfn_to_section(pfn);
+
+		if (unlikely(!valid_section(ms)))
+			continue;
+
+		if (unlikely(pfn_to_nid(pfn) != nid))
+			continue;
+
+		if (zone && zone != page_zone(pfn_to_page(pfn)))
+			continue;
+
+		return pfn;
+	}
+
+	return 0;
+}
+
+static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
+			     unsigned long end_pfn)
 {
+	unsigned long zone_start_pfn =  zone->zone_start_pfn;
+	unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+	unsigned long pfn;
+	struct mem_section *ms;
+	int nid = zone_to_nid(zone);
+
+	zone_span_writelock(zone);
+	if (zone_start_pfn == start_pfn) {
+		/*
+		 * If this section is the smallest section in the zone, we need
+		 * to shrink zone->zone_start_pfn and zone->spanned_pages.
+		 * In this case, find the second smallest valid mem_section
+		 * for shrinking the zone.
+		 */
+		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
+						zone_end_pfn);
+		if (pfn) {
+			zone->zone_start_pfn = pfn;
+			zone->spanned_pages = zone_end_pfn - pfn;
+		}
+	} else if (zone_end_pfn == end_pfn) {
+		/*
+		 * If this section is the biggest section in the zone, we only
+		 * need to shrink zone->spanned_pages.
+		 * In this case, find the second biggest valid mem_section for
+		 * shrinking the zone.
+		 */
+		pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
+					       start_pfn);
+		if (pfn)
+			zone->spanned_pages = pfn - zone_start_pfn + 1;
+	}
+
 	/*
-	 * XXX: Freeing memmap with vmemmap is not implement yet.
-	 *      This should be removed later.
+	 * If the section is neither the biggest nor the smallest mem_section
+	 * in the zone, it only creates a hole in the zone, so we need not
+	 * change the zone itself. But the zone might now consist of nothing
+	 * but holes, so check whether any valid section remains.
 	 */
-	return -EBUSY;
+	pfn = zone_start_pfn;
+	for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
+		ms = __pfn_to_section(pfn);
+
+		if (unlikely(!valid_section(ms)))
+			continue;
+
+		if (page_zone(pfn_to_page(pfn)) != zone)
+			continue;
+
+		 /* If this is the section being removed, continue the loop */
+		if (start_pfn == pfn)
+			continue;
+
+		/* We found another valid section, so there is nothing to do */
+		zone_span_writeunlock(zone);
+		return;
+	}
+
+	/* The zone has no valid section */
+	zone->zone_start_pfn = 0;
+	zone->spanned_pages = 0;
+	zone_span_writeunlock(zone);
 }
-#else
-static int __remove_section(struct zone *zone, struct mem_section *ms)
+
+static void shrink_pgdat_span(struct pglist_data *pgdat,
+			      unsigned long start_pfn, unsigned long end_pfn)
+{
+	unsigned long pgdat_start_pfn =  pgdat->node_start_pfn;
+	unsigned long pgdat_end_pfn =
+		pgdat->node_start_pfn + pgdat->node_spanned_pages;
+	unsigned long pfn;
+	struct mem_section *ms;
+	int nid = pgdat->node_id;
+
+	if (pgdat_start_pfn == start_pfn) {
+		/*
+		 * If this section is the smallest section in the pgdat, we
+		 * need to shrink pgdat->node_start_pfn and
+		 * pgdat->node_spanned_pages.
+		 * In this case, find the second smallest valid mem_section
+		 * for shrinking the pgdat.
+		 */
+		pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
+						pgdat_end_pfn);
+		if (pfn) {
+			pgdat->node_start_pfn = pfn;
+			pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
+		}
+	} else if (pgdat_end_pfn == end_pfn) {
+		/*
+		 * If this section is the biggest section in the pgdat, we only
+		 * need to shrink pgdat->node_spanned_pages.
+		 * In this case, find the second biggest valid mem_section for
+		 * shrinking the pgdat.
+		 */
+		pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
+					       start_pfn);
+		if (pfn)
+			pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
+	}
+
+	/*
+	 * If the section is neither the biggest nor the smallest mem_section
+	 * in the pgdat, it only creates a hole in the pgdat, so we need not
+	 * change the pgdat itself.
+	 * But the pgdat might now consist of nothing but holes, so check
+	 * whether any valid section remains.
+	 */
+	pfn = pgdat_start_pfn;
+	for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
+		ms = __pfn_to_section(pfn);
+
+		if (unlikely(!valid_section(ms)))
+			continue;
+
+		if (pfn_to_nid(pfn) != nid)
+			continue;
+
+		 /* If this is the section being removed, continue the loop */
+		if (start_pfn == pfn)
+			continue;
+
+		/* We found another valid section, so there is nothing to do */
+		return;
+	}
+
+	/* The pgdat has no valid section */
+	pgdat->node_start_pfn = 0;
+	pgdat->node_spanned_pages = 0;
+}
+
+static void __remove_zone(struct zone *zone, unsigned long start_pfn)
 {
-	unsigned long flags;
 	struct pglist_data *pgdat = zone->zone_pgdat;
+	int nr_pages = PAGES_PER_SECTION;
+	int zone_type;
+	unsigned long flags;
+
+	zone_type = zone - pgdat->node_zones;
+
+	pgdat_resize_lock(zone->zone_pgdat, &flags);
+	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
+	shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
+	pgdat_resize_unlock(zone->zone_pgdat, &flags);
+}
+
+static int __remove_section(struct zone *zone, struct mem_section *ms)
+{
+	unsigned long start_pfn;
+	int scn_nr;
 	int ret = -EINVAL;
 
 	if (!valid_section(ms))
@@ -428,12 +650,13 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
 	if (ret)
 		return ret;
 
-	pgdat_resize_lock(pgdat, &flags);
+	scn_nr = __section_nr(ms);
+	start_pfn = section_nr_to_pfn(scn_nr);
+	__remove_zone(zone, start_pfn);
+
 	sparse_remove_one_section(zone, ms);
-	pgdat_resize_unlock(pgdat, &flags);
 	return 0;
 }
-#endif
 
 /*
  * Reasonably generic function for adding memory.  It is
@@ -797,11 +1020,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 	unsigned long zholes_size[MAX_NR_ZONES] = {0};
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 
-	pgdat = arch_alloc_nodedata(nid);
-	if (!pgdat)
-		return NULL;
+	pgdat = NODE_DATA(nid);
+	if (!pgdat) {
+		pgdat = arch_alloc_nodedata(nid);
+		if (!pgdat)
+			return NULL;
 
-	arch_refresh_nodedata(nid, pgdat);
+		arch_refresh_nodedata(nid, pgdat);
+	}
 
 	/* we can use NODE_DATA(nid) from here */
 
@@ -854,7 +1080,8 @@ out:
 int __ref add_memory(int nid, u64 start, u64 size)
 {
 	pg_data_t *pgdat = NULL;
-	int new_pgdat = 0;
+	bool new_pgdat;
+	bool new_node;
 	struct resource *res;
 	int ret;
 
@@ -865,12 +1092,16 @@ int __ref add_memory(int nid, u64 start, u64 size)
 	if (!res)
 		goto out;
 
-	if (!node_online(nid)) {
+	{	/* Stupid hack to suppress address-never-null warning */
+		void *p = NODE_DATA(nid);
+		new_pgdat = !p;
+	}
+	new_node = !node_online(nid);
+	if (new_node) {
 		pgdat = hotadd_new_pgdat(nid, start);
 		ret = -ENOMEM;
 		if (!pgdat)
 			goto error;
-		new_pgdat = 1;
 	}
 
 	/* call arch's memory hotadd */
@@ -882,7 +1113,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
 	/* we online node here. we can't roll back from here. */
 	node_set_online(nid);
 
-	if (new_pgdat) {
+	if (new_node) {
 		ret = register_one_node(nid);
 		/*
 		 * If sysfs file of new node can't create, cpu on the node
@@ -901,8 +1132,7 @@ error:
 	/* rollback pgdat allocation and others */
 	if (new_pgdat)
 		rollback_node_hotadd(nid, pgdat);
-	if (res)
-		release_memory_resource(res);
+	release_memory_resource(res);
 
 out:
 	unlock_memory_hotplug();
@@ -1058,8 +1288,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		 * migrate_pages returns # of failed pages.
 		 */
 		ret = migrate_pages(&source, alloc_migrate_target, 0,
-							true, MIGRATE_SYNC,
-							MR_MEMORY_HOTPLUG);
+					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
 		if (ret)
 			putback_lru_pages(&source);
 	}
@@ -1381,17 +1610,26 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 	return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
 }
 
-int remove_memory(u64 start, u64 size)
+/**
+ * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
+ * @start_pfn: start pfn of the memory range
+ * @end_pfn: end pfn of the memory range
+ * @arg: argument passed to func
+ * @func: callback for each memory section walked
+ *
+ * This function walks through all present mem sections in the range
+ * [start_pfn, end_pfn) and calls func on each mem section.
+ *
+ * Returns the return value of func.
+ */
+static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
+		void *arg, int (*func)(struct memory_block *, void *))
 {
 	struct memory_block *mem = NULL;
 	struct mem_section *section;
-	unsigned long start_pfn, end_pfn;
 	unsigned long pfn, section_nr;
 	int ret;
 
-	start_pfn = PFN_DOWN(start);
-	end_pfn = start_pfn + PFN_DOWN(size);
-
 	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
 		section_nr = pfn_to_section_nr(pfn);
 		if (!present_section_nr(section_nr))
@@ -1408,7 +1646,7 @@ int remove_memory(u64 start, u64 size)
 		if (!mem)
 			continue;
 
-		ret = offline_memory_block(mem);
+		ret = func(mem, arg);
 		if (ret) {
 			kobject_put(&mem->dev.kobj);
 			return ret;
@@ -1420,12 +1658,209 @@ int remove_memory(u64 start, u64 size)
 
 	return 0;
 }
+
+/**
+ * offline_memory_block_cb - callback function for offlining memory block
+ * @mem: the memory block to be offlined
+ * @arg: buffer to hold error msg
+ *
+ * Always return 0, and put the error msg in arg if any.
+ */
+static int offline_memory_block_cb(struct memory_block *mem, void *arg)
+{
+	int *ret = arg;
+	int error = offline_memory_block(mem);
+
+	if (error != 0 && *ret == 0)
+		*ret = error;
+
+	return 0;
+}
+
+static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
+{
+	int ret = !is_memblock_offlined(mem);
+
+	if (unlikely(ret))
+		pr_warn("removing memory fails, because memory "
+			"[%#010llx-%#010llx] is onlined\n",
+			PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)),
+			PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1);
+
+	return ret;
+}
+
+static int check_cpu_on_node(void *data)
+{
+	struct pglist_data *pgdat = data;
+	int cpu;
+
+	for_each_present_cpu(cpu) {
+		if (cpu_to_node(cpu) == pgdat->node_id)
+			/*
+			 * A CPU on this node has not been removed yet, so we
+			 * can't offline this node now.
+			 */
+			return -EBUSY;
+	}
+
+	return 0;
+}
+
+static void unmap_cpu_on_node(void *data)
+{
+#ifdef CONFIG_ACPI_NUMA
+	struct pglist_data *pgdat = data;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		if (cpu_to_node(cpu) == pgdat->node_id)
+			numa_clear_node(cpu);
+#endif
+}
+
+static int check_and_unmap_cpu_on_node(void *data)
+{
+	int ret = check_cpu_on_node(data);
+
+	if (ret)
+		return ret;
+
+	/*
+	 * the node will be offlined when we come here, so we can clear
+	 * the cpu_to_node() now.
+	 */
+
+	unmap_cpu_on_node(data);
+	return 0;
+}
+
+/* offline the node if all memory sections of this node are removed */
+void try_offline_node(int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+	unsigned long start_pfn = pgdat->node_start_pfn;
+	unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
+	unsigned long pfn;
+	struct page *pgdat_page = virt_to_page(pgdat);
+	int i;
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+		unsigned long section_nr = pfn_to_section_nr(pfn);
+
+		if (!present_section_nr(section_nr))
+			continue;
+
+		if (pfn_to_nid(pfn) != nid)
+			continue;
+
+		/*
+		 * Some memory sections of this node have not been removed,
+		 * so we can't offline the node now.
+		 */
+		return;
+	}
+
+	if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL))
+		return;
+
+	/*
+	 * All memory and CPUs of this node have been removed, so we can
+	 * offline the node now.
+	 */
+	node_set_offline(nid);
+	unregister_one_node(nid);
+
+	if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
+		/* node data is allocated from boot memory */
+		return;
+
+	/* free waittable in each zone */
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		struct zone *zone = pgdat->node_zones + i;
+
+		if (zone->wait_table)
+			vfree(zone->wait_table);
+	}
+
+	/*
+	 * Since there is no way to guarantee that the address of pgdat/zone
+	 * is not on the stack of any kernel thread or used by other kernel
+	 * objects without reference counting or another synchronizing method,
+	 * do not reset node_data and do not free pgdat here. Just reset it to
+	 * 0 and reuse the memory when the node is onlined again.
+	 */
+	memset(pgdat, 0, sizeof(*pgdat));
+}
+EXPORT_SYMBOL(try_offline_node);
+
+int __ref remove_memory(int nid, u64 start, u64 size)
+{
+	unsigned long start_pfn, end_pfn;
+	int ret = 0;
+	int retry = 1;
+
+	start_pfn = PFN_DOWN(start);
+	end_pfn = start_pfn + PFN_DOWN(size);
+
+	/*
+	 * When CONFIG_MEMCG is on, one memory block may be used by other
+	 * blocks to store page cgroups when onlining pages. But we don't know
+	 * in what order pages are onlined. So we iterate twice to offline
+	 * memory:
+	 * 1st pass: offline every non-primary memory block.
+	 * 2nd pass: offline the primary (i.e. first added) memory block.
+	 */
+repeat:
+	walk_memory_range(start_pfn, end_pfn, &ret,
+			  offline_memory_block_cb);
+	if (ret) {
+		if (!retry)
+			return ret;
+
+		retry = 0;
+		ret = 0;
+		goto repeat;
+	}
+
+	lock_memory_hotplug();
+
+	/*
+	 * We have offlined all memory blocks like this:
+	 *   1. lock memory hotplug
+	 *   2. offline a memory block
+	 *   3. unlock memory hotplug
+	 *
+	 * Steps 1-3 are repeated for each memory block. All memory blocks
+	 * must be offlined before removing memory, but since the lock is not
+	 * held across the whole operation, we must check here whether all
+	 * memory blocks really are offlined.
+	 */
+
+	ret = walk_memory_range(start_pfn, end_pfn, NULL,
+				is_memblock_offlined_cb);
+	if (ret) {
+		unlock_memory_hotplug();
+		return ret;
+	}
+
+	/* remove memmap entry */
+	firmware_map_remove(start, start + size, "System RAM");
+
+	arch_remove_memory(start, size);
+
+	try_offline_node(nid);
+
+	unlock_memory_hotplug();
+
+	return 0;
+}
 #else
 int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 {
 	return -EINVAL;
 }
-int remove_memory(u64 start, u64 size)
+int remove_memory(int nid, u64 start, u64 size)
 {
 	return -EINVAL;
 }
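
The new remove_memory() above is really walk_memory_range() plus two small callbacks: a first pass that tries to offline every block while only recording the first error, and a second pass that verifies every block really is offline before the memmap and firmware map entries are torn down. A rough userspace sketch of that walk-with-callback shape (the struct, helpers and the simulated failure are stand-ins, not kernel API):

    #include <stdio.h>

    /* toy stand-ins: not the kernel's types or APIs */
    struct memory_block { int id; int offlined; };

    static int walk_blocks(struct memory_block *blk, int nr, void *arg,
                           int (*func)(struct memory_block *, void *))
    {
        int i, ret;

        for (i = 0; i < nr; i++) {
            ret = func(&blk[i], arg);
            if (ret)
                return ret;     /* like walk_memory_range(): stop on error */
        }
        return 0;
    }

    /* pass 1: try to offline everything, remember only the first failure */
    static int offline_cb(struct memory_block *mem, void *arg)
    {
        int *first_err = arg;
        int err = 0;

        if (mem->id == 2)       /* pretend this block refuses to go offline */
            err = -1;
        else
            mem->offlined = 1;

        if (err && !*first_err)
            *first_err = err;
        return 0;               /* never abort the walk */
    }

    /* pass 2: verify that every block really is offline */
    static int check_cb(struct memory_block *mem, void *arg)
    {
        return !mem->offlined;
    }

    int main(void)
    {
        struct memory_block blk[3] = { {1, 0}, {2, 0}, {3, 0} };
        int err = 0;

        walk_blocks(blk, 3, &err, offline_cb);
        printf("first offline error: %d, all offline: %s\n", err,
               walk_blocks(blk, 3, NULL, check_cb) ? "no" : "yes");
        return 0;
    }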

+ 32 - 27
mm/mempolicy.c

@@ -26,7 +26,7 @@
  *                the allocation to memory nodes instead
  *
  * preferred       Try a specific node first before normal fallback.
- *                As a special case node -1 here means do the allocation
+ *                As a special case NUMA_NO_NODE here means do the allocation
  *                on the local CPU. This is normally identical to default,
  *                but useful to set in a VMA when you have a non default
  *                process policy.
@@ -127,7 +127,7 @@ static struct mempolicy *get_task_policy(struct task_struct *p)
 
 	if (!pol) {
 		node = numa_node_id();
-		if (node != -1)
+		if (node != NUMA_NO_NODE)
 			pol = &preferred_node_policy[node];
 
 		/* preferred_node_policy is not initialised early in boot */
@@ -161,19 +161,7 @@ static const struct mempolicy_operations {
 /* Check that the nodemask contains at least one populated zone */
 static int is_valid_nodemask(const nodemask_t *nodemask)
 {
-	int nd, k;
-
-	for_each_node_mask(nd, *nodemask) {
-		struct zone *z;
-
-		for (k = 0; k <= policy_zone; k++) {
-			z = &NODE_DATA(nd)->node_zones[k];
-			if (z->present_pages > 0)
-				return 1;
-		}
-	}
-
-	return 0;
+	return nodes_intersects(*nodemask, node_states[N_MEMORY]);
 }
 
 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
@@ -270,7 +258,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 	struct mempolicy *policy;
 
 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
-		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
+		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 
 	if (mode == MPOL_DEFAULT) {
 		if (nodes && !nodes_empty(*nodes))
@@ -508,9 +496,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		/*
 		 * vm_normal_page() filters out zero pages, but there might
 		 * still be PageReserved pages to skip, perhaps in a VDSO.
-		 * And we cannot move PageKsm pages sensibly or safely yet.
 		 */
-		if (PageReserved(page) || PageKsm(page))
+		if (PageReserved(page))
 			continue;
 		nid = page_to_nid(page);
 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
@@ -1027,8 +1014,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 
 	if (!list_empty(&pagelist)) {
 		err = migrate_pages(&pagelist, new_node_page, dest,
-							false, MIGRATE_SYNC,
-							MR_SYSCALL);
+					MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_lru_pages(&pagelist);
 	}
@@ -1235,7 +1221,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 
 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
 		 start, start + len, mode, mode_flags,
-		 nmask ? nodes_addr(*nmask)[0] : -1);
+		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
 
 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 
@@ -1272,9 +1258,8 @@ static long do_mbind(unsigned long start, unsigned long len,
 		if (!list_empty(&pagelist)) {
 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
 			nr_failed = migrate_pages(&pagelist, new_vma_page,
-						(unsigned long)vma,
-						false, MIGRATE_SYNC,
-						MR_MEMPOLICY_MBIND);
+					(unsigned long)vma,
+					MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
 			if (nr_failed)
 				putback_lru_pages(&pagelist);
 		}
@@ -1644,6 +1629,26 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
 	return pol;
 }
 
+static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
+{
+	enum zone_type dynamic_policy_zone = policy_zone;
+
+	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
+
+	/*
+	 * If policy->v.nodes has only movable memory, we apply the policy
+	 * only when gfp_zone(gfp) is ZONE_MOVABLE.
+	 *
+	 * policy->v.nodes intersects node_states[N_MEMORY], so if the
+	 * following test fails, it implies that policy->v.nodes contains
+	 * movable memory only.
+	 */
+	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
+		dynamic_policy_zone = ZONE_MOVABLE;
+
+	return zone >= dynamic_policy_zone;
+}
+
 /*
  * Return a nodemask representing a mempolicy for filtering nodes for
  * page allocation
@@ -1652,7 +1657,7 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 {
 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
 	if (unlikely(policy->mode == MPOL_BIND) &&
-			gfp_zone(gfp) >= policy_zone &&
+			apply_policy_zone(policy, gfp_zone(gfp)) &&
 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
 		return &policy->v.nodes;
 
@@ -2308,7 +2313,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 		 * it less likely we act on an unlikely task<->page
 		 * relation.
 		 */
-		last_nid = page_xchg_last_nid(page, polnid);
+		last_nid = page_nid_xchg_last(page, polnid);
 		if (last_nid != polnid)
 			goto out;
 	}
@@ -2483,7 +2488,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
 		 vma->vm_pgoff,
 		 sz, npol ? npol->mode : -1,
 		 npol ? npol->flags : -1,
-		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
+		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
 
 	if (npol) {
 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
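
The mempolicy changes above replace the per-zone present_pages scan in is_valid_nodemask() with a plain nodemask intersection against N_MEMORY, and apply_policy_zone() uses the same intersection trick against N_HIGH_MEMORY to detect policies that cover nothing but movable memory. A small userspace sketch of both tests (plain bitmasks stand in for nodemask_t; the node numbers and mask values are invented for illustration):

    #include <stdio.h>

    /* one bit per node; these masks are made-up examples, not kernel values */
    #define N_MEMORY        0x0bUL  /* nodes 0, 1 and 3 have some memory */
    #define N_HIGH_MEMORY   0x03UL  /* nodes 0 and 1 have non-movable memory */

    static int nodes_intersects(unsigned long a, unsigned long b)
    {
        return (a & b) != 0;
    }

    int main(void)
    {
        unsigned long policy_nodes = 0x08UL;    /* policy binds to node 3 only */

        /* is_valid_nodemask(): at least one node in the mask has memory */
        printf("valid nodemask: %d\n", nodes_intersects(policy_nodes, N_MEMORY));

        /* apply_policy_zone(): if the mask misses every node with regular
         * (non-movable) memory, only ZONE_MOVABLE allocations honour it */
        if (!nodes_intersects(policy_nodes, N_HIGH_MEMORY))
            printf("policy covers movable-only nodes\n");
        return 0;
    }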

+ 68 - 96
mm/migrate.c

@@ -464,7 +464,10 @@ void migrate_page_copy(struct page *newpage, struct page *page)
 
 	mlock_migrate_page(newpage, page);
 	ksm_migrate_page(newpage, page);
-
+	/*
+	 * Please do not reorder this without considering how mm/ksm.c's
+	 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
+	 */
 	ClearPageSwapCache(page);
 	ClearPagePrivate(page);
 	set_page_private(page, 0);
@@ -698,7 +701,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
 }
 
 static int __unmap_and_move(struct page *page, struct page *newpage,
-			int force, bool offlining, enum migrate_mode mode)
+				int force, enum migrate_mode mode)
 {
 	int rc = -EAGAIN;
 	int remap_swapcache = 1;
@@ -728,20 +731,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 		lock_page(page);
 	}
 
-	/*
-	 * Only memory hotplug's offline_pages() caller has locked out KSM,
-	 * and can safely migrate a KSM page.  The other cases have skipped
-	 * PageKsm along with PageReserved - but it is only now when we have
-	 * the page lock that we can be certain it will not go KSM beneath us
-	 * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
-	 * its pagecount raised, but only here do we take the page lock which
-	 * serializes that).
-	 */
-	if (PageKsm(page) && !offlining) {
-		rc = -EBUSY;
-		goto unlock;
-	}
-
 	/* charge against new page */
 	mem_cgroup_prepare_migration(page, newpage, &mem);
 
@@ -768,7 +757,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 	 * File Caches may use write_page() or lock_page() in migration, then,
 	 * just care Anon page here.
 	 */
-	if (PageAnon(page)) {
+	if (PageAnon(page) && !PageKsm(page)) {
 		/*
 		 * Only page_lock_anon_vma_read() understands the subtleties of
 		 * getting a hold on an anon_vma from outside one of its mms.
@@ -848,7 +837,6 @@ uncharge:
 	mem_cgroup_end_migration(mem, page, newpage,
 				 (rc == MIGRATEPAGE_SUCCESS ||
 				  rc == MIGRATEPAGE_BALLOON_SUCCESS));
-unlock:
 	unlock_page(page);
 out:
 	return rc;
@@ -859,8 +847,7 @@ out:
  * to the newly allocated page in newpage.
  */
 static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-			struct page *page, int force, bool offlining,
-			enum migrate_mode mode)
+			struct page *page, int force, enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -878,7 +865,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 		if (unlikely(split_huge_page(page)))
 			goto out;
 
-	rc = __unmap_and_move(page, newpage, force, offlining, mode);
+	rc = __unmap_and_move(page, newpage, force, mode);
 
 	if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
 		/*
@@ -938,8 +925,7 @@ out:
  */
 static int unmap_and_move_huge_page(new_page_t get_new_page,
 				unsigned long private, struct page *hpage,
-				int force, bool offlining,
-				enum migrate_mode mode)
+				int force, enum migrate_mode mode)
 {
 	int rc = 0;
 	int *result = NULL;
@@ -1001,9 +987,8 @@ out:
  *
  * Return: Number of pages not migrated or error code.
  */
-int migrate_pages(struct list_head *from,
-		new_page_t get_new_page, unsigned long private, bool offlining,
-		enum migrate_mode mode, int reason)
+int migrate_pages(struct list_head *from, new_page_t get_new_page,
+		unsigned long private, enum migrate_mode mode, int reason)
 {
 	int retry = 1;
 	int nr_failed = 0;
@@ -1024,8 +1009,7 @@ int migrate_pages(struct list_head *from,
 			cond_resched();
 
 			rc = unmap_and_move(get_new_page, private,
-						page, pass > 2, offlining,
-						mode);
+						page, pass > 2, mode);
 
 			switch(rc) {
 			case -ENOMEM:
@@ -1058,15 +1042,13 @@ out:
 }
 
 int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
-		      unsigned long private, bool offlining,
-		      enum migrate_mode mode)
+		      unsigned long private, enum migrate_mode mode)
 {
 	int pass, rc;
 
 	for (pass = 0; pass < 10; pass++) {
-		rc = unmap_and_move_huge_page(get_new_page,
-					      private, hpage, pass > 2, offlining,
-					      mode);
+		rc = unmap_and_move_huge_page(get_new_page, private,
+						hpage, pass > 2, mode);
 		switch (rc) {
 		case -ENOMEM:
 			goto out;
@@ -1152,7 +1134,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
 			goto set_status;
 
 		/* Use PageReserved to check for zero page */
-		if (PageReserved(page) || PageKsm(page))
+		if (PageReserved(page))
 			goto put_and_set;
 
 		pp->page = page;
@@ -1189,8 +1171,7 @@ set_status:
 	err = 0;
 	if (!list_empty(&pagelist)) {
 		err = migrate_pages(&pagelist, new_page_node,
-				(unsigned long)pm, 0, MIGRATE_SYNC,
-				MR_SYSCALL);
+				(unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_lru_pages(&pagelist);
 	}
@@ -1314,7 +1295,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
 
 		err = -ENOENT;
 		/* Use PageReserved to check for zero page */
-		if (!page || PageReserved(page) || PageKsm(page))
+		if (!page || PageReserved(page))
 			goto set_status;
 
 		err = page_to_nid(page);
@@ -1461,7 +1442,7 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
  * pages. Currently it only checks the watermarks, which is crude.
  */
 static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
-				   int nr_migrate_pages)
+				   unsigned long nr_migrate_pages)
 {
 	int z;
 	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
@@ -1497,7 +1478,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
 					  __GFP_NOWARN) &
 					 ~GFP_IOFS, 0);
 	if (newpage)
-		page_xchg_last_nid(newpage, page_last_nid(page));
+		page_nid_xchg_last(newpage, page_nid_last(page));
 
 	return newpage;
 }
@@ -1557,39 +1538,40 @@ bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
 
 int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 {
-	int ret = 0;
+	int page_lru;
+
+	VM_BUG_ON(compound_order(page) && !PageTransHuge(page));
 
 	/* Avoid migrating to a node that is nearly full */
-	if (migrate_balanced_pgdat(pgdat, 1)) {
-		int page_lru;
+	if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
+		return 0;
 
-		if (isolate_lru_page(page)) {
-			put_page(page);
-			return 0;
-		}
+	if (isolate_lru_page(page))
+		return 0;
 
-		/* Page is isolated */
-		ret = 1;
-		page_lru = page_is_file_cache(page);
-		if (!PageTransHuge(page))
-			inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
-		else
-			mod_zone_page_state(page_zone(page),
-					NR_ISOLATED_ANON + page_lru,
-					HPAGE_PMD_NR);
+	/*
+	 * migrate_misplaced_transhuge_page() skips page migration's usual
+	 * check on page_count(), so we must do it here, now that the page
+	 * has been isolated: a GUP pin, or any other pin, prevents migration.
+	 * The expected page count is 3: 1 for the page's mapcount, 1 for the
+	 * caller's pin, and 1 for the reference taken by isolate_lru_page().
+	 */
+	if (PageTransHuge(page) && page_count(page) != 3) {
+		putback_lru_page(page);
+		return 0;
 	}
 
+	page_lru = page_is_file_cache(page);
+	mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru,
+				hpage_nr_pages(page));
+
 	/*
-	 * Page is either isolated or there is not enough space on the target
-	 * node. If isolated, then it has taken a reference count and the
-	 * callers reference can be safely dropped without the page
-	 * disappearing underneath us during migration. Otherwise the page is
-	 * not to be migrated but the callers reference should still be
-	 * dropped so it does not leak.
+	 * Isolating the page has taken another reference, so the
+	 * caller's reference can be safely dropped without the page
+	 * disappearing underneath us during migration.
 	 */
 	put_page(page);
-
-	return ret;
+	return 1;
 }
 
 /*
@@ -1600,7 +1582,7 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 int migrate_misplaced_page(struct page *page, int node)
 {
 	pg_data_t *pgdat = NODE_DATA(node);
-	int isolated = 0;
+	int isolated;
 	int nr_remaining;
 	LIST_HEAD(migratepages);
 
@@ -1608,42 +1590,43 @@ int migrate_misplaced_page(struct page *page, int node)
 	 * Don't migrate pages that are mapped in multiple processes.
 	 * TODO: Handle false sharing detection instead of this hammer
 	 */
-	if (page_mapcount(page) != 1) {
-		put_page(page);
+	if (page_mapcount(page) != 1)
 		goto out;
-	}
 
 	/*
 	 * Rate-limit the amount of data that is being migrated to a node.
 	 * Optimal placement is no good if the memory bus is saturated and
 	 * all the time is being spent migrating!
 	 */
-	if (numamigrate_update_ratelimit(pgdat, 1)) {
-		put_page(page);
+	if (numamigrate_update_ratelimit(pgdat, 1))
 		goto out;
-	}
 
 	isolated = numamigrate_isolate_page(pgdat, page);
 	if (!isolated)
 		goto out;
 
 	list_add(&page->lru, &migratepages);
-	nr_remaining = migrate_pages(&migratepages,
-			alloc_misplaced_dst_page,
-			node, false, MIGRATE_ASYNC,
-			MR_NUMA_MISPLACED);
+	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
+				     node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
 	if (nr_remaining) {
 		putback_lru_pages(&migratepages);
 		isolated = 0;
 	} else
 		count_vm_numa_event(NUMA_PAGE_MIGRATE);
 	BUG_ON(!list_empty(&migratepages));
-out:
 	return isolated;
+
+out:
+	put_page(page);
+	return 0;
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
 #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+/*
+ * Migrates a THP to a given target node. page must be locked and is unlocked
+ * before returning.
+ */
 int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 				struct vm_area_struct *vma,
 				pmd_t *pmd, pmd_t entry,
@@ -1674,29 +1657,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 
 	new_page = alloc_pages_node(node,
 		(GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
-	if (!new_page) {
-		count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
-		goto out_dropref;
-	}
-	page_xchg_last_nid(new_page, page_last_nid(page));
+	if (!new_page)
+		goto out_fail;
 
-	isolated = numamigrate_isolate_page(pgdat, page);
+	page_nid_xchg_last(new_page, page_nid_last(page));
 
-	/*
-	 * Failing to isolate or a GUP pin prevents migration. The expected
-	 * page count is 2. 1 for anonymous pages without a mapping and 1
-	 * for the callers pin. If the page was isolated, the page will
-	 * need to be put back on the LRU.
-	 */
-	if (!isolated || page_count(page) != 2) {
-		count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+	isolated = numamigrate_isolate_page(pgdat, page);
+	if (!isolated) {
 		put_page(new_page);
-		if (isolated) {
-			putback_lru_page(page);
-			isolated = 0;
-			goto out;
-		}
-		goto out_keep_locked;
+		goto out_fail;
 	}
 
 	/* Prepare a page as a migration target */
@@ -1728,6 +1697,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 		putback_lru_page(page);
 
 		count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+		isolated = 0;
 		goto out;
 	}
 
@@ -1772,9 +1742,11 @@ out:
 			-HPAGE_PMD_NR);
 	return isolated;
 
+out_fail:
+	count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
 out_dropref:
+	unlock_page(page);
 	put_page(page);
-out_keep_locked:
 	return 0;
 }
 #endif /* CONFIG_NUMA_BALANCING */
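
numamigrate_isolate_page() now has to do the pin check itself, because migrate_misplaced_transhuge_page() skips page migration's usual page_count() test. The expected count of 3 mentioned in the comment breaks down as in this toy model (plain arithmetic, no kernel API involved):

    #include <stdio.h>

    int main(void)
    {
        int mapcount_ref = 1;   /* anon THP mapped by exactly one pmd */
        int caller_pin   = 1;   /* reference held by the NUMA hinting fault path */
        int lru_ref      = 1;   /* reference taken by isolate_lru_page() */
        int page_count   = mapcount_ref + caller_pin + lru_ref;

        /* any extra reference (e.g. a GUP pin) pushes page_count() past 3,
         * and the THP is put back on the LRU instead of being migrated */
        printf("expected page_count() = %d\n", page_count);
        return 0;
    }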

+ 3 - 2
mm/mincore.c

@@ -75,7 +75,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
 	/* shmem/tmpfs may return swap: account for swapcache page too. */
 	if (radix_tree_exceptional_entry(page)) {
 		swp_entry_t swap = radix_to_swp_entry(page);
-		page = find_get_page(&swapper_space, swap.val);
+		page = find_get_page(swap_address_space(swap), swap.val);
 	}
 #endif
 	if (page) {
@@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			} else {
 #ifdef CONFIG_SWAP
 				pgoff = entry.val;
-				*vec = mincore_page(&swapper_space, pgoff);
+				*vec = mincore_page(swap_address_space(entry),
+					pgoff);
 #else
 				WARN_ON(1);
 				*vec = 1;

+ 30 - 71
mm/mlock.c

@@ -155,13 +155,12 @@ void munlock_vma_page(struct page *page)
  *
  * vma->vm_mm->mmap_sem must be held for at least read.
  */
-static long __mlock_vma_pages_range(struct vm_area_struct *vma,
-				    unsigned long start, unsigned long end,
-				    int *nonblocking)
+long __mlock_vma_pages_range(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end, int *nonblocking)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long addr = start;
-	int nr_pages = (end - start) / PAGE_SIZE;
+	unsigned long nr_pages = (end - start) / PAGE_SIZE;
 	int gup_flags;
 
 	VM_BUG_ON(start & ~PAGE_MASK);
@@ -186,6 +185,10 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
 	if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
 		gup_flags |= FOLL_FORCE;
 
+	/*
+	 * We made sure addr is within a VMA, so the following will
+	 * not result in a stack expansion that recurses back here.
+	 */
 	return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
 				NULL, NULL, nonblocking);
 }
@@ -202,56 +205,6 @@ static int __mlock_posix_error_return(long retval)
 	return retval;
 }
 
-/**
- * mlock_vma_pages_range() - mlock pages in specified vma range.
- * @vma - the vma containing the specfied address range
- * @start - starting address in @vma to mlock
- * @end   - end address [+1] in @vma to mlock
- *
- * For mmap()/mremap()/expansion of mlocked vma.
- *
- * return 0 on success for "normal" vmas.
- *
- * return number of pages [> 0] to be removed from locked_vm on success
- * of "special" vmas.
- */
-long mlock_vma_pages_range(struct vm_area_struct *vma,
-			unsigned long start, unsigned long end)
-{
-	int nr_pages = (end - start) / PAGE_SIZE;
-	BUG_ON(!(vma->vm_flags & VM_LOCKED));
-
-	/*
-	 * filter unlockable vmas
-	 */
-	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
-		goto no_mlock;
-
-	if (!((vma->vm_flags & VM_DONTEXPAND) ||
-			is_vm_hugetlb_page(vma) ||
-			vma == get_gate_vma(current->mm))) {
-
-		__mlock_vma_pages_range(vma, start, end, NULL);
-
-		/* Hide errors from mmap() and other callers */
-		return 0;
-	}
-
-	/*
-	 * User mapped kernel pages or huge pages:
-	 * make these pages present to populate the ptes, but
-	 * fall thru' to reset VM_LOCKED--no need to unlock, and
-	 * return nr_pages so these don't get counted against task's
-	 * locked limit.  huge pages are already counted against
-	 * locked vm limit.
-	 */
-	make_pages_present(start, end);
-
-no_mlock:
-	vma->vm_flags &= ~VM_LOCKED;	/* and don't come back! */
-	return nr_pages;		/* error or pages NOT mlocked */
-}
-
 /*
  * munlock_vma_pages_range() - munlock all pages in the vma range.'
  * @vma - vma containing range to be munlock()ed.
@@ -303,7 +256,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
  *
  * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
  * munlock is a no-op.  However, for some special vmas, we go ahead and
- * populate the ptes via make_pages_present().
+ * populate the ptes.
  *
  * For vmas that pass the filters, merge/split as appropriate.
  */
@@ -391,9 +344,9 @@ static int do_mlock(unsigned long start, size_t len, int on)
 
 		/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
 
-		newflags = vma->vm_flags | VM_LOCKED;
-		if (!on)
-			newflags &= ~VM_LOCKED;
+		newflags = vma->vm_flags & ~VM_LOCKED;
+		if (on)
+			newflags |= VM_LOCKED | VM_POPULATE;
 
 		tmp = vma->vm_end;
 		if (tmp > end)
@@ -416,13 +369,20 @@ static int do_mlock(unsigned long start, size_t len, int on)
 	return error;
 }
 
-static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
+/*
+ * __mm_populate - populate and/or mlock pages within a range of address space.
+ *
+ * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
+ * flags. VMAs must be already marked with the desired vm_flags, and
+ * mmap_sem must not be held.
+ */
+int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
 {
 	struct mm_struct *mm = current->mm;
 	unsigned long end, nstart, nend;
 	struct vm_area_struct *vma = NULL;
 	int locked = 0;
-	int ret = 0;
+	long ret = 0;
 
 	VM_BUG_ON(start & ~PAGE_MASK);
 	VM_BUG_ON(len != PAGE_ALIGN(len));
@@ -446,7 +406,8 @@ static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
 		 * range with the first VMA. Also, skip undesirable VMA types.
 		 */
 		nend = min(end, vma->vm_end);
-		if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+		if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_POPULATE)) !=
+		    VM_POPULATE)
 			continue;
 		if (nstart < vma->vm_start)
 			nstart = vma->vm_start;
@@ -498,7 +459,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
 		error = do_mlock(start, len, 1);
 	up_write(&current->mm->mmap_sem);
 	if (!error)
-		error = do_mlock_pages(start, len, 0);
+		error = __mm_populate(start, len, 0);
 	return error;
 }
 
@@ -519,18 +480,18 @@ static int do_mlockall(int flags)
 	struct vm_area_struct * vma, * prev = NULL;
 
 	if (flags & MCL_FUTURE)
-		current->mm->def_flags |= VM_LOCKED;
+		current->mm->def_flags |= VM_LOCKED | VM_POPULATE;
 	else
-		current->mm->def_flags &= ~VM_LOCKED;
+		current->mm->def_flags &= ~(VM_LOCKED | VM_POPULATE);
 	if (flags == MCL_FUTURE)
 		goto out;
 
 	for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
 		vm_flags_t newflags;
 
-		newflags = vma->vm_flags | VM_LOCKED;
-		if (!(flags & MCL_CURRENT))
-			newflags &= ~VM_LOCKED;
+		newflags = vma->vm_flags & ~VM_LOCKED;
+		if (flags & MCL_CURRENT)
+			newflags |= VM_LOCKED | VM_POPULATE;
 
 		/* Ignore errors */
 		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
@@ -564,10 +525,8 @@ SYSCALL_DEFINE1(mlockall, int, flags)
 	    capable(CAP_IPC_LOCK))
 		ret = do_mlockall(flags);
 	up_write(&current->mm->mmap_sem);
-	if (!ret && (flags & MCL_CURRENT)) {
-		/* Ignore errors */
-		do_mlock_pages(0, TASK_SIZE, 1);
-	}
+	if (!ret && (flags & MCL_CURRENT))
+		mm_populate(0, TASK_SIZE);
 out:
 	return ret;
 }
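
The mlock()/mlockall() hunks rebuild newflags from scratch so that VM_POPULATE is raised together with VM_LOCKED, and __mm_populate() later faults in only those ranges whose vmas carry VM_POPULATE. A tiny sketch of the newflags handling as do_mlock() now does it (the flag bit values are placeholders, not the kernel's):

    #include <stdio.h>

    /* placeholder bit values, not the kernel's */
    #define VM_LOCKED     0x00002000UL
    #define VM_POPULATE   0x00008000UL

    /* mirrors how do_mlock() builds newflags for each vma */
    static unsigned long mlock_newflags(unsigned long vm_flags, int on)
    {
        unsigned long newflags = vm_flags & ~VM_LOCKED;

        if (on)
            newflags |= VM_LOCKED | VM_POPULATE;
        return newflags;
    }

    int main(void)
    {
        unsigned long flags = 0x1UL;            /* some unrelated vma flag */

        flags = mlock_newflags(flags, 1);       /* mlock(): sets both bits */
        printf("after mlock:   %#lx\n", flags);

        flags = mlock_newflags(flags, 0);       /* munlock(): drops only VM_LOCKED */
        printf("after munlock: %#lx\n", flags);
        return 0;
    }

Note that, as in the hunk above, munlock drops only VM_LOCKED; VM_POPULATE stays set on the vma.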

+ 19 - 12
mm/mm_init.c

@@ -69,34 +69,41 @@ void __init mminit_verify_pageflags_layout(void)
 	unsigned long or_mask, add_mask;
 
 	shift = 8 * sizeof(unsigned long);
-	width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH;
+	width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT;
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
-		"Section %d Node %d Zone %d Flags %d\n",
+		"Section %d Node %d Zone %d Lastnid %d Flags %d\n",
 		SECTIONS_WIDTH,
 		NODES_WIDTH,
 		ZONES_WIDTH,
+		LAST_NID_WIDTH,
 		NR_PAGEFLAGS);
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
-		"Section %d Node %d Zone %d\n",
+		"Section %d Node %d Zone %d Lastnid %d\n",
 		SECTIONS_SHIFT,
 		NODES_SHIFT,
-		ZONES_SHIFT);
-	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets",
-		"Section %lu Node %lu Zone %lu\n",
+		ZONES_SHIFT,
+		LAST_NID_SHIFT);
+	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
+		"Section %lu Node %lu Zone %lu Lastnid %lu\n",
 		(unsigned long)SECTIONS_PGSHIFT,
 		(unsigned long)NODES_PGSHIFT,
-		(unsigned long)ZONES_PGSHIFT);
-	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid",
-		"Zone ID: %lu -> %lu\n",
-		(unsigned long)ZONEID_PGOFF,
-		(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT));
+		(unsigned long)ZONES_PGSHIFT,
+		(unsigned long)LAST_NID_PGSHIFT);
+	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
+		"Node/Zone ID: %lu -> %lu\n",
+		(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
+		(unsigned long)ZONEID_PGOFF);
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
-		"location: %d -> %d unused %d -> %d flags %d -> %d\n",
+		"location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n",
 		shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
 #ifdef NODE_NOT_IN_PAGE_FLAGS
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
 		"Node not in page flags");
 #endif
+#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
+	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
+		"Last nid not in page flags");
+#endif
 
 	if (SECTIONS_WIDTH) {
 		shift -= SECTIONS_WIDTH;
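
The extra columns in mminit_verify_pageflags_layout() exist because last_nid is now a fourth consumer of page->flags, and the debug output shows whether everything still fits alongside the page flag bits. A back-of-the-envelope version of that bookkeeping (every width below is an assumed example, not a value from any particular config):

    #include <stdio.h>

    int main(void)
    {
        int shift = 8 * sizeof(unsigned long); /* 64 on a 64-bit build */
        int sections_width = 0;                /* vmemmap: section not stored */
        int nodes_width = 10;
        int zones_width = 2;
        int last_nid_width = 10;               /* the new consumer of page->flags */
        int nr_pageflags = 22;                 /* illustrative */

        int width = shift - sections_width - nodes_width
                          - zones_width - last_nid_width;

        /* if width drops below nr_pageflags, last_nid no longer fits and the
         * LAST_NID_NOT_IN_PAGE_FLAGS fallback (a wider page frame) is used */
        printf("bits left for flags: %d (need %d)\n", width, nr_pageflags);
        return 0;
    }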

+ 49 - 34
mm/mmap.c

@@ -144,7 +144,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 		 */
 		free -= global_page_state(NR_SHMEM);
 
-		free += nr_swap_pages;
+		free += get_nr_swap_pages();
 
 		/*
 		 * Any slabs which are created with the
@@ -256,6 +256,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 	unsigned long newbrk, oldbrk;
 	struct mm_struct *mm = current->mm;
 	unsigned long min_brk;
+	bool populate;
 
 	down_write(&mm->mmap_sem);
 
@@ -305,8 +306,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 	/* Ok, looks good - let it rip. */
 	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
 		goto out;
+
 set_brk:
 	mm->brk = brk;
+	populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
+	up_write(&mm->mmap_sem);
+	if (populate)
+		mm_populate(oldbrk, newbrk - oldbrk);
+	return brk;
+
 out:
 	retval = mm->brk;
 	up_write(&mm->mmap_sem);
@@ -801,7 +809,7 @@ again:			remove_next = 1 + (end > next->vm_end);
 		anon_vma_interval_tree_post_update_vma(vma);
 		if (adjust_next)
 			anon_vma_interval_tree_post_update_vma(next);
-		anon_vma_unlock(anon_vma);
+		anon_vma_unlock_write(anon_vma);
 	}
 	if (mapping)
 		mutex_unlock(&mapping->i_mmap_mutex);
@@ -1154,12 +1162,15 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
 
 unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 			unsigned long len, unsigned long prot,
-			unsigned long flags, unsigned long pgoff)
+			unsigned long flags, unsigned long pgoff,
+			unsigned long *populate)
 {
 	struct mm_struct * mm = current->mm;
 	struct inode *inode;
 	vm_flags_t vm_flags;
 
+	*populate = 0;
+
 	/*
 	 * Does the application expect PROT_READ to imply PROT_EXEC?
 	 *
@@ -1280,7 +1291,24 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 		}
 	}
 
-	return mmap_region(file, addr, len, flags, vm_flags, pgoff);
+	/*
+	 * Set 'VM_NORESERVE' if we should not account for the
+	 * memory use of this mapping.
+	 */
+	if (flags & MAP_NORESERVE) {
+		/* We honor MAP_NORESERVE if allowed to overcommit */
+		if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+			vm_flags |= VM_NORESERVE;
+
+		/* hugetlb applies strict overcommit unless MAP_NORESERVE */
+		if (file && is_file_hugepages(file))
+			vm_flags |= VM_NORESERVE;
+	}
+
+	addr = mmap_region(file, addr, len, vm_flags, pgoff);
+	if (!IS_ERR_VALUE(addr) && (vm_flags & VM_POPULATE))
+		*populate = len;
+	return addr;
 }
 
 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
@@ -1395,8 +1423,7 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
 }
 
 unsigned long mmap_region(struct file *file, unsigned long addr,
-			  unsigned long len, unsigned long flags,
-			  vm_flags_t vm_flags, unsigned long pgoff)
+		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev;
@@ -1419,20 +1446,6 @@ munmap_back:
 	if (!may_expand_vm(mm, len >> PAGE_SHIFT))
 		return -ENOMEM;
 
-	/*
-	 * Set 'VM_NORESERVE' if we should not account for the
-	 * memory use of this mapping.
-	 */
-	if ((flags & MAP_NORESERVE)) {
-		/* We honor MAP_NORESERVE if allowed to overcommit */
-		if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
-			vm_flags |= VM_NORESERVE;
-
-		/* hugetlb applies strict overcommit unless MAP_NORESERVE */
-		if (file && is_file_hugepages(file))
-			vm_flags |= VM_NORESERVE;
-	}
-
 	/*
 	 * Private writable mapping: check memory availability
 	 */
@@ -1531,10 +1544,12 @@ out:
 
 	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
 	if (vm_flags & VM_LOCKED) {
-		if (!mlock_vma_pages_range(vma, addr, addr + len))
+		if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
+					vma == get_gate_vma(current->mm)))
 			mm->locked_vm += (len >> PAGE_SHIFT);
-	} else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
-		make_pages_present(addr, addr + len);
+		else
+			vma->vm_flags &= ~VM_LOCKED;
+	}
 
 	if (file)
 		uprobe_mmap(vma);
@@ -2187,9 +2202,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
 		return vma;
 	if (!prev || expand_stack(prev, addr))
 		return NULL;
-	if (prev->vm_flags & VM_LOCKED) {
-		mlock_vma_pages_range(prev, addr, prev->vm_end);
-	}
+	if (prev->vm_flags & VM_LOCKED)
+		__mlock_vma_pages_range(prev, addr, prev->vm_end, NULL);
 	return prev;
 }
 #else
@@ -2215,9 +2229,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
 	start = vma->vm_start;
 	if (expand_stack(vma, addr))
 		return NULL;
-	if (vma->vm_flags & VM_LOCKED) {
-		mlock_vma_pages_range(vma, addr, start);
-	}
+	if (vma->vm_flags & VM_LOCKED)
+		__mlock_vma_pages_range(vma, addr, start, NULL);
 	return vma;
 }
 #endif
@@ -2590,10 +2603,8 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 out:
 	perf_event_mmap(vma);
 	mm->total_vm += len >> PAGE_SHIFT;
-	if (flags & VM_LOCKED) {
-		if (!mlock_vma_pages_range(vma, addr, addr + len))
-			mm->locked_vm += (len >> PAGE_SHIFT);
-	}
+	if (flags & VM_LOCKED)
+		mm->locked_vm += (len >> PAGE_SHIFT);
 	return addr;
 }
 
@@ -2601,10 +2612,14 @@ unsigned long vm_brk(unsigned long addr, unsigned long len)
 {
 	struct mm_struct *mm = current->mm;
 	unsigned long ret;
+	bool populate;
 
 	down_write(&mm->mmap_sem);
 	ret = do_brk(addr, len);
+	populate = ((mm->def_flags & VM_LOCKED) != 0);
 	up_write(&mm->mmap_sem);
+	if (populate)
+		mm_populate(addr, len);
 	return ret;
 }
 EXPORT_SYMBOL(vm_brk);
@@ -3002,7 +3017,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
 		if (!__test_and_clear_bit(0, (unsigned long *)
 					  &anon_vma->root->rb_root.rb_node))
 			BUG();
-		anon_vma_unlock(anon_vma);
+		anon_vma_unlock_write(anon_vma);
 	}
 }
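
With these mm/mmap.c changes do_mmap_pgoff() no longer faults anything in itself: the mlock_vma_pages_range()/make_pages_present() calls are gone, and the new 'populate' out-parameter reports how many bytes the caller should fault in once mmap_sem has been dropped. A minimal sketch of how a caller might consume that value (the real wrapper is not part of this hunk; variable names are illustrative):

	unsigned long populate = 0;
	unsigned long ret;

	down_write(&mm->mmap_sem);
	ret = do_mmap_pgoff(file, addr, len, prot, flags, pgoff, &populate);
	up_write(&mm->mmap_sem);

	if (!IS_ERR_VALUE(ret) && populate)
		mm_populate(ret, populate);	/* fault pages in without mmap_sem held */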
 

+ 44 - 40
mm/mmu_notifier.c

@@ -37,49 +37,51 @@ static struct srcu_struct srcu;
 void __mmu_notifier_release(struct mm_struct *mm)
 {
 	struct mmu_notifier *mn;
-	struct hlist_node *n;
 	int id;
 
 	/*
-	 * SRCU here will block mmu_notifier_unregister until
-	 * ->release returns.
+	 * srcu_read_lock() here will block synchronize_srcu() in
+	 * mmu_notifier_unregister() until all registered
+	 * ->release() callouts this function makes have
+	 * returned.
 	 */
 	id = srcu_read_lock(&srcu);
-	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
-		/*
-		 * if ->release runs before mmu_notifier_unregister it
-		 * must be handled as it's the only way for the driver
-		 * to flush all existing sptes and stop the driver
-		 * from establishing any more sptes before all the
-		 * pages in the mm are freed.
-		 */
-		if (mn->ops->release)
-			mn->ops->release(mn, mm);
-	srcu_read_unlock(&srcu, id);
-
 	spin_lock(&mm->mmu_notifier_mm->lock);
 	while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
 		mn = hlist_entry(mm->mmu_notifier_mm->list.first,
 				 struct mmu_notifier,
 				 hlist);
+
 		/*
-		 * We arrived before mmu_notifier_unregister so
-		 * mmu_notifier_unregister will do nothing other than
-		 * to wait ->release to finish and
-		 * mmu_notifier_unregister to return.
+		 * Unlink.  This will prevent mmu_notifier_unregister()
+		 * from also making the ->release() callout.
 		 */
 		hlist_del_init_rcu(&mn->hlist);
+		spin_unlock(&mm->mmu_notifier_mm->lock);
+
+		/*
+		 * Clear sptes. (see 'release' description in mmu_notifier.h)
+		 */
+		if (mn->ops->release)
+			mn->ops->release(mn, mm);
+
+		spin_lock(&mm->mmu_notifier_mm->lock);
 	}
 	spin_unlock(&mm->mmu_notifier_mm->lock);
 
 	/*
-	 * synchronize_srcu here prevents mmu_notifier_release to
-	 * return to exit_mmap (which would proceed freeing all pages
-	 * in the mm) until the ->release method returns, if it was
-	 * invoked by mmu_notifier_unregister.
-	 *
-	 * The mmu_notifier_mm can't go away from under us because one
-	 * mm_count is hold by exit_mmap.
+	 * All callouts to ->release() which we have done are complete.
+	 * Allow synchronize_srcu() in mmu_notifier_unregister() to complete
+	 */
+	srcu_read_unlock(&srcu, id);
+
+	/*
+	 * mmu_notifier_unregister() may have unlinked a notifier and may
+	 * still be calling out to it.  Additionally, other notifiers
+	 * may have been active via vmtruncate() et al.  Block here
+	 * to ensure that all notifier callouts for this mm have been
+	 * completed and the sptes are really cleaned up before returning
+	 * to exit_mmap().
 	 */
 	synchronize_srcu(&srcu);
 }
@@ -170,6 +172,7 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
 	}
 	srcu_read_unlock(&srcu, id);
 }
+EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
 
 void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 				  unsigned long start, unsigned long end)
@@ -185,6 +188,7 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 	}
 	srcu_read_unlock(&srcu, id);
 }
+EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end);
 
 static int do_mmu_notifier_register(struct mmu_notifier *mn,
 				    struct mm_struct *mm,
@@ -294,31 +298,31 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
 {
 	BUG_ON(atomic_read(&mm->mm_count) <= 0);
 
+	spin_lock(&mm->mmu_notifier_mm->lock);
 	if (!hlist_unhashed(&mn->hlist)) {
-		/*
-		 * SRCU here will force exit_mmap to wait ->release to finish
-		 * before freeing the pages.
-		 */
 		int id;
 
-		id = srcu_read_lock(&srcu);
 		/*
-		 * exit_mmap will block in mmu_notifier_release to
-		 * guarantee ->release is called before freeing the
-		 * pages.
+		 * Ensure we synchronize up with __mmu_notifier_release().
 		 */
+		id = srcu_read_lock(&srcu);
+
+		hlist_del_rcu(&mn->hlist);
+		spin_unlock(&mm->mmu_notifier_mm->lock);
+
 		if (mn->ops->release)
 			mn->ops->release(mn, mm);
-		srcu_read_unlock(&srcu, id);
 
-		spin_lock(&mm->mmu_notifier_mm->lock);
-		hlist_del_rcu(&mn->hlist);
+		/*
+		 * Allow __mmu_notifier_release() to complete.
+		 */
+		srcu_read_unlock(&srcu, id);
+	} else
 		spin_unlock(&mm->mmu_notifier_mm->lock);
-	}
 
 	/*
-	 * Wait any running method to finish, of course including
-	 * ->release if it was run by mmu_notifier_relase instead of us.
+	 * Wait for any running method to finish, including ->release() if it
+	 * was run by __mmu_notifier_release() instead of us.
 	 */
 	synchronize_srcu(&srcu);
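
The rework unlinks a notifier under mmu_notifier_mm->lock before invoking its ->release(), so the callout happens exactly once, from whichever of __mmu_notifier_release() or mmu_notifier_unregister() gets there first, and both paths then wait in synchronize_srcu() for callouts still in flight. A minimal registration sketch under that contract (callback and variable names are illustrative):

	static void my_release(struct mmu_notifier *mn, struct mm_struct *mm)
	{
		/* tear down secondary mappings / sptes; invoked once, from
		 * either exit_mmap() or mmu_notifier_unregister() */
	}

	static const struct mmu_notifier_ops my_ops = {
		.release = my_release,
	};

	static struct mmu_notifier my_mn = { .ops = &my_ops };

	/* in the driver's setup path, with a reference to the mm held */
	int err = mmu_notifier_register(&my_mn, current->mm);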
 

+ 19 - 1
mm/mmzone.c

@@ -1,7 +1,7 @@
 /*
  * linux/mm/mmzone.c
  *
- * management codes for pgdats and zones.
+ * management codes for pgdats, zones and page flags
  */
 
 
@@ -96,3 +96,21 @@ void lruvec_init(struct lruvec *lruvec)
 	for_each_lru(lru)
 		INIT_LIST_HEAD(&lruvec->lists[lru]);
 }
+
+#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS)
+int page_nid_xchg_last(struct page *page, int nid)
+{
+	unsigned long old_flags, flags;
+	int last_nid;
+
+	do {
+		old_flags = flags = page->flags;
+		last_nid = page_nid_last(page);
+
+		flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
+		flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
+	} while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
+
+	return last_nid;
+}
+#endif
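
page_nid_xchg_last() is a lock-free read-modify-write on page->flags: it snapshots the word, splices the new nid into the LAST_NID field, and retries the cmpxchg() until no concurrent update has raced with it, returning the nid that was previously recorded. A hedged sketch of how a NUMA-balancing fault path might use the return value (the surrounding policy code is not shown here):

	int this_nid = numa_node_id();
	int last_nid = page_nid_xchg_last(page, this_nid);

	if (last_nid != this_nid) {
		/* the page was last touched from another node, so it is a
		 * candidate for migration toward the faulting CPU */
	}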

+ 14 - 13
mm/mremap.c

@@ -135,7 +135,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	pte_unmap(new_pte - 1);
 	pte_unmap_unlock(old_pte - 1, old_ptl);
 	if (anon_vma)
-		anon_vma_unlock(anon_vma);
+		anon_vma_unlock_write(anon_vma);
 	if (mapping)
 		mutex_unlock(&mapping->i_mmap_mutex);
 }
@@ -209,7 +209,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 
 static unsigned long move_vma(struct vm_area_struct *vma,
 		unsigned long old_addr, unsigned long old_len,
-		unsigned long new_len, unsigned long new_addr)
+		unsigned long new_len, unsigned long new_addr, bool *locked)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct vm_area_struct *new_vma;
@@ -300,9 +300,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 
 	if (vm_flags & VM_LOCKED) {
 		mm->locked_vm += new_len >> PAGE_SHIFT;
-		if (new_len > old_len)
-			mlock_vma_pages_range(new_vma, new_addr + old_len,
-						       new_addr + new_len);
+		*locked = true;
 	}
 
 	return new_addr;
@@ -367,9 +365,8 @@ Eagain:
 	return ERR_PTR(-EAGAIN);
 }
 
-static unsigned long mremap_to(unsigned long addr,
-	unsigned long old_len, unsigned long new_addr,
-	unsigned long new_len)
+static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
+		unsigned long new_addr, unsigned long new_len, bool *locked)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
@@ -419,7 +416,7 @@ static unsigned long mremap_to(unsigned long addr,
 	if (ret & ~PAGE_MASK)
 		goto out1;
 
-	ret = move_vma(vma, addr, old_len, new_len, new_addr);
+	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
 	if (!(ret & ~PAGE_MASK))
 		goto out;
 out1:
@@ -457,6 +454,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	struct vm_area_struct *vma;
 	unsigned long ret = -EINVAL;
 	unsigned long charged = 0;
+	bool locked = false;
 
 	down_write(&current->mm->mmap_sem);
 
@@ -479,7 +477,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 
 	if (flags & MREMAP_FIXED) {
 		if (flags & MREMAP_MAYMOVE)
-			ret = mremap_to(addr, old_len, new_addr, new_len);
+			ret = mremap_to(addr, old_len, new_addr, new_len,
+					&locked);
 		goto out;
 	}
 
@@ -521,8 +520,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 			vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
 			if (vma->vm_flags & VM_LOCKED) {
 				mm->locked_vm += pages;
-				mlock_vma_pages_range(vma, addr + old_len,
-						   addr + new_len);
+				locked = true;
+				new_addr = addr;
 			}
 			ret = addr;
 			goto out;
@@ -548,11 +547,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 			goto out;
 		}
 
-		ret = move_vma(vma, addr, old_len, new_len, new_addr);
+		ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
 	}
 out:
 	if (ret & ~PAGE_MASK)
 		vm_unacct_memory(charged);
 	up_write(&current->mm->mmap_sem);
+	if (locked && new_len > old_len)
+		mm_populate(new_addr + old_len, new_len - old_len);
 	return ret;
 }
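
As with mmap(), population of a grown VM_LOCKED mapping is now deferred to mm_populate() after mmap_sem is released, keyed off the 'locked' flag threaded through mremap_to() and move_vma(). User-visible behaviour is unchanged; a small sketch of the case this covers, assuming the standard libc wrappers (error handling omitted):

	#define _GNU_SOURCE
	#include <sys/mman.h>

	void *p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	mlock(p, 1 << 20);

	/* the extra megabyte inherits VM_LOCKED; the kernel faults it in
	 * via mm_populate() only after dropping mmap_sem */
	p = mremap(p, 1 << 20, 2 << 20, MREMAP_MAYMOVE);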

+ 17 - 11
mm/nommu.c

@@ -140,10 +140,10 @@ unsigned int kobjsize(const void *objp)
 	return PAGE_SIZE << compound_order(page);
 }
 
-int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		     unsigned long start, int nr_pages, unsigned int foll_flags,
-		     struct page **pages, struct vm_area_struct **vmas,
-		     int *retry)
+long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		      unsigned long start, unsigned long nr_pages,
+		      unsigned int foll_flags, struct page **pages,
+		      struct vm_area_struct **vmas, int *nonblocking)
 {
 	struct vm_area_struct *vma;
 	unsigned long vm_flags;
@@ -190,9 +190,10 @@ finish_or_fault:
  *   slab page or a secondary page from a compound page
  * - don't permit access to VMAs that don't support it, such as I/O mappings
  */
-int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-	unsigned long start, int nr_pages, int write, int force,
-	struct page **pages, struct vm_area_struct **vmas)
+long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		    unsigned long start, unsigned long nr_pages,
+		    int write, int force, struct page **pages,
+		    struct vm_area_struct **vmas)
 {
 	int flags = 0;
 
@@ -1250,7 +1251,8 @@ unsigned long do_mmap_pgoff(struct file *file,
 			    unsigned long len,
 			    unsigned long prot,
 			    unsigned long flags,
-			    unsigned long pgoff)
+			    unsigned long pgoff,
+			    unsigned long *populate)
 {
 	struct vm_area_struct *vma;
 	struct vm_region *region;
@@ -1260,6 +1262,8 @@ unsigned long do_mmap_pgoff(struct file *file,
 
 	kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
 
+	*populate = 0;
+
 	/* decide whether we should attempt the mapping, and if so what sort of
 	 * mapping */
 	ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
@@ -1815,9 +1819,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	return ret;
 }
 
-struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
-			unsigned int foll_flags)
+struct page *follow_page_mask(struct vm_area_struct *vma,
+			      unsigned long address, unsigned int flags,
+			      unsigned int *page_mask)
 {
+	*page_mask = 0;
 	return NULL;
 }
 
@@ -1904,7 +1910,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 		 */
 		free -= global_page_state(NR_SHMEM);
 
-		free += nr_swap_pages;
+		free += get_nr_swap_pages();
 
 		/*
 		 * Any slabs which are created with the

+ 4 - 2
mm/oom_kill.c

@@ -386,8 +386,10 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 	cpuset_print_task_mems_allowed(current);
 	task_unlock(current);
 	dump_stack();
-	mem_cgroup_print_oom_info(memcg, p);
-	show_mem(SHOW_MEM_FILTER_NODES);
+	if (memcg)
+		mem_cgroup_print_oom_info(memcg, p);
+	else
+		show_mem(SHOW_MEM_FILTER_NODES);
 	if (sysctl_oom_dump_tasks)
 		dump_tasks(memcg, nodemask);
 }

+ 3 - 0
mm/page-writeback.c

@@ -241,6 +241,9 @@ static unsigned long global_dirtyable_memory(void)
 	if (!vm_highmem_is_dirtyable)
 		x -= highmem_dirtyable_memory(x);
 
+	/* Subtract min_free_kbytes */
+	x -= min_t(unsigned long, x, min_free_kbytes >> (PAGE_SHIFT - 10));
+
 	return x + 1;	/* Ensure that we never return 0 */
 }
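
The shift converts min_free_kbytes to pages before subtracting it from the dirtyable estimate; for example, assuming 4 KiB pages (PAGE_SHIFT = 12) and min_free_kbytes = 65536:

	min_free_kbytes >> (PAGE_SHIFT - 10)
	    = 65536 >> 2
	    = 16384 pages  (64 MiB)

The min_t() clamp keeps the subtraction from underflowing x on machines where min_free_kbytes exceeds the dirtyable total.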
 

+ 371 - 68
mm/page_alloc.c

@@ -202,11 +202,18 @@ static unsigned long __meminitdata nr_all_pages;
 static unsigned long __meminitdata dma_reserve;
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+/* Movable memory ranges, will also be used by memblock subsystem. */
+struct movablemem_map movablemem_map = {
+	.acpi = false,
+	.nr_map = 0,
+};
+
 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
 static unsigned long __initdata required_kernelcore;
 static unsigned long __initdata required_movablecore;
 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+static unsigned long __meminitdata zone_movable_limit[MAX_NUMNODES];
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
@@ -240,15 +247,20 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
 	int ret = 0;
 	unsigned seq;
 	unsigned long pfn = page_to_pfn(page);
+	unsigned long sp, start_pfn;
 
 	do {
 		seq = zone_span_seqbegin(zone);
-		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
-			ret = 1;
-		else if (pfn < zone->zone_start_pfn)
+		start_pfn = zone->zone_start_pfn;
+		sp = zone->spanned_pages;
+		if (!zone_spans_pfn(zone, pfn))
 			ret = 1;
 	} while (zone_span_seqretry(zone, seq));
 
+	if (ret)
+		pr_err("page %lu outside zone [ %lu - %lu ]\n",
+			pfn, start_pfn, start_pfn + sp);
+
 	return ret;
 }
 
@@ -288,7 +300,7 @@ static void bad_page(struct page *page)
 
 	/* Don't complain about poisoned pages */
 	if (PageHWPoison(page)) {
-		reset_page_mapcount(page); /* remove PageBuddy */
+		page_mapcount_reset(page); /* remove PageBuddy */
 		return;
 	}
 
@@ -320,7 +332,7 @@ static void bad_page(struct page *page)
 	dump_stack();
 out:
 	/* Leave bad fields for debug, except PageBuddy could make trouble */
-	reset_page_mapcount(page); /* remove PageBuddy */
+	page_mapcount_reset(page); /* remove PageBuddy */
 	add_taint(TAINT_BAD_PAGE);
 }
 
@@ -533,6 +545,8 @@ static inline void __free_one_page(struct page *page,
 	unsigned long uninitialized_var(buddy_idx);
 	struct page *buddy;
 
+	VM_BUG_ON(!zone_is_initialized(zone));
+
 	if (unlikely(PageCompound(page)))
 		if (unlikely(destroy_compound_page(page, order)))
 			return;
@@ -606,7 +620,7 @@ static inline int free_pages_check(struct page *page)
 		bad_page(page);
 		return 1;
 	}
-	reset_page_last_nid(page);
+	page_nid_reset_last(page);
 	if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
 		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 	return 0;
@@ -666,7 +680,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
 			__free_one_page(page, zone, 0, mt);
 			trace_mm_page_pcpu_drain(page, 0, mt);
-			if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) {
+			if (likely(!is_migrate_isolate_page(page))) {
 				__mod_zone_page_state(zone, NR_FREE_PAGES, 1);
 				if (is_migrate_cma(mt))
 					__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
@@ -684,7 +698,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
 	zone->pages_scanned = 0;
 
 	__free_one_page(page, zone, order, migratetype);
-	if (unlikely(migratetype != MIGRATE_ISOLATE))
+	if (unlikely(!is_migrate_isolate(migratetype)))
 		__mod_zone_freepage_state(zone, 1 << order, migratetype);
 	spin_unlock(&zone->lock);
 }
@@ -916,7 +930,9 @@ static int fallbacks[MIGRATE_TYPES][4] = {
 	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,   MIGRATE_RESERVE },
 #endif
 	[MIGRATE_RESERVE]     = { MIGRATE_RESERVE }, /* Never used */
+#ifdef CONFIG_MEMORY_ISOLATION
 	[MIGRATE_ISOLATE]     = { MIGRATE_RESERVE }, /* Never used */
+#endif
 };
 
 /*
@@ -981,9 +997,9 @@ int move_freepages_block(struct zone *zone, struct page *page,
 	end_pfn = start_pfn + pageblock_nr_pages - 1;
 
 	/* Do not cross zone boundaries */
-	if (start_pfn < zone->zone_start_pfn)
+	if (!zone_spans_pfn(zone, start_pfn))
 		start_page = page;
-	if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
+	if (!zone_spans_pfn(zone, end_pfn))
 		return 0;
 
 	return move_freepages(zone, start_page, end_page, migratetype);
@@ -1142,7 +1158,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 			list_add_tail(&page->lru, list);
 		if (IS_ENABLED(CONFIG_CMA)) {
 			mt = get_pageblock_migratetype(page);
-			if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
+			if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
 				mt = migratetype;
 		}
 		set_freepage_migratetype(page, mt);
@@ -1277,7 +1293,7 @@ void mark_free_pages(struct zone *zone)
 
 	spin_lock_irqsave(&zone->lock, flags);
 
-	max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+	max_zone_pfn = zone_end_pfn(zone);
 	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
 		if (pfn_valid(pfn)) {
 			struct page *page = pfn_to_page(pfn);
@@ -1326,7 +1342,7 @@ void free_hot_cold_page(struct page *page, int cold)
 	 * excessively into the page allocator
 	 */
 	if (migratetype >= MIGRATE_PCPTYPES) {
-		if (unlikely(migratetype == MIGRATE_ISOLATE)) {
+		if (unlikely(is_migrate_isolate(migratetype))) {
 			free_one_page(zone, page, 0, migratetype);
 			goto out;
 		}
@@ -1400,7 +1416,7 @@ static int __isolate_free_page(struct page *page, unsigned int order)
 	zone = page_zone(page);
 	mt = get_pageblock_migratetype(page);
 
-	if (mt != MIGRATE_ISOLATE) {
+	if (!is_migrate_isolate(mt)) {
 		/* Obey watermarks as if the page was being allocated */
 		watermark = low_wmark_pages(zone) + (1 << order);
 		if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
@@ -1419,7 +1435,7 @@ static int __isolate_free_page(struct page *page, unsigned int order)
 		struct page *endpage = page + (1 << order) - 1;
 		for (; page < endpage; page += pageblock_nr_pages) {
 			int mt = get_pageblock_migratetype(page);
-			if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt))
+			if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
 				set_pageblock_migratetype(page,
 							  MIGRATE_MOVABLE);
 		}
@@ -2615,10 +2631,17 @@ retry_cpuset:
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
 			zonelist, high_zoneidx, alloc_flags,
 			preferred_zone, migratetype);
-	if (unlikely(!page))
+	if (unlikely(!page)) {
+		/*
+		 * Runtime PM, block IO and its error handling path
+		 * can deadlock because I/O on the device might not
+		 * complete.
+		 */
+		gfp_mask = memalloc_noio_flags(gfp_mask);
 		page = __alloc_pages_slowpath(gfp_mask, order,
 				zonelist, high_zoneidx, nodemask,
 				preferred_zone, migratetype);
+	}
 
 	trace_mm_page_alloc(page, order, gfp_mask, migratetype);
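
memalloc_noio_flags() clears the I/O-related GFP bits when the current task has flagged itself as running in a no-I/O context, so everything the slow path allocates behaves as GFP_NOIO. A hedged sketch of how a runtime-PM resume path might set that flag, assuming the memalloc_noio_save()/memalloc_noio_restore() pair that accompanies memalloc_noio_flags() (not part of the hunk shown; the resume helper is illustrative):

	unsigned int noio_flags;

	noio_flags = memalloc_noio_save();
	ret = my_device_resume(dev);		/* must not deadlock on its own I/O */
	memalloc_noio_restore(noio_flags);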
 
@@ -2790,18 +2813,27 @@ void free_pages_exact(void *virt, size_t size)
 }
 EXPORT_SYMBOL(free_pages_exact);
 
-static unsigned int nr_free_zone_pages(int offset)
+/**
+ * nr_free_zone_pages - count number of pages beyond high watermark
+ * @offset: The zone index of the highest zone
+ *
+ * nr_free_zone_pages() counts the number of pages which are beyond the
+ * high watermark within all zones at or below a given zone index.  For each
+ * zone, the number of pages is calculated as:
+ *     managed_pages - high_pages
+ */
+static unsigned long nr_free_zone_pages(int offset)
 {
 	struct zoneref *z;
 	struct zone *zone;
 
 	/* Just pick one node, since fallback list is circular */
-	unsigned int sum = 0;
+	unsigned long sum = 0;
 
 	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
 
 	for_each_zone_zonelist(zone, z, zonelist, offset) {
-		unsigned long size = zone->present_pages;
+		unsigned long size = zone->managed_pages;
 		unsigned long high = high_wmark_pages(zone);
 		if (size > high)
 			sum += size - high;
@@ -2810,19 +2842,25 @@ static unsigned int nr_free_zone_pages(int offset)
 	return sum;
 }
 
-/*
- * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
+/**
+ * nr_free_buffer_pages - count number of pages beyond high watermark
+ *
+ * nr_free_buffer_pages() counts the number of pages which are beyond the high
+ * watermark within ZONE_DMA and ZONE_NORMAL.
  */
-unsigned int nr_free_buffer_pages(void)
+unsigned long nr_free_buffer_pages(void)
 {
 	return nr_free_zone_pages(gfp_zone(GFP_USER));
 }
 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
 
-/*
- * Amount of free RAM allocatable within all zones
+/**
+ * nr_free_pagecache_pages - count number of pages beyond high watermark
+ *
+ * nr_free_pagecache_pages() counts the number of pages which are beyond the
+ * high watermark within all zones.
  */
-unsigned int nr_free_pagecache_pages(void)
+unsigned long nr_free_pagecache_pages(void)
 {
 	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
 }
@@ -2854,7 +2892,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 	val->totalram = pgdat->node_present_pages;
 	val->freeram = node_page_state(nid, NR_FREE_PAGES);
 #ifdef CONFIG_HIGHMEM
-	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
+	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
 	val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
 			NR_FREE_PAGES);
 #else
@@ -2897,7 +2935,9 @@ static void show_migration_types(unsigned char type)
 #ifdef CONFIG_CMA
 		[MIGRATE_CMA]		= 'C',
 #endif
+#ifdef CONFIG_MEMORY_ISOLATION
 		[MIGRATE_ISOLATE]	= 'I',
+#endif
 	};
 	char tmp[MIGRATE_TYPES + 1];
 	char *p = tmp;
@@ -3236,7 +3276,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
 {
 	int n, val;
 	int min_val = INT_MAX;
-	int best_node = -1;
+	int best_node = NUMA_NO_NODE;
 	const struct cpumask *tmp = cpumask_of_node(0);
 
 	/* Use the local node if we haven't already */
@@ -3780,7 +3820,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
 	 * the block.
 	 */
 	start_pfn = zone->zone_start_pfn;
-	end_pfn = start_pfn + zone->spanned_pages;
+	end_pfn = zone_end_pfn(zone);
 	start_pfn = roundup(start_pfn, pageblock_nr_pages);
 	reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
 							pageblock_order;
@@ -3876,8 +3916,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		set_page_links(page, zone, nid, pfn);
 		mminit_verify_page_links(page, zone, nid, pfn);
 		init_page_count(page);
-		reset_page_mapcount(page);
-		reset_page_last_nid(page);
+		page_mapcount_reset(page);
+		page_nid_reset_last(page);
 		SetPageReserved(page);
 		/*
 		 * Mark the block movable so that blocks are reserved for
@@ -3894,7 +3934,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		 * pfn out of zone.
 		 */
 		if ((z->zone_start_pfn <= pfn)
-		    && (pfn < z->zone_start_pfn + z->spanned_pages)
+		    && (pfn < zone_end_pfn(z))
 		    && !(pfn & (pageblock_nr_pages - 1)))
 			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
 
@@ -3932,7 +3972,7 @@ static int __meminit zone_batchsize(struct zone *zone)
 	 *
 	 * OK, so we don't know how big the cache is.  So guess.
 	 */
-	batch = zone->present_pages / 1024;
+	batch = zone->managed_pages / 1024;
 	if (batch * PAGE_SIZE > 512 * 1024)
 		batch = (512 * 1024) / PAGE_SIZE;
 	batch /= 4;		/* We effectively *= 4 below */
@@ -4016,7 +4056,7 @@ static void __meminit setup_zone_pageset(struct zone *zone)
 
 		if (percpu_pagelist_fraction)
 			setup_pagelist_highmark(pcp,
-				(zone->present_pages /
+				(zone->managed_pages /
 					percpu_pagelist_fraction));
 	}
 }
@@ -4372,6 +4412,77 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
 	return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
 }
 
+/**
+ * sanitize_zone_movable_limit - Sanitize the zone_movable_limit array.
+ *
+ * zone_movable_limit is initialized as 0. This function will try to get
+ * the first ZONE_MOVABLE pfn of each node from movablemem_map, and
+ * assign them to zone_movable_limit.
+ * zone_movable_limit[nid] == 0 means no limit for the node.
+ *
+ * Note: Each range is represented as [start_pfn, end_pfn)
+ */
+static void __meminit sanitize_zone_movable_limit(void)
+{
+	int map_pos = 0, i, nid;
+	unsigned long start_pfn, end_pfn;
+
+	if (!movablemem_map.nr_map)
+		return;
+
+	/* Iterate all ranges from minimum to maximum */
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+		/*
+		 * If we have found lowest pfn of ZONE_MOVABLE of the node
+		 * specified by user, just go on to check next range.
+		 */
+		if (zone_movable_limit[nid])
+			continue;
+
+#ifdef CONFIG_ZONE_DMA
+		/* Skip DMA memory. */
+		if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA])
+			start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA];
+#endif
+
+#ifdef CONFIG_ZONE_DMA32
+		/* Skip DMA32 memory. */
+		if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA32])
+			start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA32];
+#endif
+
+#ifdef CONFIG_HIGHMEM
+		/* Skip lowmem if ZONE_MOVABLE is highmem. */
+		if (zone_movable_is_highmem() &&
+		    start_pfn < arch_zone_lowest_possible_pfn[ZONE_HIGHMEM])
+			start_pfn = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM];
+#endif
+
+		if (start_pfn >= end_pfn)
+			continue;
+
+		while (map_pos < movablemem_map.nr_map) {
+			if (end_pfn <= movablemem_map.map[map_pos].start_pfn)
+				break;
+
+			if (start_pfn >= movablemem_map.map[map_pos].end_pfn) {
+				map_pos++;
+				continue;
+			}
+
+			/*
+			 * The start_pfn of ZONE_MOVABLE is either the minimum
+			 * pfn specified by movablemem_map, or 0, which means
+			 * the node has no ZONE_MOVABLE.
+			 */
+			zone_movable_limit[nid] = max(start_pfn,
+					movablemem_map.map[map_pos].start_pfn);
+
+			break;
+		}
+	}
+}
+
 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
 					unsigned long zone_type,
@@ -4389,7 +4500,6 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
 
 	return zholes_size[zone_type];
 }
-
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
@@ -4573,7 +4683,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		nr_all_pages += freesize;
 
 		zone->spanned_pages = size;
-		zone->present_pages = freesize;
+		zone->present_pages = realsize;
 		/*
 		 * Set an approximate value for lowmem here, it will be adjusted
 		 * when the bootmem allocator frees pages into the buddy system.
@@ -4625,7 +4735,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
 		 * for the buddy allocator to function correctly.
 		 */
 		start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
-		end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
+		end = pgdat_end_pfn(pgdat);
 		end = ALIGN(end, MAX_ORDER_NR_PAGES);
 		size =  (end - start) * sizeof(struct page);
 		map = alloc_remap(pgdat->node_id, size);
@@ -4831,12 +4941,19 @@ static void __init find_zone_movable_pfns_for_nodes(void)
 		required_kernelcore = max(required_kernelcore, corepages);
 	}
 
-	/* If kernelcore was not specified, there is no ZONE_MOVABLE */
-	if (!required_kernelcore)
+	/*
+	 * If neither kernelcore/movablecore nor movablemem_map is specified,
+	 * there is no ZONE_MOVABLE. But if movablemem_map is specified, the
+	 * start pfn of ZONE_MOVABLE has been stored in zone_movable_limit[].
+	 */
+	if (!required_kernelcore) {
+		if (movablemem_map.nr_map)
+			memcpy(zone_movable_pfn, zone_movable_limit,
+				sizeof(zone_movable_pfn));
 		goto out;
+	}
 
 	/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
-	find_usable_zone_for_movable();
 	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
 
 restart:
@@ -4864,10 +4981,24 @@ restart:
 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
 			unsigned long size_pages;
 
+			/*
+			 * Find more memory for kernelcore in
+			 * [zone_movable_pfn[nid], zone_movable_limit[nid]).
+			 */
 			start_pfn = max(start_pfn, zone_movable_pfn[nid]);
 			if (start_pfn >= end_pfn)
 				continue;
 
+			if (zone_movable_limit[nid]) {
+				end_pfn = min(end_pfn, zone_movable_limit[nid]);
+				/* No range left for kernelcore in this node */
+				if (start_pfn >= end_pfn) {
+					zone_movable_pfn[nid] =
+							zone_movable_limit[nid];
+					break;
+				}
+			}
+
 			/* Account for what is only usable for kernelcore */
 			if (start_pfn < usable_startpfn) {
 				unsigned long kernel_pages;
@@ -4927,12 +5058,12 @@ restart:
 	if (usable_nodes && required_kernelcore > usable_nodes)
 		goto restart;
 
+out:
 	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
 	for (nid = 0; nid < MAX_NUMNODES; nid++)
 		zone_movable_pfn[nid] =
 			roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
 
-out:
 	/* restore the node_state */
 	node_states[N_MEMORY] = saved_node_state;
 }
@@ -4995,6 +5126,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 
 	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
 	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
+	find_usable_zone_for_movable();
+	sanitize_zone_movable_limit();
 	find_zone_movable_pfns_for_nodes();
 
 	/* Print out the zone ranges */
@@ -5078,6 +5211,181 @@ static int __init cmdline_parse_movablecore(char *p)
 early_param("kernelcore", cmdline_parse_kernelcore);
 early_param("movablecore", cmdline_parse_movablecore);
 
+/**
+ * movablemem_map_overlap() - Check if a range overlaps movablemem_map.map[].
+ * @start_pfn:	start pfn of the range to be checked
+ * @end_pfn: 	end pfn of the range to be checked (exclusive)
+ *
+ * This function checks if a given memory range [start_pfn, end_pfn) overlaps
+ * the movablemem_map.map[] array.
+ *
+ * Return: index of the first overlapping element in movablemem_map.map[]
+ *         or -1 if there is no overlap.
+ */
+int __init movablemem_map_overlap(unsigned long start_pfn,
+				   unsigned long end_pfn)
+{
+	int overlap;
+
+	if (!movablemem_map.nr_map)
+		return -1;
+
+	for (overlap = 0; overlap < movablemem_map.nr_map; overlap++)
+		if (start_pfn < movablemem_map.map[overlap].end_pfn)
+			break;
+
+	if (overlap == movablemem_map.nr_map ||
+	    end_pfn <= movablemem_map.map[overlap].start_pfn)
+		return -1;
+
+	return overlap;
+}
+
+/**
+ * insert_movablemem_map - Insert a memory range into movablemem_map.map.
+ * @start_pfn:	start pfn of the range
+ * @end_pfn:	end pfn of the range
+ *
+ * This function will also merge overlapping ranges, and keep the array
+ * sorted by start_pfn in monotonically increasing order.
+ */
+void __init insert_movablemem_map(unsigned long start_pfn,
+				  unsigned long end_pfn)
+{
+	int pos, overlap;
+
+	/*
+	 * pos will be at the 1st overlapped range, or the position
+	 * where the element should be inserted.
+	 */
+	for (pos = 0; pos < movablemem_map.nr_map; pos++)
+		if (start_pfn <= movablemem_map.map[pos].end_pfn)
+			break;
+
+	/* If there is no overlapped range, just insert the element. */
+	if (pos == movablemem_map.nr_map ||
+	    end_pfn < movablemem_map.map[pos].start_pfn) {
+		/*
+		 * If pos is not the end of array, we need to move all
+		 * the rest elements backward.
+		 */
+		if (pos < movablemem_map.nr_map)
+			memmove(&movablemem_map.map[pos+1],
+				&movablemem_map.map[pos],
+				sizeof(struct movablemem_entry) *
+				(movablemem_map.nr_map - pos));
+		movablemem_map.map[pos].start_pfn = start_pfn;
+		movablemem_map.map[pos].end_pfn = end_pfn;
+		movablemem_map.nr_map++;
+		return;
+	}
+
+	/* overlap will be at the last overlapped range */
+	for (overlap = pos + 1; overlap < movablemem_map.nr_map; overlap++)
+		if (end_pfn < movablemem_map.map[overlap].start_pfn)
+			break;
+
+	/*
+	 * If there are more ranges overlapped, we need to merge them,
+	 * and move the rest elements forward.
+	 */
+	overlap--;
+	movablemem_map.map[pos].start_pfn = min(start_pfn,
+					movablemem_map.map[pos].start_pfn);
+	movablemem_map.map[pos].end_pfn = max(end_pfn,
+					movablemem_map.map[overlap].end_pfn);
+
+	if (pos != overlap && overlap + 1 != movablemem_map.nr_map)
+		memmove(&movablemem_map.map[pos+1],
+			&movablemem_map.map[overlap+1],
+			sizeof(struct movablemem_entry) *
+			(movablemem_map.nr_map - overlap - 1));
+
+	movablemem_map.nr_map -= overlap - pos;
+}
+
+/**
+ * movablemem_map_add_region - Add a memory range into movablemem_map.
+ * @start:	physical start address of range
+ * @size:	size of the range in bytes
+ *
+ * This function transforms the physical address range into a pfn range, and
+ * then adds it to movablemem_map by calling insert_movablemem_map().
+ */
+static void __init movablemem_map_add_region(u64 start, u64 size)
+{
+	unsigned long start_pfn, end_pfn;
+
+	/* In case size == 0 or start + size overflows */
+	if (start + size <= start)
+		return;
+
+	if (movablemem_map.nr_map >= ARRAY_SIZE(movablemem_map.map)) {
+		pr_err("movablemem_map: too many entries;"
+			" ignoring [mem %#010llx-%#010llx]\n",
+			(unsigned long long) start,
+			(unsigned long long) (start + size - 1));
+		return;
+	}
+
+	start_pfn = PFN_DOWN(start);
+	end_pfn = PFN_UP(start + size);
+	insert_movablemem_map(start_pfn, end_pfn);
+}
+
+/*
+ * cmdline_parse_movablemem_map - Parse boot option movablemem_map.
+ * @p:	The boot option of the following format:
+ *	movablemem_map=nn[KMG]@ss[KMG]
+ *
+ * This option sets the memory range [ss, ss+nn) to be used as movable memory.
+ *
+ * Return: 0 on success or -EINVAL on failure.
+ */
+static int __init cmdline_parse_movablemem_map(char *p)
+{
+	char *oldp;
+	u64 start_at, mem_size;
+
+	if (!p)
+		goto err;
+
+	if (!strcmp(p, "acpi"))
+		movablemem_map.acpi = true;
+
+	/*
+	 * If the user decides to use info from the BIOS, all other
+	 * user-specified ranges will be ignored.
+	 */
+	if (movablemem_map.acpi) {
+		if (movablemem_map.nr_map) {
+			memset(movablemem_map.map, 0,
+				sizeof(struct movablemem_entry)
+				* movablemem_map.nr_map);
+			movablemem_map.nr_map = 0;
+		}
+		return 0;
+	}
+
+	oldp = p;
+	mem_size = memparse(p, &p);
+	if (p == oldp)
+		goto err;
+
+	if (*p == '@') {
+		oldp = ++p;
+		start_at = memparse(p, &p);
+		if (p == oldp || *p != '\0')
+			goto err;
+
+		movablemem_map_add_region(start_at, mem_size);
+		return 0;
+	}
+err:
+	return -EINVAL;
+}
+early_param("movablemem_map", cmdline_parse_movablemem_map);
+
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
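
Taken together, the parser accepts either an explicit range or the 'acpi' keyword, and may be given more than once; the sizes below are only illustrative:

	movablemem_map=4G@16G
	movablemem_map=acpi

The first form marks the physical range [16 GiB, 20 GiB) as movable-only memory. The second form tells the kernel to take the movable ranges from firmware-provided information instead, and any explicitly specified ranges are discarded.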
 
 /**
@@ -5160,8 +5468,8 @@ static void calculate_totalreserve_pages(void)
 			/* we treat the high watermark as reserved pages. */
 			max += high_wmark_pages(zone);
 
-			if (max > zone->present_pages)
-				max = zone->present_pages;
+			if (max > zone->managed_pages)
+				max = zone->managed_pages;
 			reserve_pages += max;
 			/*
 			 * Lowmem reserves are not available to
@@ -5193,7 +5501,7 @@ static void setup_per_zone_lowmem_reserve(void)
 	for_each_online_pgdat(pgdat) {
 		for (j = 0; j < MAX_NR_ZONES; j++) {
 			struct zone *zone = pgdat->node_zones + j;
-			unsigned long present_pages = zone->present_pages;
+			unsigned long managed_pages = zone->managed_pages;
 
 			zone->lowmem_reserve[j] = 0;
 
@@ -5207,9 +5515,9 @@ static void setup_per_zone_lowmem_reserve(void)
 					sysctl_lowmem_reserve_ratio[idx] = 1;
 
 				lower_zone = pgdat->node_zones + idx;
-				lower_zone->lowmem_reserve[j] = present_pages /
+				lower_zone->lowmem_reserve[j] = managed_pages /
 					sysctl_lowmem_reserve_ratio[idx];
-				present_pages += lower_zone->present_pages;
+				managed_pages += lower_zone->managed_pages;
 			}
 		}
 	}
@@ -5228,14 +5536,14 @@ static void __setup_per_zone_wmarks(void)
 	/* Calculate total number of !ZONE_HIGHMEM pages */
 	for_each_zone(zone) {
 		if (!is_highmem(zone))
-			lowmem_pages += zone->present_pages;
+			lowmem_pages += zone->managed_pages;
 	}
 
 	for_each_zone(zone) {
 		u64 tmp;
 
 		spin_lock_irqsave(&zone->lock, flags);
-		tmp = (u64)pages_min * zone->present_pages;
+		tmp = (u64)pages_min * zone->managed_pages;
 		do_div(tmp, lowmem_pages);
 		if (is_highmem(zone)) {
 			/*
@@ -5247,13 +5555,10 @@ static void __setup_per_zone_wmarks(void)
 			 * deltas controls asynch page reclaim, and so should
 			 * not be capped for highmem.
 			 */
-			int min_pages;
+			unsigned long min_pages;
 
-			min_pages = zone->present_pages / 1024;
-			if (min_pages < SWAP_CLUSTER_MAX)
-				min_pages = SWAP_CLUSTER_MAX;
-			if (min_pages > 128)
-				min_pages = 128;
+			min_pages = zone->managed_pages / 1024;
+			min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
 			zone->watermark[WMARK_MIN] = min_pages;
 		} else {
 			/*
@@ -5314,7 +5619,7 @@ static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
 	unsigned int gb, ratio;
 
 	/* Zone size in gigabytes */
-	gb = zone->present_pages >> (30 - PAGE_SHIFT);
+	gb = zone->managed_pages >> (30 - PAGE_SHIFT);
 	if (gb)
 		ratio = int_sqrt(10 * gb);
 	else
@@ -5400,7 +5705,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
 		return rc;
 
 	for_each_zone(zone)
-		zone->min_unmapped_pages = (zone->present_pages *
+		zone->min_unmapped_pages = (zone->managed_pages *
 				sysctl_min_unmapped_ratio) / 100;
 	return 0;
 }
@@ -5416,7 +5721,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
 		return rc;
 
 	for_each_zone(zone)
-		zone->min_slab_pages = (zone->present_pages *
+		zone->min_slab_pages = (zone->managed_pages *
 				sysctl_min_slab_ratio) / 100;
 	return 0;
 }
@@ -5458,7 +5763,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
 	for_each_populated_zone(zone) {
 		for_each_possible_cpu(cpu) {
 			unsigned long  high;
-			high = zone->present_pages / percpu_pagelist_fraction;
+			high = zone->managed_pages / percpu_pagelist_fraction;
 			setup_pagelist_highmark(
 				per_cpu_ptr(zone->pageset, cpu), high);
 		}
@@ -5645,8 +5950,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
 	pfn = page_to_pfn(page);
 	bitmap = get_pageblock_bitmap(zone, pfn);
 	bitidx = pfn_to_bitidx(zone, pfn);
-	VM_BUG_ON(pfn < zone->zone_start_pfn);
-	VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
+	VM_BUG_ON(!zone_spans_pfn(zone, pfn));
 
 	for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
 		if (flags & value)
@@ -5744,8 +6048,7 @@ bool is_pageblock_removable_nolock(struct page *page)
 
 	zone = page_zone(page);
 	pfn = page_to_pfn(page);
-	if (zone->zone_start_pfn > pfn ||
-			zone->zone_start_pfn + zone->spanned_pages <= pfn)
+	if (!zone_spans_pfn(zone, pfn))
 		return false;
 
 	return !has_unmovable_pages(zone, page, 0, true);
@@ -5801,14 +6104,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 							&cc->migratepages);
 		cc->nr_migratepages -= nr_reclaimed;
 
-		ret = migrate_pages(&cc->migratepages,
-				    alloc_migrate_target,
-				    0, false, MIGRATE_SYNC,
-				    MR_CMA);
+		ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
+				    0, MIGRATE_SYNC, MR_CMA);
 	}
-
-	putback_movable_pages(&cc->migratepages);
-	return ret > 0 ? 0 : ret;
+	if (ret < 0) {
+		putback_movable_pages(&cc->migratepages);
+		return ret;
+	}
+	return 0;
 }
 
 /**

+ 3 - 3
mm/rmap.c

@@ -105,7 +105,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
 	 */
 	if (rwsem_is_locked(&anon_vma->root->rwsem)) {
 		anon_vma_lock_write(anon_vma);
-		anon_vma_unlock(anon_vma);
+		anon_vma_unlock_write(anon_vma);
 	}
 
 	kmem_cache_free(anon_vma_cachep, anon_vma);
@@ -191,7 +191,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 			avc = NULL;
 		}
 		spin_unlock(&mm->page_table_lock);
-		anon_vma_unlock(anon_vma);
+		anon_vma_unlock_write(anon_vma);
 
 		if (unlikely(allocated))
 			put_anon_vma(allocated);
@@ -308,7 +308,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 	vma->anon_vma = anon_vma;
 	anon_vma_lock_write(anon_vma);
 	anon_vma_chain_link(vma, avc, anon_vma);
-	anon_vma_unlock(anon_vma);
+	anon_vma_unlock_write(anon_vma);
 
 	return 0;
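
The rename gives the write side a symmetric pair, so callers now read as:

	anon_vma_lock_write(anon_vma);
	/* ... mutate the anon_vma chains / interval tree ... */
	anon_vma_unlock_write(anon_vma);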
 
