@@ -423,14 +423,14 @@ static void clear_huge_page(struct page *page,
 	}
 }
 
-static void copy_gigantic_page(struct page *dst, struct page *src,
+static void copy_user_gigantic_page(struct page *dst, struct page *src,
 			   unsigned long addr, struct vm_area_struct *vma)
 {
 	int i;
 	struct hstate *h = hstate_vma(vma);
 	struct page *dst_base = dst;
 	struct page *src_base = src;
-	might_sleep();
+
 	for (i = 0; i < pages_per_huge_page(h); ) {
 		cond_resched();
 		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
@@ -440,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src,
 		src = mem_map_next(src, src_base, i);
 	}
 }
-static void copy_huge_page(struct page *dst, struct page *src,
+
+static void copy_user_huge_page(struct page *dst, struct page *src,
 			   unsigned long addr, struct vm_area_struct *vma)
 {
 	int i;
 	struct hstate *h = hstate_vma(vma);
 
 	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
-		copy_gigantic_page(dst, src, addr, vma);
+		copy_user_gigantic_page(dst, src, addr, vma);
 		return;
 	}
 
@@ -458,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src,
 	}
 }
 
+static void copy_gigantic_page(struct page *dst, struct page *src)
+{
+	int i;
+	struct hstate *h = page_hstate(src);
+	struct page *dst_base = dst;
+	struct page *src_base = src;
+
+	for (i = 0; i < pages_per_huge_page(h); ) {
+		cond_resched();
+		copy_highpage(dst, src);
+
+		i++;
+		dst = mem_map_next(dst, dst_base, i);
+		src = mem_map_next(src, src_base, i);
+	}
+}
+
+void copy_huge_page(struct page *dst, struct page *src)
+{
+	int i;
+	struct hstate *h = page_hstate(src);
+
+	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
+		copy_gigantic_page(dst, src);
+		return;
+	}
+
+	might_sleep();
+	for (i = 0; i < pages_per_huge_page(h); i++) {
+		cond_resched();
+		copy_highpage(dst + i, src + i);
+	}
+}
+
 static void enqueue_huge_page(struct hstate *h, struct page *page)
 {
 	int nid = page_to_nid(page);
@@ -466,11 +501,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
 	h->free_huge_pages_node[nid]++;
 }
 
+static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+{
+	struct page *page;
+
+	if (list_empty(&h->hugepage_freelists[nid]))
+		return NULL;
+	page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
+	list_del(&page->lru);
+	set_page_refcounted(page);
+	h->free_huge_pages--;
+	h->free_huge_pages_node[nid]--;
+	return page;
+}
+
 static struct page *dequeue_huge_page_vma(struct hstate *h,
 				struct vm_area_struct *vma,
 				unsigned long address, int avoid_reserve)
 {
-	int nid;
 	struct page *page = NULL;
 	struct mempolicy *mpol;
 	nodemask_t *nodemask;
@@ -496,19 +544,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 						MAX_NR_ZONES - 1, nodemask) {
-		nid = zone_to_nid(zone);
-		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
-		    !list_empty(&h->hugepage_freelists[nid])) {
-			page = list_entry(h->hugepage_freelists[nid].next,
-					  struct page, lru);
-			list_del(&page->lru);
-			h->free_huge_pages--;
-			h->free_huge_pages_node[nid]--;
-
-			if (!avoid_reserve)
-				decrement_hugepage_resv_vma(h, vma);
-
-			break;
+		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
+			page = dequeue_huge_page_node(h, zone_to_nid(zone));
+			if (page) {
+				if (!avoid_reserve)
+					decrement_hugepage_resv_vma(h, vma);
+				break;
+			}
 		}
 	}
 err:
@@ -770,11 +812,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
 	return ret;
 }
 
-static struct page *alloc_buddy_huge_page(struct hstate *h,
-			struct vm_area_struct *vma, unsigned long address)
+static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
 {
 	struct page *page;
-	unsigned int nid;
+	unsigned int r_nid;
 
 	if (h->order >= MAX_ORDER)
 		return NULL;
@@ -812,9 +853,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 	}
 	spin_unlock(&hugetlb_lock);
 
-	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
-					__GFP_REPEAT|__GFP_NOWARN,
-					huge_page_order(h));
+	if (nid == NUMA_NO_NODE)
+		page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
+				   __GFP_REPEAT|__GFP_NOWARN,
+				   huge_page_order(h));
+	else
+		page = alloc_pages_exact_node(nid,
+			htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
+			__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
 
 	if (page && arch_prepare_hugepage(page)) {
 		__free_pages(page, huge_page_order(h));
@@ -823,19 +869,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 
 	spin_lock(&hugetlb_lock);
 	if (page) {
-		/*
-		 * This page is now managed by the hugetlb allocator and has
-		 * no users -- drop the buddy allocator's reference.
-		 */
-		put_page_testzero(page);
-		VM_BUG_ON(page_count(page));
-		nid = page_to_nid(page);
+		r_nid = page_to_nid(page);
 		set_compound_page_dtor(page, free_huge_page);
 		/*
 		 * We incremented the global counters already
 		 */
-		h->nr_huge_pages_node[nid]++;
-		h->surplus_huge_pages_node[nid]++;
+		h->nr_huge_pages_node[r_nid]++;
+		h->surplus_huge_pages_node[r_nid]++;
 		__count_vm_event(HTLB_BUDDY_PGALLOC);
 	} else {
 		h->nr_huge_pages--;
@@ -847,6 +887,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 	return page;
 }
 
+/*
+ * This allocation function is useful in the context where vma is irrelevant.
+ * E.g. soft-offlining uses this function because it only cares physical
+ * address of error page.
+ */
+struct page *alloc_huge_page_node(struct hstate *h, int nid)
+{
+	struct page *page;
+
+	spin_lock(&hugetlb_lock);
+	page = dequeue_huge_page_node(h, nid);
+	spin_unlock(&hugetlb_lock);
+
+	if (!page)
+		page = alloc_buddy_huge_page(h, nid);
+
+	return page;
+}
+
 /*
  * Increase the hugetlb pool such that it can accomodate a reservation
  * of size 'delta'.
@@ -871,17 +930,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
 retry:
 	spin_unlock(&hugetlb_lock);
 	for (i = 0; i < needed; i++) {
-		page = alloc_buddy_huge_page(h, NULL, 0);
-		if (!page) {
+		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+		if (!page)
 			/*
 			 * We were not able to allocate enough pages to
 			 * satisfy the entire reservation so we free what
 			 * we've allocated so far.
 			 */
-			spin_lock(&hugetlb_lock);
-			needed = 0;
 			goto free;
-		}
 
 		list_add(&page->lru, &surplus_list);
 	}
@@ -908,31 +964,31 @@ retry:
 	needed += allocated;
 	h->resv_huge_pages += delta;
 	ret = 0;
-free:
+
+	spin_unlock(&hugetlb_lock);
 	/* Free the needed pages to the hugetlb pool */
 	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 		if ((--needed) < 0)
 			break;
 		list_del(&page->lru);
+		/*
+		 * This page is now managed by the hugetlb allocator and has
+		 * no users -- drop the buddy allocator's reference.
+		 */
+		put_page_testzero(page);
+		VM_BUG_ON(page_count(page));
 		enqueue_huge_page(h, page);
 	}
 
 	/* Free unnecessary surplus pages to the buddy allocator */
+free:
 	if (!list_empty(&surplus_list)) {
-		spin_unlock(&hugetlb_lock);
 		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 			list_del(&page->lru);
-			/*
-			 * The page has a reference count of zero already, so
-			 * call free_huge_page directly instead of using
-			 * put_page. This must be done with hugetlb_lock
-			 * unlocked which is safe because free_huge_page takes
-			 * hugetlb_lock before deciding how to free the page.
-			 */
-			free_huge_page(page);
+			put_page(page);
 		}
-		spin_lock(&hugetlb_lock);
 	}
+	spin_lock(&hugetlb_lock);
 
 	return ret;
 }
@@ -1052,14 +1108,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	spin_unlock(&hugetlb_lock);
 
 	if (!page) {
-		page = alloc_buddy_huge_page(h, vma, addr);
+		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 		if (!page) {
 			hugetlb_put_quota(inode->i_mapping, chg);
 			return ERR_PTR(-VM_FAULT_SIGBUS);
 		}
 	}
 
-	set_page_refcounted(page);
 	set_page_private(page, (unsigned long) mapping);
 
 	vma_commit_reservation(h, vma, addr);
@@ -2153,6 +2208,19 @@ nomem:
 	return -ENOMEM;
 }
 
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+	swp_entry_t swp;
+
+	if (huge_pte_none(pte) || pte_present(pte))
+		return 0;
+	swp = pte_to_swp_entry(pte);
+	if (non_swap_entry(swp) && is_migration_entry(swp)) {
+		return 1;
+	} else
+		return 0;
+}
+
 static int is_hugetlb_entry_hwpoisoned(pte_t pte)
 {
 	swp_entry_t swp;
@@ -2383,7 +2451,7 @@ retry_avoidcopy:
 	if (unlikely(anon_vma_prepare(vma)))
 		return VM_FAULT_OOM;
 
-	copy_huge_page(new_page, old_page, address, vma);
+	copy_user_huge_page(new_page, old_page, address, vma);
 	__SetPageUptodate(new_page);
 
 	/*
@@ -2515,21 +2583,19 @@ retry:
 			hugepage_add_new_anon_rmap(page, vma, address);
 		}
 	} else {
+		/*
+		 * If memory error occurs between mmap() and fault, some process
+		 * don't have hwpoisoned swap entry for errored virtual address.
+		 * So we need to block hugepage fault by PG_hwpoison bit check.
+		 */
+		if (unlikely(PageHWPoison(page))) {
+			ret = VM_FAULT_HWPOISON |
+			      VM_FAULT_SET_HINDEX(h - hstates);
+			goto backout_unlocked;
+		}
 		page_dup_rmap(page);
 	}
 
-	/*
-	 * Since memory error handler replaces pte into hwpoison swap entry
-	 * at the time of error handling, a process which reserved but not have
-	 * the mapping to the error hugepage does not have hwpoison swap entry.
-	 * So we need to block accesses from such a process by checking
-	 * PG_hwpoison bit here.
-	 */
-	if (unlikely(PageHWPoison(page))) {
-		ret = VM_FAULT_HWPOISON;
-		goto backout_unlocked;
-	}
-
 	/*
 	 * If we are going to COW a private mapping later, we examine the
 	 * pending reservations for this page now. This will ensure that
@@ -2587,8 +2653,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	ptep = huge_pte_offset(mm, address);
 	if (ptep) {
 		entry = huge_ptep_get(ptep);
-		if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
-			return VM_FAULT_HWPOISON;
+		if (unlikely(is_hugetlb_entry_migration(entry))) {
+			migration_entry_wait(mm, (pmd_t *)ptep, address);
+			return 0;
+		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+			return VM_FAULT_HWPOISON_LARGE |
+				VM_FAULT_SET_HINDEX(h - hstates);
 	}
 
 	ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2878,18 +2948,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 	hugetlb_acct_memory(h, -(chg - freed));
 }
 
+#ifdef CONFIG_MEMORY_FAILURE
+
+/* Should be called in hugetlb_lock */
+static int is_hugepage_on_freelist(struct page *hpage)
+{
+	struct page *page;
+	struct page *tmp;
+	struct hstate *h = page_hstate(hpage);
+	int nid = page_to_nid(hpage);
+
+	list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
+		if (page == hpage)
+			return 1;
+	return 0;
+}
+
 /*
  * This function is called from memory failure code.
  * Assume the caller holds page lock of the head page.
  */
-void __isolate_hwpoisoned_huge_page(struct page *hpage)
+int dequeue_hwpoisoned_huge_page(struct page *hpage)
 {
 	struct hstate *h = page_hstate(hpage);
 	int nid = page_to_nid(hpage);
+	int ret = -EBUSY;
 
 	spin_lock(&hugetlb_lock);
-	list_del(&hpage->lru);
-	h->free_huge_pages--;
-	h->free_huge_pages_node[nid]--;
+	if (is_hugepage_on_freelist(hpage)) {
+		list_del(&hpage->lru);
+		set_page_refcounted(hpage);
+		h->free_huge_pages--;
+		h->free_huge_pages_node[nid]--;
+		ret = 0;
+	}
 	spin_unlock(&hugetlb_lock);
+	return ret;
 }
+#endif
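
For context, the comment above alloc_huge_page_node() says it is meant for callers that have no VMA at hand, such as the soft-offline path picking a migration target for a poisoned hugepage. The sketch below is illustrative only and is not part of the patch: the callback name new_hugepage and its use as a migration "get new page" hook are assumptions for the example; only alloc_huge_page_node(), page_hstate(), compound_head() and page_to_nid() come from the patch or the existing kernel API.

#include <linux/mm.h>
#include <linux/hugetlb.h>

/*
 * Illustrative sketch, not part of the patch: allocate a destination
 * hugepage on the node of the source page, the way a migration
 * callback for soft offlining might. The returned page already holds
 * a reference (taken in dequeue_huge_page_node()/alloc_buddy_huge_page()
 * above), so the caller can hand it straight to the migration code.
 */
static struct page *new_hugepage(struct page *src, unsigned long private,
				 int **result)
{
	/* Prefer the node of the errored page; alloc_huge_page_node()
	 * falls back to the buddy allocator on that node if the hugetlb
	 * free list for it is empty. */
	int nid = page_to_nid(src);

	return alloc_huge_page_node(page_hstate(compound_head(src)), nid);
}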