Jeff Garzik 19 years ago
parent
commit
f0612bbc41
100 changed files with 1117 additions and 1935 deletions
  1. + 0 - 9  Documentation/cachetlb.txt
  2. + 0 - 2  Documentation/kernel-parameters.txt
  3. + 1 - 23  Documentation/m68k/kernel-options.txt
  4. + 3 - 0  arch/alpha/mm/numa.c
  5. + 1 - 5  arch/alpha/mm/remap.c
  6. + 18 - 78  arch/arm/kernel/signal.c
  7. + 9 - 5  arch/arm/kernel/traps.c
  8. + 1 - 5  arch/arm/mm/consistent.c
  9. + 6 - 1  arch/arm/mm/fault-armv.c
  10. + 1 - 3  arch/arm/mm/ioremap.c
  11. + 1 - 14  arch/arm/mm/mm-armv.c
  12. + 9 - 37  arch/arm/oprofile/backtrace.c
  13. + 2 - 16  arch/arm26/mm/memc.c
  14. + 4 - 2  arch/cris/arch-v32/mm/tlb.c
  15. + 1 - 3  arch/cris/mm/ioremap.c
  16. + 1 - 4  arch/frv/mm/dma-alloc.c
  17. + 2 - 2  arch/frv/mm/pgalloc.c
  18. + 7 - 10  arch/i386/kernel/vm86.c
  19. + 2 - 2  arch/i386/mm/discontig.c
  20. + 57 - 5  arch/i386/mm/init.c
  21. + 1 - 3  arch/i386/mm/ioremap.c
  22. + 7 - 4  arch/i386/mm/pgtable.c
  23. + 13 - 25  arch/i386/oprofile/backtrace.c
  24. + 2 - 1  arch/ia64/kernel/perfmon.c
  25. + 6 - 1  arch/ia64/mm/discontig.c
  26. + 7 - 27  arch/ia64/mm/fault.c
  27. + 4 - 9  arch/ia64/mm/init.c
  28. + 2 - 0  arch/ia64/mm/tlb.c
  29. + 8 - 1  arch/m32r/mm/init.c
  30. + 1 - 3  arch/m32r/mm/ioremap.c
  31. + 1 - 23  arch/m68k/Kconfig
  32. + 15 - 901  arch/m68k/atari/stram.c
  33. + 1 - 1  arch/m68k/mm/kmap.c
  34. + 1 - 1  arch/m68k/sun3x/dvma.c
  35. + 0 - 1  arch/mips/kernel/irixelf.c
  36. + 1 - 3  arch/mips/mm/ioremap.c
  37. + 9 - 15  arch/parisc/kernel/cache.c
  38. + 1 - 1  arch/parisc/kernel/pci-dma.c
  39. + 3 - 0  arch/parisc/mm/init.c
  40. + 2 - 4  arch/parisc/mm/ioremap.c
  41. + 1 - 5  arch/ppc/kernel/dma-mapping.c
  42. + 0 - 4  arch/ppc/mm/4xx_mmu.c
  43. + 1 - 3  arch/ppc/mm/pgtable.c
  44. + 7 - 5  arch/ppc64/kernel/vdso.c
  45. + 0 - 5  arch/ppc64/mm/imalloc.c
  46. + 84 - 3  arch/ppc64/mm/init.c
  47. + 1 - 3  arch/s390/mm/ioremap.c
  48. + 23 - 17  arch/sh/mm/fault.c
  49. + 0 - 2  arch/sh/mm/hugetlbpage.c
  50. + 1 - 3  arch/sh/mm/ioremap.c
  51. + 30 - 38  arch/sh64/mm/cache.c
  52. + 12 - 176  arch/sh64/mm/hugetlbpage.c
  53. + 1 - 3  arch/sh64/mm/ioremap.c
  54. + 4 - 3  arch/sparc/mm/generic.c
  55. + 0 - 1  arch/sparc64/kernel/binfmt_aout32.c
  56. + 5 - 4  arch/sparc64/mm/generic.c
  57. + 3 - 4  arch/sparc64/mm/tlb.c
  58. + 0 - 1  arch/um/include/tlb.h
  59. + 5 - 3  arch/um/kernel/process_kern.c
  60. + 1 - 3  arch/um/kernel/skas/mmu.c
  61. + 0 - 36  arch/um/kernel/tt/tlb.c
  62. + 0 - 1  arch/x86_64/ia32/ia32_aout.c
  63. + 1 - 3  arch/x86_64/mm/ioremap.c
  64. + 4 - 1  crypto/api.c
  65. + 5 - 14  crypto/hmac.c
  66. + 17 - 39  crypto/tcrypt.c
  67. + 2 - 3  drivers/acpi/acpi_memhotplug.c
  68. + 1 - 0  drivers/base/Makefile
  69. + 2 - 0  drivers/base/init.c
  70. + 452 - 0  drivers/base/memory.c
  71. + 4 - 8  drivers/md/dm-crypt.c
  72. + 3 - 4  drivers/net/wireless/airo.c
  73. + 0 - 9  drivers/scsi/ahci.c
  74. + 3 - 3  drivers/scsi/arm/scsi.h
  75. + 11 - 35  drivers/scsi/libata-core.c
  76. + 0 - 18  drivers/scsi/sata_mv.c
  77. + 0 - 8  drivers/scsi/sata_sil24.c
  78. + 10 - 7  drivers/scsi/sg.c
  79. + 7 - 3  drivers/scsi/st.c
  80. + 2 - 5  drivers/usb/misc/usbtest.c
  81. + 2 - 2  fs/afs/file.c
  82. + 0 - 1  fs/binfmt_aout.c
  83. + 0 - 1  fs/binfmt_elf.c
  84. + 0 - 7  fs/binfmt_elf_fdpic.c
  85. + 0 - 1  fs/binfmt_flat.c
  86. + 0 - 1  fs/binfmt_som.c
  87. + 1 - 1  fs/buffer.c
  88. + 0 - 1  fs/compat.c
  89. + 3 - 1  fs/direct-io.c
  90. + 6 - 11  fs/exec.c
  91. + 120 - 86  fs/hugetlbfs/inode.c
  92. + 6 - 6  fs/jfs/jfs_metapage.c
  93. + 1 - 1  fs/proc/array.c
  94. + 31 - 20  fs/proc/task_mmu.c
  95. + 4 - 3  fs/xfs/linux-2.6/xfs_buf.c
  96. + 2 - 0  include/asm-alpha/barrier.h
  97. + 5 - 0  include/asm-alpha/rwsem.h
  98. + 3 - 20  include/asm-arm/tlb.h
  99. + 16 - 31  include/asm-arm26/tlb.h
  100. + 3 - 8  include/asm-generic/4level-fixup.h

+ 0 - 9
Documentation/cachetlb.txt

@@ -49,9 +49,6 @@ changes occur:
 	page table operations such as what happens during
 	fork, and exec.
 
-	Platform developers note that generic code will always
-	invoke this interface without mm->page_table_lock held.
-
 3) void flush_tlb_range(struct vm_area_struct *vma,
 			unsigned long start, unsigned long end)
 
@@ -72,9 +69,6 @@ changes occur:
 	call flush_tlb_page (see below) for each entry which may be
 	modified.
 
-	Platform developers note that generic code will always
-	invoke this interface with mm->page_table_lock held.
-
 4) void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
 
 	This time we need to remove the PAGE_SIZE sized translation
@@ -93,9 +87,6 @@ changes occur:
 
 	This is used primarily during fault processing.
 
-	Platform developers note that generic code will always
-	invoke this interface with mm->page_table_lock held.
-
 5) void flush_tlb_pgtables(struct mm_struct *mm,
 			   unsigned long start, unsigned long end)
 

+ 0 - 2
Documentation/kernel-parameters.txt

@@ -1460,8 +1460,6 @@ running once the system is up.
 	stifb=		[HW]
 			Format: bpp:<bpp1>[:<bpp2>[:<bpp3>...]]
 
-	stram_swap=	[HW,M68k]
-
 	swiotlb=	[IA-64] Number of I/O TLB slabs
 
 	switches=	[HW,M68k]

+ 1 - 23
Documentation/m68k/kernel-options.txt

@@ -626,7 +626,7 @@ ignored (others aren't affected).
     can be performed in optimal order. Not all SCSI devices support
     tagged queuing (:-().
 
-4.6 switches=
+4.5 switches=
 -------------
 
 Syntax: switches=<list of switches>
@@ -661,28 +661,6 @@ correctly.
 earlier initialization ("ov_"-less) takes precedence. But the
 switching-off on reset still happens in this case.
 
-4.5) stram_swap=
-----------------
-
-Syntax: stram_swap=<do_swap>[,<max_swap>]
-
-  This option is available only if the kernel has been compiled with
-CONFIG_STRAM_SWAP enabled. Normally, the kernel then determines
-dynamically whether to actually use ST-RAM as swap space. (Currently,
-the fraction of ST-RAM must be less or equal 1/3 of total memory to
-enable this swapping.) You can override the kernel's decision by
-specifying this option. 1 for <do_swap> means always enable the swap,
-even if you have less alternate RAM. 0 stands for never swap to
-ST-RAM, even if it's small enough compared to the rest of memory.
-
-  If ST-RAM swapping is enabled, the kernel usually uses all free
-ST-RAM as swap "device". If the kernel resides in ST-RAM, the region
-allocated by it is obviously never used for swapping :-) You can also
-limit this amount by specifying the second parameter, <max_swap>, if
-you want to use parts of ST-RAM as normal system memory. <max_swap> is
-in kBytes and the number should be a multiple of 4 (otherwise: rounded
-down).
-
 5) Options for Amiga Only:
 ==========================
 

+ 3 - 0
arch/alpha/mm/numa.c

@@ -371,6 +371,8 @@ show_mem(void)
 	show_free_areas();
 	printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
 	for_each_online_node(nid) {
+		unsigned long flags;
+		pgdat_resize_lock(NODE_DATA(nid), &flags);
 		i = node_spanned_pages(nid);
 		while (i-- > 0) {
 			struct page *page = nid_page_nr(nid, i);
@@ -384,6 +386,7 @@ show_mem(void)
 			else
 				shared += page_count(page) - 1;
 		}
+		pgdat_resize_unlock(NODE_DATA(nid), &flags);
 	}
 	printk("%ld pages of RAM\n",total);
 	printk("%ld free pages\n",free);

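show_mem() and similar reporting paths now take the node's resize lock while scanning its page range, so a concurrent memory hot-add cannot change node_spanned_pages under them. A minimal sketch of the pattern, assuming the same NODE_DATA()/nid_page_nr() helpers used above (count_node_reserved() itself is a hypothetical helper, not code from this commit):

	#include <linux/mm.h>
	#include <linux/memory_hotplug.h>

	/* illustrative helper, not from this commit */
	static unsigned long count_node_reserved(int nid)
	{
		unsigned long i, flags, reserved = 0;

		/* keep the node span stable against hot-added sections */
		pgdat_resize_lock(NODE_DATA(nid), &flags);
		for (i = 0; i < NODE_DATA(nid)->node_spanned_pages; i++)
			if (PageReserved(nid_page_nr(nid, i)))
				reserved++;
		pgdat_resize_unlock(NODE_DATA(nid), &flags);

		return reserved;
	}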
+ 1 - 5
arch/alpha/mm/remap.c

@@ -2,7 +2,6 @@
 #include <asm/pgalloc.h>
 #include <asm/cacheflush.h>
 
-/* called with the page_table_lock held */
 static inline void 
 remap_area_pte(pte_t * pte, unsigned long address, unsigned long size, 
 	       unsigned long phys_addr, unsigned long flags)
@@ -31,7 +30,6 @@ remap_area_pte(pte_t * pte, unsigned long address, unsigned long size,
 	} while (address && (address < end));
 }
 
-/* called with the page_table_lock held */
 static inline int 
 remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size, 
 	       unsigned long phys_addr, unsigned long flags)
@@ -46,7 +44,7 @@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size,
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+		pte_t * pte = pte_alloc_kernel(pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, 
@@ -70,7 +68,6 @@ __alpha_remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		pmd_t *pmd;
 		pmd = pmd_alloc(&init_mm, dir, address);
@@ -84,7 +81,6 @@ __alpha_remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
-	spin_unlock(&init_mm.page_table_lock);
 	return error;
 }
 

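pte_alloc_kernel() now takes just (pmd, address) and acquires init_mm.page_table_lock internally, so code that builds kernel mappings no longer holds the lock across the walk. A minimal sketch of the new calling sequence, folding the pud level exactly as the alpha code above does (map_one_kernel_page() and its arguments are hypothetical, shown only to illustrate the convention):

	#include <linux/mm.h>
	#include <asm/pgalloc.h>

	/* illustrative helper, not from this commit */
	static int map_one_kernel_page(unsigned long addr, unsigned long pfn,
				       pgprot_t prot)
	{
		pgd_t *pgd = pgd_offset_k(addr);
		pmd_t *pmd = pmd_alloc(&init_mm, pgd, addr);	/* mm argument stays here */
		pte_t *pte;

		if (!pmd)
			return -ENOMEM;
		pte = pte_alloc_kernel(pmd, addr);	/* no &init_mm, no caller-held lock */
		if (!pte)
			return -ENOMEM;
		set_pte(pte, pfn_pte(pfn, prot));
		return 0;
	}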
+ 18 - 78
arch/arm/kernel/signal.c

@@ -139,93 +139,33 @@ struct iwmmxt_sigframe {
 	unsigned long	storage[0x98/4];
 };
 
-static int page_present(struct mm_struct *mm, void __user *uptr, int wr)
-{
-	unsigned long addr = (unsigned long)uptr;
-	pgd_t *pgd = pgd_offset(mm, addr);
-	if (pgd_present(*pgd)) {
-		pmd_t *pmd = pmd_offset(pgd, addr);
-		if (pmd_present(*pmd)) {
-			pte_t *pte = pte_offset_map(pmd, addr);
-			return (pte_present(*pte) && (!wr || pte_write(*pte)));
-		}
-	}
-	return 0;
-}
-
-static int copy_locked(void __user *uptr, void *kptr, size_t size, int write,
-		       void (*copyfn)(void *, void __user *))
-{
-	unsigned char v, __user *userptr = uptr;
-	int err = 0;
-
-	do {
-		struct mm_struct *mm;
-
-		if (write) {
-			__put_user_error(0, userptr, err);
-			__put_user_error(0, userptr + size - 1, err);
-		} else {
-			__get_user_error(v, userptr, err);
-			__get_user_error(v, userptr + size - 1, err);
-		}
-
-		if (err)
-			break;
-
-		mm = current->mm;
-		spin_lock(&mm->page_table_lock);
-		if (page_present(mm, userptr, write) &&
-		    page_present(mm, userptr + size - 1, write)) {
-		    	copyfn(kptr, uptr);
-		} else
-			err = 1;
-		spin_unlock(&mm->page_table_lock);
-	} while (err);
-
-	return err;
-}
-
 static int preserve_iwmmxt_context(struct iwmmxt_sigframe *frame)
 {
-	int err = 0;
+	char kbuf[sizeof(*frame) + 8];
+	struct iwmmxt_sigframe *kframe;
 
 	/* the iWMMXt context must be 64 bit aligned */
-	WARN_ON((unsigned long)frame & 7);
-
-	__put_user_error(IWMMXT_MAGIC0, &frame->magic0, err);
-	__put_user_error(IWMMXT_MAGIC1, &frame->magic1, err);
-
-	/*
-	 * iwmmxt_task_copy() doesn't check user permissions.
-	 * Let's do a dummy write on the upper boundary to ensure
-	 * access to user mem is OK all way up.
-	 */
-	err |= copy_locked(&frame->storage, current_thread_info(),
-			   sizeof(frame->storage), 1, iwmmxt_task_copy);
-	return err;
+	kframe = (struct iwmmxt_sigframe *)((unsigned long)(kbuf + 8) & ~7);
+	kframe->magic0 = IWMMXT_MAGIC0;
+	kframe->magic1 = IWMMXT_MAGIC1;
+	iwmmxt_task_copy(current_thread_info(), &kframe->storage);
+	return __copy_to_user(frame, kframe, sizeof(*frame));
 }
 
 static int restore_iwmmxt_context(struct iwmmxt_sigframe *frame)
 {
-	unsigned long magic0, magic1;
-	int err = 0;
+	char kbuf[sizeof(*frame) + 8];
+	struct iwmmxt_sigframe *kframe;
 
-	/* the iWMMXt context is 64 bit aligned */
-	WARN_ON((unsigned long)frame & 7);
-
-	/*
-	 * Validate iWMMXt context signature.
-	 * Also, iwmmxt_task_restore() doesn't check user permissions.
-	 * Let's do a dummy write on the upper boundary to ensure
-	 * access to user mem is OK all way up.
-	 */
-	__get_user_error(magic0, &frame->magic0, err);
-	__get_user_error(magic1, &frame->magic1, err);
-	if (!err && magic0 == IWMMXT_MAGIC0 && magic1 == IWMMXT_MAGIC1)
-		err = copy_locked(&frame->storage, current_thread_info(),
-				  sizeof(frame->storage), 0, iwmmxt_task_restore);
-	return err;
+	/* the iWMMXt context must be 64 bit aligned */
+	kframe = (struct iwmmxt_sigframe *)((unsigned long)(kbuf + 8) & ~7);
+	if (__copy_from_user(kframe, frame, sizeof(*frame)))
+		return -1;
+	if (kframe->magic0 != IWMMXT_MAGIC0 ||
+	    kframe->magic1 != IWMMXT_MAGIC1)
+		return -1;
+	iwmmxt_task_restore(current_thread_info(), &kframe->storage);
+	return 0;
 }
 
 #endif

+ 9 - 5
arch/arm/kernel/traps.c

@@ -483,29 +483,33 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs)
 		unsigned long addr = regs->ARM_r2;
 		struct mm_struct *mm = current->mm;
 		pgd_t *pgd; pmd_t *pmd; pte_t *pte;
+		spinlock_t *ptl;
 
 		regs->ARM_cpsr &= ~PSR_C_BIT;
-		spin_lock(&mm->page_table_lock);
+		down_read(&mm->mmap_sem);
 		pgd = pgd_offset(mm, addr);
 		if (!pgd_present(*pgd))
 			goto bad_access;
 		pmd = pmd_offset(pgd, addr);
 		if (!pmd_present(*pmd))
 			goto bad_access;
-		pte = pte_offset_map(pmd, addr);
-		if (!pte_present(*pte) || !pte_write(*pte))
+		pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+		if (!pte_present(*pte) || !pte_write(*pte)) {
+			pte_unmap_unlock(pte, ptl);
 			goto bad_access;
+		}
 		val = *(unsigned long *)addr;
 		val -= regs->ARM_r0;
 		if (val == 0) {
 			*(unsigned long *)addr = regs->ARM_r1;
 			regs->ARM_cpsr |= PSR_C_BIT;
 		}
-		spin_unlock(&mm->page_table_lock);
+		pte_unmap_unlock(pte, ptl);
+		up_read(&mm->mmap_sem);
 		return val;
 
 		bad_access:
-		spin_unlock(&mm->page_table_lock);
+		up_read(&mm->mmap_sem);
 		/* simulate a write access fault */
 		do_DataAbort(addr, 15 + (1 << 11), regs);
 		return -1;

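This hunk shows the central conversion of the series: instead of taking mm->page_table_lock, the walker holds mmap_sem for read and takes the lock that pte_offset_map_lock() returns for the pte page it maps. Condensed into a hypothetical helper that follows the folded pgd/pmd walk used above (peek_user_pte() is illustrative only):

	#include <linux/mm.h>
	#include <asm/pgtable.h>

	/* Return 1 if addr is currently mapped in mm, 0 otherwise (sketch). */
	static int peek_user_pte(struct mm_struct *mm, unsigned long addr)
	{
		pgd_t *pgd;
		pmd_t *pmd;
		pte_t *pte;
		spinlock_t *ptl;
		int present = 0;

		down_read(&mm->mmap_sem);	/* keeps vmas and page tables around */
		pgd = pgd_offset(mm, addr);
		if (pgd_none_or_clear_bad(pgd))
			goto out;
		pmd = pmd_offset(pgd, addr);
		if (pmd_none_or_clear_bad(pmd))
			goto out;
		pte = pte_offset_map_lock(mm, pmd, addr, &ptl);	/* map + lock the pte page */
		present = pte_present(*pte);
		pte_unmap_unlock(pte, ptl);			/* unlock + unmap */
	out:
		up_read(&mm->mmap_sem);
		return present;
	}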
+ 1 - 5
arch/arm/mm/consistent.c

@@ -397,8 +397,6 @@ static int __init consistent_init(void)
 	pte_t *pte;
 	int ret = 0;
 
-	spin_lock(&init_mm.page_table_lock);
-
 	do {
 		pgd = pgd_offset(&init_mm, CONSISTENT_BASE);
 		pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE);
@@ -409,7 +407,7 @@ static int __init consistent_init(void)
 		}
 		WARN_ON(!pmd_none(*pmd));
 
-		pte = pte_alloc_kernel(&init_mm, pmd, CONSISTENT_BASE);
+		pte = pte_alloc_kernel(pmd, CONSISTENT_BASE);
 		if (!pte) {
 			printk(KERN_ERR "%s: no pte tables\n", __func__);
 			ret = -ENOMEM;
@@ -419,8 +417,6 @@ static int __init consistent_init(void)
 		consistent_pte = pte;
 	} while (0);
 
-	spin_unlock(&init_mm.page_table_lock);
-
 	return ret;
 }
 

+ 6 - 1
arch/arm/mm/fault-armv.c

@@ -26,6 +26,11 @@ static unsigned long shared_pte_mask = L_PTE_CACHEABLE;
 /*
  * We take the easy way out of this problem - we make the
  * PTE uncacheable.  However, we leave the write buffer on.
+ *
+ * Note that the pte lock held when calling update_mmu_cache must also
+ * guard the pte (somewhere else in the same mm) that we modify here.
+ * Therefore those configurations which might call adjust_pte (those
+ * without CONFIG_CPU_CACHE_VIPT) cannot support split page_table_lock.
  */
 static int adjust_pte(struct vm_area_struct *vma, unsigned long address)
 {
@@ -127,7 +132,7 @@ void __flush_dcache_page(struct address_space *mapping, struct page *page);
  *  2. If we have multiple shared mappings of the same space in
  *     an object, we need to deal with the cache aliasing issues.
  *
- * Note that the page_table_lock will be held.
+ * Note that the pte lock will be held.
  */
 void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
 {

+ 1 - 3
arch/arm/mm/ioremap.c

@@ -75,7 +75,7 @@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size,
 
 	pgprot = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY | L_PTE_WRITE | flags);
 	do {
-		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+		pte_t * pte = pte_alloc_kernel(pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, pgprot);
@@ -97,7 +97,6 @@ remap_area_pages(unsigned long start, unsigned long phys_addr,
 	phys_addr -= address;
 	dir = pgd_offset(&init_mm, address);
 	BUG_ON(address >= end);
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		pmd_t *pmd = pmd_alloc(&init_mm, dir, address);
 		if (!pmd) {
@@ -114,7 +113,6 @@ remap_area_pages(unsigned long start, unsigned long phys_addr,
 		dir++;
 	} while (address && (address < end));
 
-	spin_unlock(&init_mm.page_table_lock);
 	flush_cache_vmap(start, end);
 	return err;
 }

+ 1 - 14
arch/arm/mm/mm-armv.c

@@ -179,11 +179,6 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
 	clean_dcache_area(new_pgd, PTRS_PER_PGD * sizeof(pgd_t));
 
 	if (!vectors_high()) {
-		/*
-		 * This lock is here just to satisfy pmd_alloc and pte_lock
-		 */
-		spin_lock(&mm->page_table_lock);
-
 		/*
 		 * On ARM, first page must always be allocated since it
 		 * contains the machine vectors.
@@ -201,23 +196,14 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
 		set_pte(new_pte, *init_pte);
 		pte_unmap_nested(init_pte);
 		pte_unmap(new_pte);
-
-		spin_unlock(&mm->page_table_lock);
 	}
 
 	return new_pgd;
 
 no_pte:
-	spin_unlock(&mm->page_table_lock);
 	pmd_free(new_pmd);
-	free_pages((unsigned long)new_pgd, 2);
-	return NULL;
-
 no_pmd:
-	spin_unlock(&mm->page_table_lock);
 	free_pages((unsigned long)new_pgd, 2);
-	return NULL;
-
 no_pgd:
 	return NULL;
 }
@@ -243,6 +229,7 @@ void free_pgd_slow(pgd_t *pgd)
 	pte = pmd_page(*pmd);
 	pmd_clear(pmd);
 	dec_page_state(nr_page_table_pages);
+	pte_lock_deinit(pte);
 	pte_free(pte);
 	pmd_free(pmd);
 free:

+ 9 - 37
arch/arm/oprofile/backtrace.c

@@ -49,42 +49,22 @@ static struct frame_tail* kernel_backtrace(struct frame_tail *tail)
 
 static struct frame_tail* user_backtrace(struct frame_tail *tail)
 {
-	struct frame_tail buftail;
+	struct frame_tail buftail[2];
 
-	/* hardware pte might not be valid due to dirty/accessed bit emulation
-	 * so we use copy_from_user and benefit from exception fixups */
-	if (copy_from_user(&buftail, tail, sizeof(struct frame_tail)))
+	/* Also check accessibility of one struct frame_tail beyond */
+	if (!access_ok(VERIFY_READ, tail, sizeof(buftail)))
+		return NULL;
+	if (__copy_from_user_inatomic(buftail, tail, sizeof(buftail)))
 		return NULL;
 
-	oprofile_add_trace(buftail.lr);
+	oprofile_add_trace(buftail[0].lr);
 
 	/* frame pointers should strictly progress back up the stack
 	 * (towards higher addresses) */
-	if (tail >= buftail.fp)
+	if (tail >= buftail[0].fp)
 		return NULL;
 
-	return buftail.fp-1;
-}
-
-/* Compare two addresses and see if they're on the same page */
-#define CMP_ADDR_EQUAL(x,y,offset) ((((unsigned long) x) >> PAGE_SHIFT) \
-	== ((((unsigned long) y) + offset) >> PAGE_SHIFT))
-
-/* check that the page(s) containing the frame tail are present */
-static int pages_present(struct frame_tail *tail)
-{
-	struct mm_struct * mm = current->mm;
-
-	if (!check_user_page_readable(mm, (unsigned long)tail))
-		return 0;
-
-	if (CMP_ADDR_EQUAL(tail, tail, 8))
-		return 1;
-
-	if (!check_user_page_readable(mm, ((unsigned long)tail) + 8))
-		return 0;
-
-	return 1;
+	return buftail[0].fp-1;
 }
 
 /*
@@ -118,7 +98,6 @@ static int valid_kernel_stack(struct frame_tail *tail, struct pt_regs *regs)
 void arm_backtrace(struct pt_regs * const regs, unsigned int depth)
 {
 	struct frame_tail *tail;
-	unsigned long last_address = 0;
 
 	tail = ((struct frame_tail *) regs->ARM_fp) - 1;
 
@@ -132,13 +111,6 @@ void arm_backtrace(struct pt_regs * const regs, unsigned int depth)
 		return;
 	}
 
-	while (depth-- && tail && !((unsigned long) tail & 3)) {
-		if ((!CMP_ADDR_EQUAL(last_address, tail, 0)
-			|| !CMP_ADDR_EQUAL(last_address, tail, 8))
-				&& !pages_present(tail))
-			return;
-		last_address = (unsigned long) tail;
+	while (depth-- && tail && !((unsigned long) tail & 3))
 		tail = user_backtrace(tail);
-	}
 }
-

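The old presence checks under page_table_lock are gone; the profiler simply attempts the read with access_ok() plus __copy_from_user_inatomic(), which returns nonzero instead of faulting pages in when the frame is not resident. The idiom in isolation (read_user_word() is an illustrative helper, not part of this commit):

	#include <asm/uaccess.h>

	/* Read one word of user memory from a context that must not sleep. */
	static int read_user_word(const unsigned long __user *addr, unsigned long *val)
	{
		if (!access_ok(VERIFY_READ, addr, sizeof(*val)))
			return -EFAULT;
		if (__copy_from_user_inatomic(val, addr, sizeof(*val)))
			return -EFAULT;		/* page not resident or not readable */
		return 0;
	}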
+ 2 - 16
arch/arm26/mm/memc.c

@@ -78,12 +78,6 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
 	if (!new_pgd)
 		goto no_pgd;
 
-	/*
-	 * This lock is here just to satisfy pmd_alloc and pte_lock
-         * FIXME: I bet we could avoid taking it pretty much altogether
-	 */
-	spin_lock(&mm->page_table_lock);
-
 	/*
 	 * On ARM, first page must always be allocated since it contains
 	 * the machine vectors.
@@ -92,7 +86,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
 	if (!new_pmd)
 		goto no_pmd;
 
-	new_pte = pte_alloc_kernel(mm, new_pmd, 0);
+	new_pte = pte_alloc_map(mm, new_pmd, 0);
 	if (!new_pte)
 		goto no_pte;
 
@@ -101,6 +95,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
 	init_pte = pte_offset(init_pmd, 0);
 
 	set_pte(new_pte, *init_pte);
+	pte_unmap(new_pte);
 
 	/*
 	 * the page table entries are zeroed
@@ -112,23 +107,14 @@ pgd_t *get_pgd_slow(struct mm_struct *mm)
 	memcpy(new_pgd + FIRST_KERNEL_PGD_NR, init_pgd + FIRST_KERNEL_PGD_NR,
 		(PTRS_PER_PGD - FIRST_KERNEL_PGD_NR) * sizeof(pgd_t));
 
-	spin_unlock(&mm->page_table_lock);
-
 	/* update MEMC tables */
 	cpu_memc_update_all(new_pgd);
 	return new_pgd;
 
 no_pte:
-	spin_unlock(&mm->page_table_lock);
 	pmd_free(new_pmd);
-	free_pgd_slow(new_pgd);
-	return NULL;
-
 no_pmd:
-	spin_unlock(&mm->page_table_lock);
 	free_pgd_slow(new_pgd);
-	return NULL;
-
 no_pgd:
 	return NULL;
 }

+ 4 - 2
arch/cris/arch-v32/mm/tlb.c

@@ -175,6 +175,8 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 	return 0;
 }
 
+static DEFINE_SPINLOCK(mmu_context_lock);
+
 /* Called in schedule() just before actually doing the switch_to. */
 void
 switch_mm(struct mm_struct *prev, struct mm_struct *next,
@@ -183,10 +185,10 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	int cpu = smp_processor_id();
 
 	/* Make sure there is a MMU context. */
-	spin_lock(&next->page_table_lock);
+	spin_lock(&mmu_context_lock);
 	get_mmu_context(next);
 	cpu_set(cpu, next->cpu_vm_mask);
-	spin_unlock(&next->page_table_lock);
+	spin_unlock(&mmu_context_lock);
 
 	/*
 	 * Remember the pgd for the fault handlers. Keep a seperate copy of it

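The per-mm page_table_lock was only being borrowed here; what actually needs protection is the global MMU-context map, so the patch introduces a file-local lock for it. The shape of that substitution, sketched with illustrative names (get_mmu_context() is the arch helper seen above):

	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(context_map_lock);	/* owned by this file, not by any mm */

	/* illustrative helper, not from this commit */
	static void ensure_mmu_context(struct mm_struct *mm)
	{
		spin_lock(&context_map_lock);
		get_mmu_context(mm);
		spin_unlock(&context_map_lock);
	}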
+ 1 - 3
arch/cris/mm/ioremap.c

@@ -52,7 +52,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+		pte_t * pte = pte_alloc_kernel(pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, prot);
@@ -74,7 +74,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		pud_t *pud;
 		pmd_t *pmd;
@@ -94,7 +93,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
-	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return error;
 }

+ 1 - 4
arch/frv/mm/dma-alloc.c

@@ -55,21 +55,18 @@ static int map_page(unsigned long va, unsigned long pa, pgprot_t prot)
 	pte_t *pte;
 	int err = -ENOMEM;
 
-	spin_lock(&init_mm.page_table_lock);
-
 	/* Use upper 10 bits of VA to index the first level map */
 	pge = pgd_offset_k(va);
 	pue = pud_offset(pge, va);
 	pme = pmd_offset(pue, va);
 
 	/* Use middle 10 bits of VA to index the second-level map */
-	pte = pte_alloc_kernel(&init_mm, pme, va);
+	pte = pte_alloc_kernel(pme, va);
 	if (pte != 0) {
 		err = 0;
 		set_pte(pte, mk_pte_phys(pa & PAGE_MASK, prot));
 	}
 
-	spin_unlock(&init_mm.page_table_lock);
 	return err;
 }
 

+ 2 - 2
arch/frv/mm/pgalloc.c

@@ -87,14 +87,14 @@ static inline void pgd_list_add(pgd_t *pgd)
 	if (pgd_list)
 		pgd_list->private = (unsigned long) &page->index;
 	pgd_list = page;
-	page->private = (unsigned long) &pgd_list;
+	set_page_private(page, (unsigned long)&pgd_list);
 }
 
 static inline void pgd_list_del(pgd_t *pgd)
 {
 	struct page *next, **pprev, *page = virt_to_page(pgd);
 	next = (struct page *) page->index;
-	pprev = (struct page **) page->private;
+	pprev = (struct page **)page_private(page);
 	*pprev = next;
 	if (next)
 		next->private = (unsigned long) pprev;

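Direct stores to page->private are replaced by the new set_page_private()/page_private() accessors, since with split page-table locks that union member can also hold a spinlock for page-table pages. A small sketch of the accessor style on a singly linked list threaded through page->private (the list itself is illustrative):

	#include <linux/mm.h>

	static struct page *priv_list;	/* head of an illustrative list */

	static void priv_list_push(struct page *page)
	{
		set_page_private(page, (unsigned long)priv_list);
		priv_list = page;
	}

	static struct page *priv_list_pop(void)
	{
		struct page *page = priv_list;

		if (page) {
			priv_list = (struct page *)page_private(page);
			set_page_private(page, 0);
		}
		return page;
	}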
+ 7 - 10
arch/i386/kernel/vm86.c

@@ -134,17 +134,16 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
 	return ret;
 }
 
-static void mark_screen_rdonly(struct task_struct * tsk)
+static void mark_screen_rdonly(struct mm_struct *mm)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
-	pte_t *pte, *mapped;
+	pte_t *pte;
+	spinlock_t *ptl;
 	int i;
 
-	preempt_disable();
-	spin_lock(&tsk->mm->page_table_lock);
-	pgd = pgd_offset(tsk->mm, 0xA0000);
+	pgd = pgd_offset(mm, 0xA0000);
 	if (pgd_none_or_clear_bad(pgd))
 		goto out;
 	pud = pud_offset(pgd, 0xA0000);
@@ -153,16 +152,14 @@ static void mark_screen_rdonly(struct task_struct * tsk)
 	pmd = pmd_offset(pud, 0xA0000);
 	if (pmd_none_or_clear_bad(pmd))
 		goto out;
-	pte = mapped = pte_offset_map(pmd, 0xA0000);
+	pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
 	for (i = 0; i < 32; i++) {
 		if (pte_present(*pte))
 			set_pte(pte, pte_wrprotect(*pte));
 		pte++;
 	}
-	pte_unmap(mapped);
+	pte_unmap_unlock(pte, ptl);
 out:
-	spin_unlock(&tsk->mm->page_table_lock);
-	preempt_enable();
 	flush_tlb();
 }
 
@@ -306,7 +303,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 
 	tsk->thread.screen_bitmap = info->screen_bitmap;
 	if (info->flags & VM86_SCREEN_BITMAP)
-		mark_screen_rdonly(tsk);
+		mark_screen_rdonly(tsk->mm);
 	__asm__ __volatile__(
 		"xorl %%eax,%%eax; movl %%eax,%%fs; movl %%eax,%%gs\n\t"
 		"movl %0,%%esp\n\t"

+ 2 - 2
arch/i386/mm/discontig.c

@@ -98,7 +98,7 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
 
 extern unsigned long find_max_low_pfn(void);
 extern void find_max_pfn(void);
-extern void one_highpage_init(struct page *, int, int);
+extern void add_one_highpage_init(struct page *, int, int);
 
 extern struct e820map e820;
 extern unsigned long init_pg_tables_end;
@@ -427,7 +427,7 @@ void __init set_highmem_pages_init(int bad_ppro)
 			if (!pfn_valid(node_pfn))
 				continue;
 			page = pfn_to_page(node_pfn);
-			one_highpage_init(page, node_pfn, bad_ppro);
+			add_one_highpage_init(page, node_pfn, bad_ppro);
 		}
 	}
 	totalram_pages += totalhigh_pages;

+ 57 - 5
arch/i386/mm/init.c

@@ -27,6 +27,7 @@
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
 #include <linux/efi.h>
+#include <linux/memory_hotplug.h>
 
 #include <asm/processor.h>
 #include <asm/system.h>
@@ -266,17 +267,46 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
 	pkmap_page_table = pte;	
 }
 
-void __init one_highpage_init(struct page *page, int pfn, int bad_ppro)
+void __devinit free_new_highpage(struct page *page)
+{
+	set_page_count(page, 1);
+	__free_page(page);
+	totalhigh_pages++;
+}
+
+void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
 {
 	if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
 		ClearPageReserved(page);
-		set_page_count(page, 1);
-		__free_page(page);
-		totalhigh_pages++;
+		free_new_highpage(page);
 	} else
 		SetPageReserved(page);
 }
 
+static int add_one_highpage_hotplug(struct page *page, unsigned long pfn)
+{
+	free_new_highpage(page);
+	totalram_pages++;
+#ifdef CONFIG_FLATMEM
+	max_mapnr = max(pfn, max_mapnr);
+#endif
+	num_physpages++;
+	return 0;
+}
+
+/*
+ * Not currently handling the NUMA case.
+ * Assuming single node and all memory that
+ * has been added dynamically that would be
+ * onlined here is in HIGHMEM
+ */
+void online_page(struct page *page)
+{
+	ClearPageReserved(page);
+	add_one_highpage_hotplug(page, page_to_pfn(page));
+}
+
+
 #ifdef CONFIG_NUMA
 extern void set_highmem_pages_init(int);
 #else
@@ -284,7 +314,7 @@ static void __init set_highmem_pages_init(int bad_ppro)
 {
 	int pfn;
 	for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
-		one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
+		add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
 	totalram_pages += totalhigh_pages;
 }
 #endif /* CONFIG_FLATMEM */
@@ -615,6 +645,28 @@ void __init mem_init(void)
 #endif
 }
 
+/*
+ * this is for the non-NUMA, single node SMP system case.
+ * Specifically, in the case of x86, we will always add
+ * memory to the highmem for now.
+ */
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+int add_memory(u64 start, u64 size)
+{
+	struct pglist_data *pgdata = &contig_page_data;
+	struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+
+	return __add_pages(zone, start_pfn, nr_pages);
+}
+
+int remove_memory(u64 start, u64 size)
+{
+	return -EINVAL;
+}
+#endif
+
 kmem_cache_t *pgd_cache;
 kmem_cache_t *pmd_cache;
 

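On this non-NUMA configuration add_memory() just points __add_pages() at the node's highest (highmem) zone; bringing the pages into the allocator afterwards goes through online_page() above. A hypothetical caller might look like this (hotadd_range() and its alignment check are illustrative, not part of this commit):

	#include <linux/memory_hotplug.h>

	/* Hot-add a physically contiguous range; start and size are in bytes. */
	static int hotadd_range(u64 start, u64 size)
	{
		if ((start | size) & ~PAGE_MASK)
			return -EINVAL;		/* whole pages only */

		/* creates the struct pages; they are onlined in a separate step */
		return add_memory(start, size);
	}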
+ 1 - 3
arch/i386/mm/ioremap.c

@@ -28,7 +28,7 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
 	unsigned long pfn;
 
 	pfn = phys_addr >> PAGE_SHIFT;
-	pte = pte_alloc_kernel(&init_mm, pmd, addr);
+	pte = pte_alloc_kernel(pmd, addr);
 	if (!pte)
 		return -ENOMEM;
 	do {
@@ -87,14 +87,12 @@ static int ioremap_page_range(unsigned long addr,
 	flush_cache_all();
 	phys_addr -= addr;
 	pgd = pgd_offset_k(addr);
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		next = pgd_addr_end(addr, end);
 		err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, flags);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
-	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return err;
 }

+ 7 - 4
arch/i386/mm/pgtable.c

@@ -31,11 +31,13 @@ void show_mem(void)
 	pg_data_t *pgdat;
 	unsigned long i;
 	struct page_state ps;
+	unsigned long flags;
 
 	printk(KERN_INFO "Mem-info:\n");
 	show_free_areas();
 	printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
 	for_each_pgdat(pgdat) {
+		pgdat_resize_lock(pgdat, &flags);
 		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
 			page = pgdat_page_nr(pgdat, i);
 			total++;
@@ -48,6 +50,7 @@ void show_mem(void)
 			else if (page_count(page))
 				shared += page_count(page) - 1;
 		}
+		pgdat_resize_unlock(pgdat, &flags);
 	}
 	printk(KERN_INFO "%d pages of RAM\n", total);
 	printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
@@ -188,19 +191,19 @@ static inline void pgd_list_add(pgd_t *pgd)
 	struct page *page = virt_to_page(pgd);
 	page->index = (unsigned long)pgd_list;
 	if (pgd_list)
-		pgd_list->private = (unsigned long)&page->index;
+		set_page_private(pgd_list, (unsigned long)&page->index);
 	pgd_list = page;
-	page->private = (unsigned long)&pgd_list;
+	set_page_private(page, (unsigned long)&pgd_list);
 }
 
 static inline void pgd_list_del(pgd_t *pgd)
 {
 	struct page *next, **pprev, *page = virt_to_page(pgd);
 	next = (struct page *)page->index;
-	pprev = (struct page **)page->private;
+	pprev = (struct page **)page_private(page);
 	*pprev = next;
 	if (next)
-		next->private = (unsigned long)pprev;
+		set_page_private(next, (unsigned long)pprev);
 }
 
 void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)

+ 13 - 25
arch/i386/oprofile/backtrace.c

@@ -12,6 +12,7 @@
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <asm/ptrace.h>
+#include <asm/uaccess.h>
 
 struct frame_head {
 	struct frame_head * ebp;
@@ -21,26 +22,22 @@ struct frame_head {
 static struct frame_head *
 dump_backtrace(struct frame_head * head)
 {
-	oprofile_add_trace(head->ret);
+	struct frame_head bufhead[2];
 
-	/* frame pointers should strictly progress back up the stack
-	 * (towards higher addresses) */
-	if (head >= head->ebp)
+	/* Also check accessibility of one struct frame_head beyond */
+	if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
+		return NULL;
+	if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
 		return NULL;
 
-	return head->ebp;
-}
-
-/* check that the page(s) containing the frame head are present */
-static int pages_present(struct frame_head * head)
-{
-	struct mm_struct * mm = current->mm;
+	oprofile_add_trace(bufhead[0].ret);
 
-	/* FIXME: only necessary once per page */
-	if (!check_user_page_readable(mm, (unsigned long)head))
-		return 0;
+	/* frame pointers should strictly progress back up the stack
+	 * (towards higher addresses) */
+	if (head >= bufhead[0].ebp)
+		return NULL;
 
-	return check_user_page_readable(mm, (unsigned long)(head + 1));
+	return bufhead[0].ebp;
 }
 
 /*
@@ -97,15 +94,6 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
 		return;
 	}
 
-#ifdef CONFIG_SMP
-	if (!spin_trylock(&current->mm->page_table_lock))
-		return;
-#endif
-
-	while (depth-- && head && pages_present(head))
+	while (depth-- && head)
 		head = dump_backtrace(head);
-
-#ifdef CONFIG_SMP
-	spin_unlock(&current->mm->page_table_lock);
-#endif
 }

+ 2 - 1
arch/ia64/kernel/perfmon.c

@@ -2352,7 +2352,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon
 	insert_vm_struct(mm, vma);
 
 	mm->total_vm  += size >> PAGE_SHIFT;
-	vm_stat_account(vma);
+	vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
+							vma_pages(vma));
 	up_write(&task->mm->mmap_sem);
 
 	/*

+ 6 - 1
arch/ia64/mm/discontig.c

@@ -555,9 +555,13 @@ void show_mem(void)
 	show_free_areas();
 	printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
 	for_each_pgdat(pgdat) {
-		unsigned long present = pgdat->node_present_pages;
+		unsigned long present;
+		unsigned long flags;
 		int shared = 0, cached = 0, reserved = 0;
+
 		printk("Node ID: %d\n", pgdat->node_id);
+		pgdat_resize_lock(pgdat, &flags);
+		present = pgdat->node_present_pages;
 		for(i = 0; i < pgdat->node_spanned_pages; i++) {
 			struct page *page;
 			if (pfn_valid(pgdat->node_start_pfn + i))
@@ -571,6 +575,7 @@ void show_mem(void)
 			else if (page_count(page))
 				shared += page_count(page)-1;
 		}
+		pgdat_resize_unlock(pgdat, &flags);
 		total_present += present;
 		total_reserved += reserved;
 		total_cached += cached;

+ 7 - 27
arch/ia64/mm/fault.c

@@ -19,32 +19,6 @@
 
 extern void die (char *, struct pt_regs *, long);
 
-/*
- * This routine is analogous to expand_stack() but instead grows the
- * register backing store (which grows towards higher addresses).
- * Since the register backing store is access sequentially, we
- * disallow growing the RBS by more than a page at a time.  Note that
- * the VM_GROWSUP flag can be set on any VM area but that's fine
- * because the total process size is still limited by RLIMIT_STACK and
- * RLIMIT_AS.
- */
-static inline long
-expand_backing_store (struct vm_area_struct *vma, unsigned long address)
-{
-	unsigned long grow;
-
-	grow = PAGE_SIZE >> PAGE_SHIFT;
-	if (address - vma->vm_start > current->signal->rlim[RLIMIT_STACK].rlim_cur
-	    || (((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->signal->rlim[RLIMIT_AS].rlim_cur))
-		return -ENOMEM;
-	vma->vm_end += PAGE_SIZE;
-	vma->vm_mm->total_vm += grow;
-	if (vma->vm_flags & VM_LOCKED)
-		vma->vm_mm->locked_vm += grow;
-	__vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, grow);
-	return 0;
-}
-
 /*
  * Return TRUE if ADDRESS points at a page in the kernel's mapped segment
  * (inside region 5, on ia64) and that page is present.
@@ -185,7 +159,13 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
 		if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start)
 		    || REGION_OFFSET(address) >= RGN_MAP_LIMIT)
 			goto bad_area;
-		if (expand_backing_store(vma, address))
+		/*
+		 * Since the register backing store is accessed sequentially,
+		 * we disallow growing it by more than a page at a time.
+		 */
+		if (address > vma->vm_end + PAGE_SIZE - sizeof(long))
+			goto bad_area;
+		if (expand_upwards(vma, address))
 			goto bad_area;
 	}
 	goto good_area;

+ 4 - 9
arch/ia64/mm/init.c

@@ -158,7 +158,7 @@ ia64_init_addr_space (void)
 		vma->vm_start = current->thread.rbs_bot & PAGE_MASK;
 		vma->vm_end = vma->vm_start + PAGE_SIZE;
 		vma->vm_page_prot = protection_map[VM_DATA_DEFAULT_FLAGS & 0x7];
-		vma->vm_flags = VM_DATA_DEFAULT_FLAGS | VM_GROWSUP;
+		vma->vm_flags = VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT;
 		down_write(&current->mm->mmap_sem);
 		if (insert_vm_struct(current->mm, vma)) {
 			up_write(&current->mm->mmap_sem);
@@ -275,26 +275,21 @@ put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot)
 
 	pgd = pgd_offset_k(address);		/* note: this is NOT pgd_offset()! */
 
-	spin_lock(&init_mm.page_table_lock);
 	{
 		pud = pud_alloc(&init_mm, pgd, address);
 		if (!pud)
 			goto out;
-
 		pmd = pmd_alloc(&init_mm, pud, address);
 		if (!pmd)
 			goto out;
-		pte = pte_alloc_map(&init_mm, pmd, address);
+		pte = pte_alloc_kernel(pmd, address);
 		if (!pte)
 			goto out;
-		if (!pte_none(*pte)) {
-			pte_unmap(pte);
+		if (!pte_none(*pte))
 			goto out;
-		}
 		set_pte(pte, mk_pte(page, pgprot));
-		pte_unmap(pte);
 	}
-  out:	spin_unlock(&init_mm.page_table_lock);
+  out:
 	/* no need for flush_tlb */
 	return page;
 }

+ 2 - 0
arch/ia64/mm/tlb.c

@@ -158,10 +158,12 @@ flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long
 # ifdef CONFIG_SMP
 	platform_global_tlb_purge(mm, start, end, nbits);
 # else
+	preempt_disable();
 	do {
 		ia64_ptcl(start, (nbits<<2));
 		start += (1UL << nbits);
 	} while (start < end);
+	preempt_enable();
 # endif
 
 	ia64_srlz_i();			/* srlz.i implies srlz.d */

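ia64_ptcl() purges only the local CPU's TLB, and with page_table_lock no longer held here nothing else pins the task to one CPU, so preemption is disabled for the duration of the loop. The same shape, pulled out as a hypothetical helper:

	#include <linux/preempt.h>

	/* illustrative helper, not from this commit */
	static void local_purge_range(unsigned long start, unsigned long end,
				      unsigned long nbits)
	{
		preempt_disable();		/* stay on this CPU while purging its TLB */
		do {
			ia64_ptcl(start, nbits << 2);
			start += 1UL << nbits;
		} while (start < end);
		preempt_enable();
	}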
+ 8 - 1
arch/m32r/mm/init.c

@@ -48,6 +48,8 @@ void show_mem(void)
 	show_free_areas();
 	printk("Free swap:       %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10));
 	for_each_pgdat(pgdat) {
+		unsigned long flags;
+		pgdat_resize_lock(pgdat, &flags);
 		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
 			page = pgdat_page_nr(pgdat, i);
 			total++;
@@ -60,6 +62,7 @@ void show_mem(void)
 			else if (page_count(page))
 				shared += page_count(page) - 1;
 		}
+		pgdat_resize_unlock(pgdat, &flags);
 	}
 	printk("%d pages of RAM\n", total);
 	printk("%d pages of HIGHMEM\n",highmem);
@@ -150,10 +153,14 @@ int __init reservedpages_count(void)
 	int reservedpages, nid, i;
 
 	reservedpages = 0;
-	for_each_online_node(nid)
+	for_each_online_node(nid) {
+		unsigned long flags;
+		pgdat_resize_lock(NODE_DATA(nid), &flags);
 		for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++)
 			if (PageReserved(nid_page_nr(nid, i)))
 				reservedpages++;
+		pgdat_resize_unlock(NODE_DATA(nid), &flags);
+	}
 
 	return reservedpages;
 }

+ 1 - 3
arch/m32r/mm/ioremap.c

@@ -67,7 +67,7 @@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size,
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+		pte_t * pte = pte_alloc_kernel(pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -90,7 +90,6 @@ remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		pmd_t *pmd;
 		pmd = pmd_alloc(&init_mm, dir, address);
@@ -104,7 +103,6 @@ remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
-	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return error;
 }

+ 1 - 23
arch/m68k/Kconfig

@@ -388,33 +388,11 @@ config AMIGA_PCMCIA
 	  Include support in the kernel for pcmcia on Amiga 1200 and Amiga
 	  600. If you intend to use pcmcia cards say Y; otherwise say N.
 
-config STRAM_SWAP
-	bool "Support for ST-RAM as swap space"
-	depends on ATARI && BROKEN
-	---help---
-	  Some Atari 68k machines (including the 520STF and 1020STE) divide
-	  their addressable memory into ST and TT sections.  The TT section
-	  (up to 512MB) is the main memory; the ST section (up to 4MB) is
-	  accessible to the built-in graphics board, runs slower, and is
-	  present mainly for backward compatibility with older machines.
-
-	  This enables support for using (parts of) ST-RAM as swap space,
-	  instead of as normal system memory. This can first enhance system
-	  performance if you have lots of alternate RAM (compared to the size
-	  of ST-RAM), because executable code always will reside in faster
-	  memory. ST-RAM will remain as ultra-fast swap space. On the other
-	  hand, it allows much improved dynamic allocations of ST-RAM buffers
-	  for device driver modules (e.g. floppy, ACSI, SLM printer, DMA
-	  sound). The probability that such allocations at module load time
-	  fail is drastically reduced.
-
 config STRAM_PROC
 	bool "ST-RAM statistics in /proc"
 	depends on ATARI
 	help
-	  Say Y here to report ST-RAM usage statistics in /proc/stram.  See
-	  the help for CONFIG_STRAM_SWAP for discussion of ST-RAM and its
-	  uses.
+	  Say Y here to report ST-RAM usage statistics in /proc/stram.
 
 config HEARTBEAT
 	bool "Use power LED as a heartbeat" if AMIGA || APOLLO || ATARI || MAC ||Q40

The diff for this file has not been shown because it is too large
+ 15 - 901
arch/m68k/atari/stram.c


+ 1 - 1
arch/m68k/mm/kmap.c

@@ -201,7 +201,7 @@ void *__ioremap(unsigned long physaddr, unsigned long size, int cacheflag)
 			virtaddr += PTRTREESIZE;
 			size -= PTRTREESIZE;
 		} else {
-			pte_dir = pte_alloc_kernel(&init_mm, pmd_dir, virtaddr);
+			pte_dir = pte_alloc_kernel(pmd_dir, virtaddr);
 			if (!pte_dir) {
 				printk("ioremap: no mem for pte_dir\n");
 				return NULL;

+ 1 - 1
arch/m68k/sun3x/dvma.c

@@ -116,7 +116,7 @@ inline int dvma_map_cpu(unsigned long kaddr,
 			pte_t *pte;
 			unsigned long end3;
 
-			if((pte = pte_alloc_kernel(&init_mm, pmd, vaddr)) == NULL) {
+			if((pte = pte_alloc_kernel(pmd, vaddr)) == NULL) {
 				ret = -ENOMEM;
 				goto out;
 			}

+ 0 - 1
arch/mips/kernel/irixelf.c

@@ -697,7 +697,6 @@ static int load_irix_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	/* Do this so that we can load the interpreter, if need be.  We will
 	 * change some of these later.
 	 */
-	set_mm_counter(current->mm, rss, 0);
 	setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
 	current->mm->start_stack = bprm->p;
 

+ 1 - 3
arch/mips/mm/ioremap.c

@@ -55,7 +55,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address,
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+		pte_t * pte = pte_alloc_kernel(pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -77,7 +77,6 @@ static int remap_area_pages(unsigned long address, phys_t phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		pud_t *pud;
 		pmd_t *pmd;
@@ -96,7 +95,6 @@ static int remap_area_pages(unsigned long address, phys_t phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
-	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return error;
 }

+ 9 - 15
arch/parisc/kernel/cache.c

@@ -270,7 +270,6 @@ void flush_dcache_page(struct page *page)
 	unsigned long offset;
 	unsigned long addr;
 	pgoff_t pgoff;
-	pte_t *pte;
 	unsigned long pfn = page_to_pfn(page);
 
 
@@ -301,21 +300,16 @@ void flush_dcache_page(struct page *page)
 		 * taking a page fault if the pte doesn't exist.
 		 * This is just for speed.  If the page translation
 		 * isn't there, there's no point exciting the
-		 * nadtlb handler into a nullification frenzy */
-
-
-  		if(!(pte = translation_exists(mpnt, addr)))
-			continue;
-
-		/* make sure we really have this page: the private
+		 * nadtlb handler into a nullification frenzy.
+		 *
+		 * Make sure we really have this page: the private
 		 * mappings may cover this area but have COW'd this
-		 * particular page */
-		if(pte_pfn(*pte) != pfn)
-  			continue;
-
-		__flush_cache_page(mpnt, addr);
-
-		break;
+		 * particular page.
+		 */
+  		if (translation_exists(mpnt, addr, pfn)) {
+			__flush_cache_page(mpnt, addr);
+			break;
+		}
 	}
 	flush_dcache_mmap_unlock(mapping);
 }

+ 1 - 1
arch/parisc/kernel/pci-dma.c

@@ -114,7 +114,7 @@ static inline int map_pmd_uncached(pmd_t * pmd, unsigned long vaddr,
 	if (end > PGDIR_SIZE)
 		end = PGDIR_SIZE;
 	do {
-		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, vaddr);
+		pte_t * pte = pte_alloc_kernel(pmd, vaddr);
 		if (!pte)
 			return -ENOMEM;
 		if (map_pte_uncached(pte, orig_vaddr, end - vaddr, paddr_ptr))

+ 3 - 0
arch/parisc/mm/init.c

@@ -505,7 +505,9 @@ void show_mem(void)
 
 		for (j = node_start_pfn(i); j < node_end_pfn(i); j++) {
 			struct page *p;
+			unsigned long flags;
 
+			pgdat_resize_lock(NODE_DATA(i), &flags);
 			p = nid_page_nr(i, j) - node_start_pfn(i);
 
 			total++;
@@ -517,6 +519,7 @@ void show_mem(void)
 				free++;
 			else
 				shared += page_count(p) - 1;
+			pgdat_resize_unlock(NODE_DATA(i), &flags);
         	}
 	}
 #endif

+ 2 - 4
arch/parisc/mm/ioremap.c

@@ -52,7 +52,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(NULL, pmd, address);
+		pte_t * pte = pte_alloc_kernel(pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -75,10 +75,9 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		pmd_t *pmd;
-		pmd = pmd_alloc(dir, address);
+		pmd = pmd_alloc(&init_mm, dir, address);
 		error = -ENOMEM;
 		if (!pmd)
 			break;
@@ -89,7 +88,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
-	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return error;
 }

+ 1 - 5
arch/ppc/kernel/dma-mapping.c

@@ -335,8 +335,6 @@ static int __init dma_alloc_init(void)
 	pte_t *pte;
 	int ret = 0;
 
-	spin_lock(&init_mm.page_table_lock);
-
 	do {
 		pgd = pgd_offset(&init_mm, CONSISTENT_BASE);
 		pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE);
@@ -347,7 +345,7 @@ static int __init dma_alloc_init(void)
 		}
 		WARN_ON(!pmd_none(*pmd));
 
-		pte = pte_alloc_kernel(&init_mm, pmd, CONSISTENT_BASE);
+		pte = pte_alloc_kernel(pmd, CONSISTENT_BASE);
 		if (!pte) {
 			printk(KERN_ERR "%s: no pte tables\n", __func__);
 			ret = -ENOMEM;
@@ -357,8 +355,6 @@ static int __init dma_alloc_init(void)
 		consistent_pte = pte;
 	} while (0);
 
-	spin_unlock(&init_mm.page_table_lock);
-
 	return ret;
 }
 

+ 0 - 4
arch/ppc/mm/4xx_mmu.c

@@ -110,13 +110,11 @@ unsigned long __init mmu_mapin_ram(void)
 		pmd_t *pmdp;
 		unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | _PAGE_HWWRITE;
 
-		spin_lock(&init_mm.page_table_lock);
 		pmdp = pmd_offset(pgd_offset_k(v), v);
 		pmd_val(*pmdp++) = val;
 		pmd_val(*pmdp++) = val;
 		pmd_val(*pmdp++) = val;
 		pmd_val(*pmdp++) = val;
-		spin_unlock(&init_mm.page_table_lock);
 
 		v += LARGE_PAGE_SIZE_16M;
 		p += LARGE_PAGE_SIZE_16M;
@@ -127,10 +125,8 @@ unsigned long __init mmu_mapin_ram(void)
 		pmd_t *pmdp;
 		unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE;
 
-		spin_lock(&init_mm.page_table_lock);
 		pmdp = pmd_offset(pgd_offset_k(v), v);
 		pmd_val(*pmdp) = val;
-		spin_unlock(&init_mm.page_table_lock);
 
 		v += LARGE_PAGE_SIZE_4M;
 		p += LARGE_PAGE_SIZE_4M;

+ 1 - 3
arch/ppc/mm/pgtable.c

@@ -280,18 +280,16 @@ map_page(unsigned long va, phys_addr_t pa, int flags)
 	pte_t *pg;
 	int err = -ENOMEM;
 
-	spin_lock(&init_mm.page_table_lock);
 	/* Use upper 10 bits of VA to index the first level map */
 	pd = pmd_offset(pgd_offset_k(va), va);
 	/* Use middle 10 bits of VA to index the second-level map */
-	pg = pte_alloc_kernel(&init_mm, pd, va);
+	pg = pte_alloc_kernel(pd, va);
 	if (pg != 0) {
 		err = 0;
 		set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags)));
 		if (mem_init_done)
 			flush_HPTE(0, va, pmd_val(*pd));
 	}
-	spin_unlock(&init_mm.page_table_lock);
 	return err;
 }
 

+ 7 - 5
arch/ppc64/kernel/vdso.c

@@ -176,13 +176,13 @@ static struct page * vdso_vma_nopage(struct vm_area_struct * vma,
 		return NOPAGE_SIGBUS;
 
 	/*
-	 * Last page is systemcfg, special handling here, no get_page() a
-	 * this is a reserved page
+	 * Last page is systemcfg.
 	 */
 	if ((vma->vm_end - address) <= PAGE_SIZE)
-		return virt_to_page(systemcfg);
+		pg = virt_to_page(systemcfg);
+	else
+		pg = virt_to_page(vbase + offset);
 
-	pg = virt_to_page(vbase + offset);
 	get_page(pg);
 	DBG(" ->page count: %d\n", page_count(pg));
 
@@ -259,7 +259,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int executable_stack)
 	 * gettimeofday will be totally dead. It's fine to use that for setting
 	 * breakpoints in the vDSO code pages though
 	 */
-	vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+	vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_RESERVED;
 	vma->vm_flags |= mm->def_flags;
 	vma->vm_page_prot = protection_map[vma->vm_flags & 0x7];
 	vma->vm_ops = &vdso_vmops;
@@ -603,6 +603,8 @@ void __init vdso_init(void)
 		ClearPageReserved(pg);
 		get_page(pg);
 	}
+
+	get_page(virt_to_page(systemcfg));
 }
 
 int in_gate_area_no_task(unsigned long addr)

+ 0 - 5
arch/ppc64/mm/imalloc.c

@@ -300,12 +300,7 @@ void im_free(void * addr)
 	for (p = &imlist ; (tmp = *p) ; p = &tmp->next) {
 		if (tmp->addr == addr) {
 			*p = tmp->next;
-
-			/* XXX: do we need the lock? */
-			spin_lock(&init_mm.page_table_lock);
 			unmap_vm_area(tmp);
-			spin_unlock(&init_mm.page_table_lock);
-
 			kfree(tmp);
 			up(&imlist_sem);
 			return;

+ 84 - 3
arch/ppc64/mm/init.c

@@ -104,6 +104,8 @@ void show_mem(void)
 	show_free_areas();
 	printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
 	for_each_pgdat(pgdat) {
+		unsigned long flags;
+		pgdat_resize_lock(pgdat, &flags);
 		for (i = 0; i < pgdat->node_spanned_pages; i++) {
 			page = pgdat_page_nr(pgdat, i);
 			total++;
@@ -114,6 +116,7 @@ void show_mem(void)
 			else if (page_count(page))
 				shared += page_count(page) - 1;
 		}
+		pgdat_resize_unlock(pgdat, &flags);
 	}
 	printk("%ld pages of RAM\n", total);
 	printk("%ld reserved pages\n", reserved);
@@ -155,7 +158,6 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
 	unsigned long vsid;
 
 	if (mem_init_done) {
-		spin_lock(&init_mm.page_table_lock);
 		pgdp = pgd_offset_k(ea);
 		pudp = pud_alloc(&init_mm, pgdp, ea);
 		if (!pudp)
@@ -163,12 +165,11 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
 		pmdp = pmd_alloc(&init_mm, pudp, ea);
 		if (!pmdp)
 			return -ENOMEM;
-		ptep = pte_alloc_kernel(&init_mm, pmdp, ea);
+		ptep = pte_alloc_kernel(pmdp, ea);
 		if (!ptep)
 			return -ENOMEM;
 		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
 							  __pgprot(flags)));
-		spin_unlock(&init_mm.page_table_lock);
 	} else {
 		unsigned long va, vpn, hash, hpteg;
 
@@ -649,11 +650,14 @@ void __init mem_init(void)
 #endif
 
 	for_each_pgdat(pgdat) {
+		unsigned long flags;
+		pgdat_resize_lock(pgdat, &flags);
 		for (i = 0; i < pgdat->node_spanned_pages; i++) {
 			page = pgdat_page_nr(pgdat, i);
 			if (PageReserved(page))
 				reservedpages++;
 		}
+		pgdat_resize_unlock(pgdat, &flags);
 	}
 
 	codesize = (unsigned long)&_etext - (unsigned long)&_stext;
@@ -867,3 +871,80 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
 	return vma_prot;
 }
 EXPORT_SYMBOL(phys_mem_access_prot);
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+
+void online_page(struct page *page)
+{
+	ClearPageReserved(page);
+	free_cold_page(page);
+	totalram_pages++;
+	num_physpages++;
+}
+
+/*
+ * This works only for the non-NUMA case.  Later, we'll need a lookup
+ * to convert from real physical addresses to nid, that doesn't use
+ * pfn_to_nid().
+ */
+int __devinit add_memory(u64 start, u64 size)
+{
+	struct pglist_data *pgdata = NODE_DATA(0);
+	struct zone *zone;
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+
+	/* this should work for most non-highmem platforms */
+	zone = pgdata->node_zones;
+
+	return __add_pages(zone, start_pfn, nr_pages);
+
+	return 0;
+}
+
+/*
+ * First pass at this code will check to determine if the remove
+ * request is within the RMO.  Do not allow removal within the RMO.
+ */
+int __devinit remove_memory(u64 start, u64 size)
+{
+	struct zone *zone;
+	unsigned long start_pfn, end_pfn, nr_pages;
+
+	start_pfn = start >> PAGE_SHIFT;
+	nr_pages = size >> PAGE_SHIFT;
+	end_pfn = start_pfn + nr_pages;
+
+	printk("%s(): Attempting to remove memoy in range "
+			"%lx to %lx\n", __func__, start, start+size);
+	/*
+	 * check for range within RMO
+	 */
+	zone = page_zone(pfn_to_page(start_pfn));
+
+	printk("%s(): memory will be removed from "
+			"the %s zone\n", __func__, zone->name);
+
+	/*
+	 * not handling removing memory ranges that
+	 * overlap multiple zones yet
+	 */
+	if (end_pfn > (zone->zone_start_pfn + zone->spanned_pages))
+		goto overlap;
+
+	/* make sure it is NOT in RMO */
+	if ((start < lmb.rmo_size) || ((start+size) < lmb.rmo_size)) {
+		printk("%s(): range to be removed must NOT be in RMO!\n",
+			__func__);
+		goto in_rmo;
+	}
+
+	return __remove_pages(zone, start_pfn, nr_pages);
+
+overlap:
+	printk("%s(): memory range to be removed overlaps "
+		"multiple zones!!!\n", __func__);
+in_rmo:
+	return -1;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */

+ 1 - 3
arch/s390/mm/ioremap.c

@@ -58,7 +58,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+		pte_t * pte = pte_alloc_kernel(pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -80,7 +80,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		pmd_t *pmd;
 		pmd = pmd_alloc(&init_mm, dir, address);
@@ -94,7 +93,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
-	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return 0;
 }

+ 23 - 17
arch/sh/mm/fault.c

@@ -194,10 +194,13 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
 			       unsigned long address)
 {
 	unsigned long addrmax = P4SEG;
-	pgd_t *dir;
+	pgd_t *pgd;
 	pmd_t *pmd;
 	pte_t *pte;
 	pte_t entry;
+	struct mm_struct *mm;
+	spinlock_t *ptl;
+	int ret = 1;
 
 #ifdef CONFIG_SH_KGDB
 	if (kgdb_nofault && kgdb_bus_err_hook)
@@ -208,28 +211,28 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
 	addrmax = P4SEG_STORE_QUE + 0x04000000;
 #endif
 
-	if (address >= P3SEG && address < addrmax)
-		dir = pgd_offset_k(address);
-	else if (address >= TASK_SIZE)
+	if (address >= P3SEG && address < addrmax) {
+		pgd = pgd_offset_k(address);
+		mm = NULL;
+	} else if (address >= TASK_SIZE)
 		return 1;
-	else if (!current->mm)
+	else if (!(mm = current->mm))
 		return 1;
 	else
-		dir = pgd_offset(current->mm, address);
+		pgd = pgd_offset(mm, address);
 
-	pmd = pmd_offset(dir, address);
-	if (pmd_none(*pmd))
-		return 1;
-	if (pmd_bad(*pmd)) {
-		pmd_ERROR(*pmd);
-		pmd_clear(pmd);
+	pmd = pmd_offset(pgd, address);
+	if (pmd_none_or_clear_bad(pmd))
 		return 1;
-	}
-	pte = pte_offset_kernel(pmd, address);
+	if (mm)
+		pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+	else
+		pte = pte_offset_kernel(pmd, address);
+
 	entry = *pte;
 	if (pte_none(entry) || pte_not_present(entry)
 	    || (writeaccess && !pte_write(entry)))
-		return 1;
+		goto unlock;
 
 	if (writeaccess)
 		entry = pte_mkdirty(entry);
@@ -251,8 +254,11 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
 
 	set_pte(pte, entry);
 	update_mmu_cache(NULL, address, entry);
-
-	return 0;
+	ret = 0;
+unlock:
+	if (mm)
+		pte_unmap_unlock(pte, ptl);
+	return ret;
 }
 
 void flush_tlb_page(struct vm_area_struct *vma, unsigned long page)

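The rewritten handler distinguishes kernel P3SEG addresses, whose ptes live in init_mm and need no pte lock, from user addresses, which must be mapped and locked via pte_offset_map_lock(). That split, pulled out into a hypothetical helper (the caller only unlocks when ptl was set):

	#include <linux/mm.h>
	#include <asm/pgtable.h>

	/* illustrative helper, not from this commit */
	static pte_t *lookup_fault_pte(struct mm_struct *mm, pmd_t *pmd,
				       unsigned long address, spinlock_t **ptl)
	{
		if (!mm) {
			*ptl = NULL;		/* kernel pte: no per-page lock */
			return pte_offset_kernel(pmd, address);
		}
		/* user pte: map the pte page and take its split lock */
		return pte_offset_map_lock(mm, pmd, address, ptl);
	}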
+ 0 - 2
arch/sh/mm/hugetlbpage.c

@@ -54,8 +54,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	return pte;
 }
 
-#define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0)
-
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, pte_t entry)
 {

+ 1 - 3
arch/sh/mm/ioremap.c

@@ -57,7 +57,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address,
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+		pte_t * pte = pte_alloc_kernel(pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -79,7 +79,6 @@ int remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		pmd_t *pmd;
 		pmd = pmd_alloc(&init_mm, dir, address);
@@ -93,7 +92,6 @@ int remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
-	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return error;
 }

+ 30 - 38
arch/sh64/mm/cache.c

@@ -584,32 +584,36 @@ static void sh64_dcache_purge_phy_page(unsigned long paddr)
 	}
 }
 
-static void sh64_dcache_purge_user_page(struct mm_struct *mm, unsigned long eaddr)
+static void sh64_dcache_purge_user_pages(struct mm_struct *mm,
+				unsigned long addr, unsigned long end)
 {
 	pgd_t *pgd;
 	pmd_t *pmd;
 	pte_t *pte;
 	pte_t entry;
+	spinlock_t *ptl;
 	unsigned long paddr;
 
-	/* NOTE : all the callers of this have mm->page_table_lock held, so the
-	   following page table traversal is safe even on SMP/pre-emptible. */
-
-	if (!mm) return; /* No way to find physical address of page */
-	pgd = pgd_offset(mm, eaddr);
-	if (pgd_bad(*pgd)) return;
-
-	pmd = pmd_offset(pgd, eaddr);
-	if (pmd_none(*pmd) || pmd_bad(*pmd)) return;
-
-	pte = pte_offset_kernel(pmd, eaddr);
-	entry = *pte;
-	if (pte_none(entry) || !pte_present(entry)) return;
-
-	paddr = pte_val(entry) & PAGE_MASK;
-
-	sh64_dcache_purge_coloured_phy_page(paddr, eaddr);
-
+	if (!mm)
+		return; /* No way to find physical address of page */
+
+	pgd = pgd_offset(mm, addr);
+	if (pgd_bad(*pgd))
+		return;
+
+	pmd = pmd_offset(pgd, addr);
+	if (pmd_none(*pmd) || pmd_bad(*pmd))
+		return;
+
+	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	do {
+		entry = *pte;
+		if (pte_none(entry) || !pte_present(entry))
+			continue;
+		paddr = pte_val(entry) & PAGE_MASK;
+		sh64_dcache_purge_coloured_phy_page(paddr, addr);
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	pte_unmap_unlock(pte - 1, ptl);
 }
 /****************************************************************************/
 
@@ -668,7 +672,7 @@ static void sh64_dcache_purge_user_range(struct mm_struct *mm,
 	int n_pages;
 
 	n_pages = ((end - start) >> PAGE_SHIFT);
-	if (n_pages >= 64) {
+	if (n_pages >= 64 || ((start ^ (end - 1)) & PMD_MASK)) {
 #if 1
 		sh64_dcache_purge_all();
 #else
@@ -707,20 +711,10 @@ static void sh64_dcache_purge_user_range(struct mm_struct *mm,
 		}
 #endif
 	} else {
-		/* 'Small' range */
-		unsigned long aligned_start;
-		unsigned long eaddr;
-		unsigned long last_page_start;
-
-		aligned_start = start & PAGE_MASK;
-		/* 'end' is 1 byte beyond the end of the range */
-		last_page_start = (end - 1) & PAGE_MASK;
-
-		eaddr = aligned_start;
-		while (eaddr <= last_page_start) {
-			sh64_dcache_purge_user_page(mm, eaddr);
-			eaddr += PAGE_SIZE;
-		}
+		/* Small range, covered by a single page table page */
+		start &= PAGE_MASK;	/* should already be so */
+		end = PAGE_ALIGN(end);	/* should already be so */
+		sh64_dcache_purge_user_pages(mm, start, end);
 	}
 	return;
 }
@@ -880,9 +874,7 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start,
 	   addresses from the user address space specified by mm, after writing
 	   back any dirty data.
 
-	   Note(1), 'end' is 1 byte beyond the end of the range to flush.
-
-	   Note(2), this is called with mm->page_table_lock held.*/
+	   Note, 'end' is 1 byte beyond the end of the range to flush. */
 
 	sh64_dcache_purge_user_range(mm, start, end);
 	sh64_icache_inv_user_page_range(mm, start, end);
@@ -898,7 +890,7 @@ void flush_cache_page(struct vm_area_struct *vma, unsigned long eaddr, unsigned
 	   the I-cache must be searched too in case the page in question is
 	   both writable and being executed from (e.g. stack trampolines.)
 
-	   Note(1), this is called with mm->page_table_lock held.
+	   Note, this is called with pte lock held.
 	   */
 
 	sh64_dcache_purge_phy_page(pfn << PAGE_SHIFT);
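
The sh64_dcache_purge_user_pages() hunk batches a whole range under one pte_offset_map_lock(), which is only safe while every pte lives in the same page table page; that is what the new (start ^ (end - 1)) & PMD_MASK test in sh64_dcache_purge_user_range() guards, sending ranges that cross a pmd boundary (or span 64 pages and more) to the full purge instead. Condensed shape of that walk, assumptions as in the hunk:

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	do {
		entry = *pte;
		if (!pte_none(entry) && pte_present(entry))
			sh64_dcache_purge_coloured_phy_page(pte_val(entry) & PAGE_MASK, addr);
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(pte - 1, ptl);		/* pte has advanced one past the last entry */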

+ 12 - 176
arch/sh64/mm/hugetlbpage.c

@@ -54,41 +54,31 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	return pte;
 }
 
-#define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0)
-
-static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma,
-			 struct page *page, pte_t * page_table, int write_access)
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, pte_t entry)
 {
-	unsigned long i;
-	pte_t entry;
-
-	add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
-
-	if (write_access)
-		entry = pte_mkwrite(pte_mkdirty(mk_pte(page,
-						       vma->vm_page_prot)));
-	else
-		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
-	entry = pte_mkyoung(entry);
-	mk_pte_huge(entry);
+	int i;
 
 	for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
-		set_pte(page_table, entry);
-		page_table++;
-
+		set_pte_at(mm, addr, ptep, entry);
+		ptep++;
+		addr += PAGE_SIZE;
 		pte_val(entry) += PAGE_SIZE;
 	}
 }
 
-pte_t huge_ptep_get_and_clear(pte_t *ptep)
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+			      pte_t *ptep)
 {
 	pte_t entry;
+	int i;
 
 	entry = *ptep;
 
 	for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
-		pte_clear(pte);
-		pte++;
+		pte_clear(mm, addr, ptep);
+		addr += PAGE_SIZE;
+		ptep++;
 	}
 
 	return entry;
@@ -106,79 +96,6 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
 	return 0;
 }
 
-int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
-			    struct vm_area_struct *vma)
-{
-	pte_t *src_pte, *dst_pte, entry;
-	struct page *ptepage;
-	unsigned long addr = vma->vm_start;
-	unsigned long end = vma->vm_end;
-	int i;
-
-	while (addr < end) {
-		dst_pte = huge_pte_alloc(dst, addr);
-		if (!dst_pte)
-			goto nomem;
-		src_pte = huge_pte_offset(src, addr);
-		BUG_ON(!src_pte || pte_none(*src_pte));
-		entry = *src_pte;
-		ptepage = pte_page(entry);
-		get_page(ptepage);
-		for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
-			set_pte(dst_pte, entry);
-			pte_val(entry) += PAGE_SIZE;
-			dst_pte++;
-		}
-		add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
-		addr += HPAGE_SIZE;
-	}
-	return 0;
-
-nomem:
-	return -ENOMEM;
-}
-
-int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
-			struct page **pages, struct vm_area_struct **vmas,
-			unsigned long *position, int *length, int i)
-{
-	unsigned long vaddr = *position;
-	int remainder = *length;
-
-	WARN_ON(!is_vm_hugetlb_page(vma));
-
-	while (vaddr < vma->vm_end && remainder) {
-		if (pages) {
-			pte_t *pte;
-			struct page *page;
-
-			pte = huge_pte_offset(mm, vaddr);
-
-			/* hugetlb should be locked, and hence, prefaulted */
-			BUG_ON(!pte || pte_none(*pte));
-
-			page = pte_page(*pte);
-
-			WARN_ON(!PageCompound(page));
-
-			get_page(page);
-			pages[i] = page;
-		}
-
-		if (vmas)
-			vmas[i] = vma;
-
-		vaddr += PAGE_SIZE;
-		--remainder;
-		++i;
-	}
-
-	*length = remainder;
-	*position = vaddr;
-
-	return i;
-}
-
 struct page *follow_huge_addr(struct mm_struct *mm,
 			      unsigned long address, int write)
 {
@@ -195,84 +112,3 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 {
 	return NULL;
 }
-
-void unmap_hugepage_range(struct vm_area_struct *vma,
-			  unsigned long start, unsigned long end)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	unsigned long address;
-	pte_t *pte;
-	struct page *page;
-	int i;
-
-	BUG_ON(start & (HPAGE_SIZE - 1));
-	BUG_ON(end & (HPAGE_SIZE - 1));
-
-	for (address = start; address < end; address += HPAGE_SIZE) {
-		pte = huge_pte_offset(mm, address);
-		BUG_ON(!pte);
-		if (pte_none(*pte))
-			continue;
-		page = pte_page(*pte);
-		put_page(page);
-		for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
-			pte_clear(mm, address+(i*PAGE_SIZE), pte);
-			pte++;
-		}
-	}
-	add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
-	flush_tlb_range(vma, start, end);
-}
-
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
-{
-	struct mm_struct *mm = current->mm;
-	unsigned long addr;
-	int ret = 0;
-
-	BUG_ON(vma->vm_start & ~HPAGE_MASK);
-	BUG_ON(vma->vm_end & ~HPAGE_MASK);
-
-	spin_lock(&mm->page_table_lock);
-	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
-		unsigned long idx;
-		pte_t *pte = huge_pte_alloc(mm, addr);
-		struct page *page;
-
-		if (!pte) {
-			ret = -ENOMEM;
-			goto out;
-		}
-		if (!pte_none(*pte))
-			continue;
-
-		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-		page = find_get_page(mapping, idx);
-		if (!page) {
-			/* charge the fs quota first */
-			if (hugetlb_get_quota(mapping)) {
-				ret = -ENOMEM;
-				goto out;
-			}
-			page = alloc_huge_page();
-			if (!page) {
-				hugetlb_put_quota(mapping);
-				ret = -ENOMEM;
-				goto out;
-			}
-			ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
-			if (! ret) {
-				unlock_page(page);
-			} else {
-				hugetlb_put_quota(mapping);
-				free_huge_page(page);
-				goto out;
-			}
-		}
-		set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
-	}
-out:
-	spin_unlock(&mm->page_table_lock);
-	return ret;
-}

+ 1 - 3
arch/sh64/mm/ioremap.c

@@ -79,7 +79,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
 		BUG();
 
 	do {
-		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+		pte_t * pte = pte_alloc_kernel(pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -101,7 +101,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		pmd_t *pmd = pmd_alloc(&init_mm, dir, address);
 		error = -ENOMEM;
@@ -115,7 +114,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
-	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return 0;
 }

+ 4 - 3
arch/sparc/mm/generic.c

@@ -73,14 +73,16 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
 	int space = GET_IOSPACE(pfn);
 	unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT;
 
+	/* See comment in mm/memory.c remap_pfn_range */
+	vma->vm_flags |= VM_IO | VM_RESERVED;
+
 	prot = __pgprot(pg_iobits);
 	offset -= from;
 	dir = pgd_offset(mm, from);
 	flush_cache_range(vma, beg, end);
 
-	spin_lock(&mm->page_table_lock);
 	while (from < end) {
-		pmd_t *pmd = pmd_alloc(current->mm, dir, from);
+		pmd_t *pmd = pmd_alloc(mm, dir, from);
 		error = -ENOMEM;
 		if (!pmd)
 			break;
@@ -90,7 +92,6 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
 		from = (from + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	}
-	spin_unlock(&mm->page_table_lock);
 
 	flush_tlb_range(vma, beg, end);
 	return error;

+ 0 - 1
arch/sparc64/kernel/binfmt_aout32.c

@@ -241,7 +241,6 @@ static int load_aout32_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
 
-	set_mm_counter(current->mm, rss, 0);
 	current->mm->mmap = NULL;
 	compute_creds(bprm);
  	current->flags &= ~PF_FORKNOEXEC;

+ 5 - 4
arch/sparc64/mm/generic.c

@@ -127,14 +127,16 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
 	int space = GET_IOSPACE(pfn);
 	unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT;
 
+	/* See comment in mm/memory.c remap_pfn_range */
+	vma->vm_flags |= VM_IO | VM_RESERVED;
+
 	prot = __pgprot(pg_iobits);
 	offset -= from;
 	dir = pgd_offset(mm, from);
 	flush_cache_range(vma, beg, end);
 
-	spin_lock(&mm->page_table_lock);
 	while (from < end) {
-		pud_t *pud = pud_alloc(current->mm, dir, from);
+		pud_t *pud = pud_alloc(mm, dir, from);
 		error = -ENOMEM;
 		if (!pud)
 			break;
@@ -144,8 +146,7 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
 		from = (from + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	}
-	flush_tlb_range(vma, beg, end);
-	spin_unlock(&mm->page_table_lock);
 
+	flush_tlb_range(vma, beg, end);
 	return error;
 }

+ 3 - 4
arch/sparc64/mm/tlb.c

@@ -18,8 +18,7 @@
 
 /* Heavily inspired by the ppc64 code.  */
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers) =
-	{ NULL, 0, 0, 0, 0, 0, { 0 }, { NULL }, };
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers) = { 0, };
 
 void flush_tlb_pending(void)
 {
@@ -72,7 +71,7 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, pte_t *ptep, pte_t
 
 no_cache_flush:
 
-	if (mp->tlb_frozen)
+	if (mp->fullmm)
 		return;
 
 	nr = mp->tlb_nr;
@@ -97,7 +96,7 @@ void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long
 	unsigned long nr = mp->tlb_nr;
 	long s = start, e = end, vpte_base;
 
-	if (mp->tlb_frozen)
+	if (mp->fullmm)
 		return;
 
 	/* If start is greater than end, that is a real problem.  */

+ 0 - 1
arch/um/include/tlb.h

@@ -34,7 +34,6 @@ struct host_vm_op {
 	} u;
 };
 
-extern void mprotect_kernel_vm(int w);
 extern void force_flush_all(void);
 extern void fix_range_common(struct mm_struct *mm, unsigned long start_addr,
                              unsigned long end_addr, int force,

+ 5 - 3
arch/um/kernel/process_kern.c

@@ -222,6 +222,7 @@ void *um_virt_to_phys(struct task_struct *task, unsigned long addr,
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
+	pte_t ptent;
 
 	if(task->mm == NULL) 
 		return(ERR_PTR(-EINVAL));
@@ -238,12 +239,13 @@ void *um_virt_to_phys(struct task_struct *task, unsigned long addr,
 		return(ERR_PTR(-EINVAL));
 
 	pte = pte_offset_kernel(pmd, addr);
-	if(!pte_present(*pte)) 
+	ptent = *pte;
+	if(!pte_present(ptent))
 		return(ERR_PTR(-EINVAL));
 
 	if(pte_out != NULL)
-		*pte_out = *pte;
-	return((void *) (pte_val(*pte) & PAGE_MASK) + (addr & ~PAGE_MASK));
+		*pte_out = ptent;
+	return((void *) (pte_val(ptent) & PAGE_MASK) + (addr & ~PAGE_MASK));
 }
 
 char *current_cmd(void)

+ 1 - 3
arch/um/kernel/skas/mmu.c

@@ -28,7 +28,6 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
 	pmd_t *pmd;
 	pte_t *pte;
 
-	spin_lock(&mm->page_table_lock);
 	pgd = pgd_offset(mm, proc);
 	pud = pud_alloc(mm, pgd, proc);
 	if (!pud)
@@ -63,7 +62,6 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
 	*pte = mk_pte(virt_to_page(kernel), __pgprot(_PAGE_PRESENT));
 	*pte = pte_mkexec(*pte);
 	*pte = pte_wrprotect(*pte);
-	spin_unlock(&mm->page_table_lock);
 	return(0);
 
  out_pmd:
@@ -71,7 +69,6 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
  out_pte:
 	pmd_free(pmd);
  out:
-	spin_unlock(&mm->page_table_lock);
 	return(-ENOMEM);
 }
 
@@ -147,6 +144,7 @@ void destroy_context_skas(struct mm_struct *mm)
 
 	if(!proc_mm || !ptrace_faultinfo){
 		free_page(mmu->id.stack);
+		pte_lock_deinit(virt_to_page(mmu->last_page_table));
 		pte_free_kernel((pte_t *) mmu->last_page_table);
                 dec_page_state(nr_page_table_pages);
 #ifdef CONFIG_3_LEVEL_PGTABLES

+ 0 - 36
arch/um/kernel/tt/tlb.c

@@ -74,42 +74,6 @@ void flush_tlb_kernel_range_tt(unsigned long start, unsigned long end)
                 atomic_inc(&vmchange_seq);
 }
 
-static void protect_vm_page(unsigned long addr, int w, int must_succeed)
-{
-	int err;
-
-	err = protect_memory(addr, PAGE_SIZE, 1, w, 1, must_succeed);
-	if(err == 0) return;
-	else if((err == -EFAULT) || (err == -ENOMEM)){
-		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
-		protect_vm_page(addr, w, 1);
-	}
-	else panic("protect_vm_page : protect failed, errno = %d\n", err);
-}
-
-void mprotect_kernel_vm(int w)
-{
-	struct mm_struct *mm;
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	unsigned long addr;
-	
-	mm = &init_mm;
-	for(addr = start_vm; addr < end_vm;){
-		pgd = pgd_offset(mm, addr);
-		pud = pud_offset(pgd, addr);
-		pmd = pmd_offset(pud, addr);
-		if(pmd_present(*pmd)){
-			pte = pte_offset_kernel(pmd, addr);
-			if(pte_present(*pte)) protect_vm_page(addr, w, 0);
-			addr += PAGE_SIZE;
-		}
-		else addr += PMD_SIZE;
-	}
-}
-
 void flush_tlb_kernel_vm_tt(void)
 {
         flush_tlb_kernel_range(start_vm, end_vm);

+ 0 - 1
arch/x86_64/ia32/ia32_aout.c

@@ -314,7 +314,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
 	current->mm->cached_hole_size = 0;
 
-	set_mm_counter(current->mm, rss, 0);
 	current->mm->mmap = NULL;
 	compute_creds(bprm);
  	current->flags &= ~PF_FORKNOEXEC;

+ 1 - 3
arch/x86_64/mm/ioremap.c

@@ -60,7 +60,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo
 	if (address >= end)
 		BUG();
 	do {
-		pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+		pte_t * pte = pte_alloc_kernel(pmd, address);
 		if (!pte)
 			return -ENOMEM;
 		remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -105,7 +105,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 	flush_cache_all();
 	if (address >= end)
 		BUG();
-	spin_lock(&init_mm.page_table_lock);
 	do {
 		pud_t *pud;
 		pud = pud_alloc(&init_mm, pgd, address);
@@ -119,7 +118,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr,
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		pgd++;
 	} while (address && (address < end));
-	spin_unlock(&init_mm.page_table_lock);
 	flush_tlb_all();
 	return error;
 }
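
The remap_area_pages() hunks in this commit (several architectures get the same treatment) all follow one shape: the caller no longer wraps the loop in init_mm.page_table_lock, and pte_alloc_kernel() loses its mm argument, the assumption being that kernel page-table allocation now serializes internally. The per-pmd step reduces to roughly:

	/* was: pte_alloc_kernel(&init_mm, pmd, address) under init_mm.page_table_lock */
	pte_t *pte = pte_alloc_kernel(pmd, address);
	if (!pte)
		return -ENOMEM;
	remap_area_pte(pte, address, end - address, address + phys_addr, flags);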

+ 4 - 1
crypto/api.c

@@ -215,7 +215,10 @@ int crypto_register_alg(struct crypto_alg *alg)
 	if (alg->cra_alignmask & (alg->cra_alignmask + 1))
 		return -EINVAL;
 
-	if (alg->cra_alignmask > PAGE_SIZE)
+	if (alg->cra_alignmask & alg->cra_blocksize)
+		return -EINVAL;
+
+	if (alg->cra_blocksize > PAGE_SIZE)
 		return -EINVAL;
 	
 	down_write(&crypto_alg_sem);

+ 5 - 14
crypto/hmac.c

@@ -18,18 +18,15 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/slab.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include "internal.h"
 
 static void hash_key(struct crypto_tfm *tfm, u8 *key, unsigned int keylen)
 {
 	struct scatterlist tmp;
 	
-	tmp.page = virt_to_page(key);
-	tmp.offset = offset_in_page(key);
-	tmp.length = keylen;
+	sg_set_buf(&tmp, key, keylen);
 	crypto_digest_digest(tfm, &tmp, 1, key);
-		
 }
 
 int crypto_alloc_hmac_block(struct crypto_tfm *tfm)
@@ -69,9 +66,7 @@ void crypto_hmac_init(struct crypto_tfm *tfm, u8 *key, unsigned int *keylen)
 	for (i = 0; i < crypto_tfm_alg_blocksize(tfm); i++)
 		ipad[i] ^= 0x36;
 
-	tmp.page = virt_to_page(ipad);
-	tmp.offset = offset_in_page(ipad);
-	tmp.length = crypto_tfm_alg_blocksize(tfm);
+	sg_set_buf(&tmp, ipad, crypto_tfm_alg_blocksize(tfm));
 	
 	crypto_digest_init(tfm);
 	crypto_digest_update(tfm, &tmp, 1);
@@ -103,16 +98,12 @@ void crypto_hmac_final(struct crypto_tfm *tfm, u8 *key,
 	for (i = 0; i < crypto_tfm_alg_blocksize(tfm); i++)
 		opad[i] ^= 0x5c;
 
-	tmp.page = virt_to_page(opad);
-	tmp.offset = offset_in_page(opad);
-	tmp.length = crypto_tfm_alg_blocksize(tfm);
+	sg_set_buf(&tmp, opad, crypto_tfm_alg_blocksize(tfm));
 
 	crypto_digest_init(tfm);
 	crypto_digest_update(tfm, &tmp, 1);
 	
-	tmp.page = virt_to_page(out);
-	tmp.offset = offset_in_page(out);
-	tmp.length = crypto_tfm_alg_digestsize(tfm);
+	sg_set_buf(&tmp, out, crypto_tfm_alg_digestsize(tfm));
 	
 	crypto_digest_update(tfm, &tmp, 1);
 	crypto_digest_final(tfm, out);

+ 17 - 39
crypto/tcrypt.c

@@ -21,7 +21,7 @@
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <linux/string.h>
 #include <linux/crypto.h>
 #include <linux/highmem.h>
@@ -86,7 +86,6 @@ static void hexdump(unsigned char *buf, unsigned int len)
 static void test_hash(char *algo, struct hash_testvec *template,
 		      unsigned int tcount)
 {
-	char *p;
 	unsigned int i, j, k, temp;
 	struct scatterlist sg[8];
 	char result[64];
@@ -116,10 +115,7 @@ static void test_hash(char *algo, struct hash_testvec *template,
 		printk("test %u:\n", i + 1);
 		memset(result, 0, 64);
 
-		p = hash_tv[i].plaintext;
-		sg[0].page = virt_to_page(p);
-		sg[0].offset = offset_in_page(p);
-		sg[0].length = hash_tv[i].psize;
+		sg_set_buf(&sg[0], hash_tv[i].plaintext, hash_tv[i].psize);
 
 		crypto_digest_init(tfm);
 		if (tfm->crt_u.digest.dit_setkey) {
@@ -154,10 +150,8 @@ static void test_hash(char *algo, struct hash_testvec *template,
 				       hash_tv[i].plaintext + temp,
 				       hash_tv[i].tap[k]);
 				temp += hash_tv[i].tap[k];
-				p = &xbuf[IDX[k]];
-				sg[k].page = virt_to_page(p);
-				sg[k].offset = offset_in_page(p);
-				sg[k].length = hash_tv[i].tap[k];
+				sg_set_buf(&sg[k], &xbuf[IDX[k]],
+					    hash_tv[i].tap[k]);
 			}
 
 			crypto_digest_digest(tfm, sg, hash_tv[i].np, result);
@@ -179,7 +173,6 @@ static void test_hash(char *algo, struct hash_testvec *template,
 static void test_hmac(char *algo, struct hmac_testvec *template,
 		      unsigned int tcount)
 {
-	char *p;
 	unsigned int i, j, k, temp;
 	struct scatterlist sg[8];
 	char result[64];
@@ -210,11 +203,8 @@ static void test_hmac(char *algo, struct hmac_testvec *template,
 		printk("test %u:\n", i + 1);
 		memset(result, 0, sizeof (result));
 
-		p = hmac_tv[i].plaintext;
 		klen = hmac_tv[i].ksize;
-		sg[0].page = virt_to_page(p);
-		sg[0].offset = offset_in_page(p);
-		sg[0].length = hmac_tv[i].psize;
+		sg_set_buf(&sg[0], hmac_tv[i].plaintext, hmac_tv[i].psize);
 
 		crypto_hmac(tfm, hmac_tv[i].key, &klen, sg, 1, result);
 
@@ -243,10 +233,8 @@ static void test_hmac(char *algo, struct hmac_testvec *template,
 				       hmac_tv[i].plaintext + temp,
 				       hmac_tv[i].tap[k]);
 				temp += hmac_tv[i].tap[k];
-				p = &xbuf[IDX[k]];
-				sg[k].page = virt_to_page(p);
-				sg[k].offset = offset_in_page(p);
-				sg[k].length = hmac_tv[i].tap[k];
+				sg_set_buf(&sg[k], &xbuf[IDX[k]],
+					    hmac_tv[i].tap[k]);
 			}
 
 			crypto_hmac(tfm, hmac_tv[i].key, &klen, sg,
@@ -270,7 +258,7 @@ static void test_cipher(char *algo, int mode, int enc,
 {
 	unsigned int ret, i, j, k, temp;
 	unsigned int tsize;
-	char *p, *q;
+	char *q;
 	struct crypto_tfm *tfm;
 	char *key;
 	struct cipher_testvec *cipher_tv;
@@ -330,10 +318,8 @@ static void test_cipher(char *algo, int mode, int enc,
 					goto out;
 			}
 
-			p = cipher_tv[i].input;
-			sg[0].page = virt_to_page(p);
-			sg[0].offset = offset_in_page(p);
-			sg[0].length = cipher_tv[i].ilen;
+			sg_set_buf(&sg[0], cipher_tv[i].input,
+				   cipher_tv[i].ilen);
 
 			if (!mode) {
 				crypto_cipher_set_iv(tfm, cipher_tv[i].iv,
@@ -389,10 +375,8 @@ static void test_cipher(char *algo, int mode, int enc,
 				       cipher_tv[i].input + temp,
 				       cipher_tv[i].tap[k]);
 				temp += cipher_tv[i].tap[k];
-				p = &xbuf[IDX[k]];
-				sg[k].page = virt_to_page(p);
-				sg[k].offset = offset_in_page(p);
-				sg[k].length = cipher_tv[i].tap[k];
+				sg_set_buf(&sg[k], &xbuf[IDX[k]],
+					   cipher_tv[i].tap[k]);
 			}
 
 			if (!mode) {
@@ -431,14 +415,12 @@ out:
 static int test_cipher_jiffies(struct crypto_tfm *tfm, int enc, char *p,
 			       int blen, int sec)
 {
-	struct scatterlist sg[8];
+	struct scatterlist sg[1];
 	unsigned long start, end;
 	int bcount;
 	int ret;
 
-	sg[0].page = virt_to_page(p);
-	sg[0].offset = offset_in_page(p);
-	sg[0].length = blen;
+	sg_set_buf(sg, p, blen);
 
 	for (start = jiffies, end = start + sec * HZ, bcount = 0;
 	     time_before(jiffies, end); bcount++) {
@@ -459,14 +441,12 @@ static int test_cipher_jiffies(struct crypto_tfm *tfm, int enc, char *p,
 static int test_cipher_cycles(struct crypto_tfm *tfm, int enc, char *p,
 			      int blen)
 {
-	struct scatterlist sg[8];
+	struct scatterlist sg[1];
 	unsigned long cycles = 0;
 	int ret = 0;
 	int i;
 
-	sg[0].page = virt_to_page(p);
-	sg[0].offset = offset_in_page(p);
-	sg[0].length = blen;
+	sg_set_buf(sg, p, blen);
 
 	local_bh_disable();
 	local_irq_disable();
@@ -709,9 +689,7 @@ static void test_crc32c(void)
 	for (i = 0; i < NUMVEC; i++) {
 		for (j = 0; j < VECSIZE; j++)
 			test_vec[i][j] = ++b;
-		sg[i].page = virt_to_page(test_vec[i]);
-		sg[i].offset = offset_in_page(test_vec[i]);
-		sg[i].length = VECSIZE;
+		sg_set_buf(&sg[i], test_vec[i], VECSIZE);
 	}
 
 	seed = SEEDTESTVAL;
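
The crypto, dm-crypt, libata, SCSI and USB hunks in this commit repeatedly collapse the open-coded scatterlist setup into helpers from <linux/scatterlist.h>. The transformation is mechanical; for a virtually contiguous buffer buf of len bytes:

	/* before */
	sg->page   = virt_to_page(buf);
	sg->offset = offset_in_page(buf);
	sg->length = len;

	/* after */
	sg_set_buf(sg, buf, len);

	/* sg_init_one() appears to zero the entry first as well, which is why
	 * the usbtest hunk can drop its explicit memset() of the array */
	sg_init_one(sg, buf, len);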

+ 2 - 3
drivers/acpi/acpi_memhotplug.c

@@ -200,8 +200,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 	 * Note: Assume that this function returns zero on success
 	 */
 	result = add_memory(mem_device->start_addr,
-			    (mem_device->end_addr - mem_device->start_addr) + 1,
-			    mem_device->read_write_attribute);
+			    (mem_device->end_addr - mem_device->start_addr) + 1);
 	if (result) {
 		ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "\nadd_memory failed\n"));
 		mem_device->state = MEMORY_INVALID_STATE;
@@ -259,7 +258,7 @@ static int acpi_memory_disable_device(struct acpi_memory_device *mem_device)
 	 * Ask the VM to offline this memory range.
 	 * Note: Assume that this function returns zero on success
 	 */
-	result = remove_memory(start, len, attr);
+	result = remove_memory(start, len);
 	if (result) {
 		ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Hot-Remove failed.\n"));
 		return_VALUE(result);

+ 1 - 0
drivers/base/Makefile

@@ -7,6 +7,7 @@ obj-y			:= core.o sys.o bus.o dd.o \
 obj-y			+= power/
 obj-$(CONFIG_FW_LOADER)	+= firmware_class.o
 obj-$(CONFIG_NUMA)	+= node.o
+obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o
 
 ifeq ($(CONFIG_DEBUG_DRIVER),y)
 EXTRA_CFLAGS += -DDEBUG

+ 2 - 0
drivers/base/init.c

@@ -9,6 +9,7 @@
 
 #include <linux/device.h>
 #include <linux/init.h>
+#include <linux/memory.h>
 
 #include "base.h"
 
@@ -33,5 +34,6 @@ void __init driver_init(void)
 	platform_bus_init();
 	system_bus_init();
 	cpu_dev_init();
+	memory_dev_init();
 	attribute_container_init();
 }

+ 452 - 0
drivers/base/memory.c

@@ -0,0 +1,452 @@
+/*
+ * drivers/base/memory.c - basic Memory class support
+ *
+ * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
+ *            Dave Hansen <haveblue@us.ibm.com>
+ *
+ * This file provides the necessary infrastructure to represent
+ * a SPARSEMEM-memory-model system's physical memory in /sysfs.
+ * All arch-independent code that assumes MEMORY_HOTPLUG requires
+ * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
+ */
+
+#include <linux/sysdev.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>	/* capable() */
+#include <linux/topology.h>
+#include <linux/device.h>
+#include <linux/memory.h>
+#include <linux/kobject.h>
+#include <linux/memory_hotplug.h>
+#include <linux/mm.h>
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+
+#define MEMORY_CLASS_NAME	"memory"
+
+static struct sysdev_class memory_sysdev_class = {
+	set_kset_name(MEMORY_CLASS_NAME),
+};
+EXPORT_SYMBOL(memory_sysdev_class);
+
+static char *memory_hotplug_name(struct kset *kset, struct kobject *kobj)
+{
+	return MEMORY_CLASS_NAME;
+}
+
+static int memory_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
+			int num_envp, char *buffer, int buffer_size)
+{
+	int retval = 0;
+
+	return retval;
+}
+
+static struct kset_hotplug_ops memory_hotplug_ops = {
+	.name		= memory_hotplug_name,
+	.hotplug	= memory_hotplug,
+};
+
+static struct notifier_block *memory_chain;
+
+static int register_memory_notifier(struct notifier_block *nb)
+{
+        return notifier_chain_register(&memory_chain, nb);
+}
+
+static void unregister_memory_notifier(struct notifier_block *nb)
+{
+        notifier_chain_unregister(&memory_chain, nb);
+}
+
+/*
+ * register_memory - Setup a sysfs device for a memory block
+ */
+static int
+register_memory(struct memory_block *memory, struct mem_section *section,
+		struct node *root)
+{
+	int error;
+
+	memory->sysdev.cls = &memory_sysdev_class;
+	memory->sysdev.id = __section_nr(section);
+
+	error = sysdev_register(&memory->sysdev);
+
+	if (root && !error)
+		error = sysfs_create_link(&root->sysdev.kobj,
+					  &memory->sysdev.kobj,
+					  kobject_name(&memory->sysdev.kobj));
+
+	return error;
+}
+
+static void
+unregister_memory(struct memory_block *memory, struct mem_section *section,
+		struct node *root)
+{
+	BUG_ON(memory->sysdev.cls != &memory_sysdev_class);
+	BUG_ON(memory->sysdev.id != __section_nr(section));
+
+	sysdev_unregister(&memory->sysdev);
+	if (root)
+		sysfs_remove_link(&root->sysdev.kobj,
+				  kobject_name(&memory->sysdev.kobj));
+}
+
+/*
+ * use this as the physical section index that this memsection
+ * uses.
+ */
+
+static ssize_t show_mem_phys_index(struct sys_device *dev, char *buf)
+{
+	struct memory_block *mem =
+		container_of(dev, struct memory_block, sysdev);
+	return sprintf(buf, "%08lx\n", mem->phys_index);
+}
+
+/*
+ * online, offline, going offline, etc.
+ */
+static ssize_t show_mem_state(struct sys_device *dev, char *buf)
+{
+	struct memory_block *mem =
+		container_of(dev, struct memory_block, sysdev);
+	ssize_t len = 0;
+
+	/*
+	 * We can probably put these states in a nice little array
+	 * so that they're not open-coded
+	 */
+	switch (mem->state) {
+		case MEM_ONLINE:
+			len = sprintf(buf, "online\n");
+			break;
+		case MEM_OFFLINE:
+			len = sprintf(buf, "offline\n");
+			break;
+		case MEM_GOING_OFFLINE:
+			len = sprintf(buf, "going-offline\n");
+			break;
+		default:
+			len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
+					mem->state);
+			WARN_ON(1);
+			break;
+	}
+
+	return len;
+}
+
+static inline int memory_notify(unsigned long val, void *v)
+{
+	return notifier_call_chain(&memory_chain, val, v);
+}
+
+/*
+ * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
+ * OK to have direct references to sparsemem variables in here.
+ */
+static int
+memory_block_action(struct memory_block *mem, unsigned long action)
+{
+	int i;
+	unsigned long psection;
+	unsigned long start_pfn, start_paddr;
+	struct page *first_page;
+	int ret;
+	int old_state = mem->state;
+
+	psection = mem->phys_index;
+	first_page = pfn_to_page(psection << PFN_SECTION_SHIFT);
+
+	/*
+	 * The probe routines leave the pages reserved, just
+	 * as the bootmem code does.  Make sure they're still
+	 * that way.
+	 */
+	if (action == MEM_ONLINE) {
+		for (i = 0; i < PAGES_PER_SECTION; i++) {
+			if (PageReserved(first_page+i))
+				continue;
+
+			printk(KERN_WARNING "section number %ld page number %d "
+				"not reserved, was it already online? \n",
+				psection, i);
+			return -EBUSY;
+		}
+	}
+
+	switch (action) {
+		case MEM_ONLINE:
+			start_pfn = page_to_pfn(first_page);
+			ret = online_pages(start_pfn, PAGES_PER_SECTION);
+			break;
+		case MEM_OFFLINE:
+			mem->state = MEM_GOING_OFFLINE;
+			memory_notify(MEM_GOING_OFFLINE, NULL);
+			start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
+			ret = remove_memory(start_paddr,
+					    PAGES_PER_SECTION << PAGE_SHIFT);
+			if (ret) {
+				mem->state = old_state;
+				break;
+			}
+			memory_notify(MEM_MAPPING_INVALID, NULL);
+			break;
+		default:
+			printk(KERN_WARNING "%s(%p, %ld) unknown action: %ld\n",
+					__FUNCTION__, mem, action, action);
+			WARN_ON(1);
+			ret = -EINVAL;
+	}
+	/*
+	 * For now, only notify on successful memory operations
+	 */
+	if (!ret)
+		memory_notify(action, NULL);
+
+	return ret;
+}
+
+static int memory_block_change_state(struct memory_block *mem,
+		unsigned long to_state, unsigned long from_state_req)
+{
+	int ret = 0;
+	down(&mem->state_sem);
+
+	if (mem->state != from_state_req) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = memory_block_action(mem, to_state);
+	if (!ret)
+		mem->state = to_state;
+
+out:
+	up(&mem->state_sem);
+	return ret;
+}
+
+static ssize_t
+store_mem_state(struct sys_device *dev, const char *buf, size_t count)
+{
+	struct memory_block *mem;
+	unsigned int phys_section_nr;
+	int ret = -EINVAL;
+
+	mem = container_of(dev, struct memory_block, sysdev);
+	phys_section_nr = mem->phys_index;
+
+	if (!valid_section_nr(phys_section_nr))
+		goto out;
+
+	if (!strncmp(buf, "online", min((int)count, 6)))
+		ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
+	else if(!strncmp(buf, "offline", min((int)count, 7)))
+		ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
+out:
+	if (ret)
+		return ret;
+	return count;
+}
+
+/*
+ * phys_device is a bad name for this.  What I really want
+ * is a way to differentiate between memory ranges that
+ * are part of physical devices that constitute
+ * a complete removable unit or fru.
+ * i.e. do these ranges belong to the same physical device,
+ * s.t. if I offline all of these sections I can then
+ * remove the physical device?
+ */
+static ssize_t show_phys_device(struct sys_device *dev, char *buf)
+{
+	struct memory_block *mem =
+		container_of(dev, struct memory_block, sysdev);
+	return sprintf(buf, "%d\n", mem->phys_device);
+}
+
+static SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL);
+static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state);
+static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL);
+
+#define mem_create_simple_file(mem, attr_name)	\
+	sysdev_create_file(&mem->sysdev, &attr_##attr_name)
+#define mem_remove_simple_file(mem, attr_name)	\
+	sysdev_remove_file(&mem->sysdev, &attr_##attr_name)
+
+/*
+ * Block size attribute stuff
+ */
+static ssize_t
+print_block_size(struct class *class, char *buf)
+{
+	return sprintf(buf, "%lx\n", (unsigned long)PAGES_PER_SECTION * PAGE_SIZE);
+}
+
+static CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL);
+
+static int block_size_init(void)
+{
+	sysfs_create_file(&memory_sysdev_class.kset.kobj,
+		&class_attr_block_size_bytes.attr);
+	return 0;
+}
+
+/*
+ * Some architectures will have custom drivers to do this, and
+ * will not need to do it from userspace.  The fake hot-add code
+ * as well as ppc64 will do all of their discovery in userspace
+ * and will require this interface.
+ */
+#ifdef CONFIG_ARCH_MEMORY_PROBE
+static ssize_t
+memory_probe_store(struct class *class, const char __user *buf, size_t count)
+{
+	u64 phys_addr;
+	int ret;
+
+	phys_addr = simple_strtoull(buf, NULL, 0);
+
+	ret = add_memory(phys_addr, PAGES_PER_SECTION << PAGE_SHIFT);
+
+	if (ret)
+		count = ret;
+
+	return count;
+}
+static CLASS_ATTR(probe, 0700, NULL, memory_probe_store);
+
+static int memory_probe_init(void)
+{
+	sysfs_create_file(&memory_sysdev_class.kset.kobj,
+		&class_attr_probe.attr);
+	return 0;
+}
+#else
+#define memory_probe_init(...)	do {} while (0)
+#endif
+
+/*
+ * Note that phys_device is optional.  It is here to allow for
+ * differentiation between which *physical* devices each
+ * section belongs to...
+ */
+
+static int add_memory_block(unsigned long node_id, struct mem_section *section,
+		     unsigned long state, int phys_device)
+{
+	struct memory_block *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+	int ret = 0;
+
+	if (!mem)
+		return -ENOMEM;
+
+	mem->phys_index = __section_nr(section);
+	mem->state = state;
+	init_MUTEX(&mem->state_sem);
+	mem->phys_device = phys_device;
+
+	ret = register_memory(mem, section, NULL);
+	if (!ret)
+		ret = mem_create_simple_file(mem, phys_index);
+	if (!ret)
+		ret = mem_create_simple_file(mem, state);
+	if (!ret)
+		ret = mem_create_simple_file(mem, phys_device);
+
+	return ret;
+}
+
+/*
+ * For now, we have a linear search to go find the appropriate
+ * memory_block corresponding to a particular phys_index. If
+ * this gets to be a real problem, we can always use a radix
+ * tree or something here.
+ *
+ * This could be made generic for all sysdev classes.
+ */
+static struct memory_block *find_memory_block(struct mem_section *section)
+{
+	struct kobject *kobj;
+	struct sys_device *sysdev;
+	struct memory_block *mem;
+	char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1];
+
+	/*
+	 * This only works because we know that section == sysdev->id
+	 * slightly redundant with sysdev_register()
+	 */
+	sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section));
+
+	kobj = kset_find_obj(&memory_sysdev_class.kset, name);
+	if (!kobj)
+		return NULL;
+
+	sysdev = container_of(kobj, struct sys_device, kobj);
+	mem = container_of(sysdev, struct memory_block, sysdev);
+
+	return mem;
+}
+
+int remove_memory_block(unsigned long node_id, struct mem_section *section,
+		int phys_device)
+{
+	struct memory_block *mem;
+
+	mem = find_memory_block(section);
+	mem_remove_simple_file(mem, phys_index);
+	mem_remove_simple_file(mem, state);
+	mem_remove_simple_file(mem, phys_device);
+	unregister_memory(mem, section, NULL);
+
+	return 0;
+}
+
+/*
+ * need an interface for the VM to add new memory regions,
+ * but without onlining it.
+ */
+int register_new_memory(struct mem_section *section)
+{
+	return add_memory_block(0, section, MEM_OFFLINE, 0);
+}
+
+int unregister_memory_section(struct mem_section *section)
+{
+	if (!valid_section(section))
+		return -EINVAL;
+
+	return remove_memory_block(0, section, 0);
+}
+
+/*
+ * Initialize the sysfs support for memory devices...
+ */
+int __init memory_dev_init(void)
+{
+	unsigned int i;
+	int ret;
+
+	memory_sysdev_class.kset.hotplug_ops = &memory_hotplug_ops;
+	ret = sysdev_class_register(&memory_sysdev_class);
+
+	/*
+	 * Create entries for memory sections that were found
+	 * during boot and have been initialized
+	 */
+	for (i = 0; i < NR_MEM_SECTIONS; i++) {
+		if (!valid_section_nr(i))
+			continue;
+		add_memory_block(0, __nr_to_section(i), MEM_ONLINE, 0);
+	}
+
+	memory_probe_init();
+	block_size_init();
+
+	return ret;
+}
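
With the memory sysdev class above, each SPARSEMEM section should appear as /sys/devices/system/memory/memoryN carrying the phys_index, state and phys_device attributes (path assumed from the class name and sysdev id; compare find_memory_block()). A hypothetical userspace sketch that onlines a section by writing the string store_mem_state() matches:

	#include <stdio.h>

	/* path layout assumed, not defined by this patch */
	static int memory_section_online(unsigned long section_nr)
	{
		char path[64];
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/memory/memory%lu/state", section_nr);
		f = fopen(path, "w");
		if (!f)
			return -1;
		if (fputs("online", f) < 0) {	/* MEM_OFFLINE -> MEM_ONLINE */
			fclose(f);
			return -1;
		}
		return fclose(f);
	}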

+ 4 - 8
drivers/md/dm-crypt.c

@@ -15,7 +15,7 @@
 #include <linux/crypto.h>
 #include <linux/workqueue.h>
 #include <asm/atomic.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <asm/page.h>
 
 #include "dm.h"
@@ -164,9 +164,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 		return -ENOMEM;
 	}
 
-	sg.page = virt_to_page(cc->key);
-	sg.offset = offset_in_page(cc->key);
-	sg.length = cc->key_size;
+	sg_set_buf(&sg, cc->key, cc->key_size);
 	crypto_digest_digest(hash_tfm, &sg, 1, salt);
 	crypto_free_tfm(hash_tfm);
 
@@ -207,14 +205,12 @@ static void crypt_iv_essiv_dtr(struct crypt_config *cc)
 
 static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
 {
-	struct scatterlist sg = { NULL, };
+	struct scatterlist sg;
 
 	memset(iv, 0, cc->iv_size);
 	*(u64 *)iv = cpu_to_le64(sector);
 
-	sg.page = virt_to_page(iv);
-	sg.offset = offset_in_page(iv);
-	sg.length = cc->iv_size;
+	sg_set_buf(&sg, iv, cc->iv_size);
 	crypto_cipher_encrypt((struct crypto_tfm *)cc->iv_gen_private,
 	                      &sg, &sg, cc->iv_size);
 

+ 3 - 4
drivers/net/wireless/airo.c

@@ -35,6 +35,7 @@
 #include <linux/interrupt.h>
 #include <linux/in.h>
 #include <linux/bitops.h>
+#include <linux/scatterlist.h>
 #include <asm/io.h>
 #include <asm/system.h>
 
@@ -1590,11 +1591,9 @@ static void emmh32_setseed(emmh32_context *context, u8 *pkey, int keylen, struct
 		aes_counter[12] = (u8)(counter >> 24);
 		counter++;
 		memcpy (plain, aes_counter, 16);
-		sg[0].page = virt_to_page(plain);
-		sg[0].offset = ((long) plain & ~PAGE_MASK);
-		sg[0].length = 16;
+		sg_set_buf(sg, plain, 16);
 		crypto_cipher_encrypt(tfm, sg, sg, 16);
-		cipher = kmap(sg[0].page) + sg[0].offset;
+		cipher = kmap(sg->page) + sg->offset;
 		for (j=0; (j<16) && (i< (sizeof(context->coeff)/sizeof(context->coeff[0]))); ) {
 			context->coeff[i++] = ntohl(*(u32 *)&cipher[j]);
 			j += 4;

+ 0 - 9
drivers/scsi/ahci.c

@@ -192,7 +192,6 @@ static void ahci_port_stop(struct ata_port *ap);
 static void ahci_tf_read(struct ata_port *ap, struct ata_taskfile *tf);
 static void ahci_qc_prep(struct ata_queued_cmd *qc);
 static u8 ahci_check_status(struct ata_port *ap);
-static u8 ahci_check_err(struct ata_port *ap);
 static inline int ahci_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc);
 static void ahci_remove_one (struct pci_dev *pdev);
 
@@ -221,7 +220,6 @@ static const struct ata_port_operations ahci_ops = {
 
 	.check_status		= ahci_check_status,
 	.check_altstatus	= ahci_check_status,
-	.check_err		= ahci_check_err,
 	.dev_select		= ata_noop_dev_select,
 
 	.tf_read		= ahci_tf_read,
@@ -466,13 +464,6 @@ static u8 ahci_check_status(struct ata_port *ap)
 	return readl(mmio + PORT_TFDATA) & 0xFF;
 }
 
-static u8 ahci_check_err(struct ata_port *ap)
-{
-	void __iomem *mmio = (void __iomem *) ap->ioaddr.cmd_addr;
-
-	return (readl(mmio + PORT_TFDATA) >> 8) & 0xFF;
-}
-
 static void ahci_tf_read(struct ata_port *ap, struct ata_taskfile *tf)
 {
 	struct ahci_port_priv *pp = ap->private_data;

+ 3 - 3
drivers/scsi/arm/scsi.h

@@ -10,6 +10,8 @@
  *  Commonly used scsi driver functions.
  */
 
+#include <linux/scatterlist.h>
+
 #define BELT_AND_BRACES
 
 /*
@@ -22,9 +24,7 @@ static inline int copy_SCp_to_sg(struct scatterlist *sg, Scsi_Pointer *SCp, int
 
 	BUG_ON(bufs + 1 > max);
 
-	sg->page   = virt_to_page(SCp->ptr);
-	sg->offset = offset_in_page(SCp->ptr);
-	sg->length = SCp->this_residual;
+	sg_set_buf(sg, SCp->ptr, SCp->this_residual);
 
 	if (bufs)
 		memcpy(sg + 1, SCp->buffer + 1,

+ 11 - 35
drivers/scsi/libata-core.c

@@ -49,6 +49,7 @@
 #include <linux/suspend.h>
 #include <linux/workqueue.h>
 #include <linux/jiffies.h>
+#include <linux/scatterlist.h>
 #include <scsi/scsi.h>
 #include "scsi.h"
 #include "scsi_priv.h"
@@ -371,7 +372,7 @@ static void ata_tf_read_pio(struct ata_port *ap, struct ata_taskfile *tf)
 	struct ata_ioports *ioaddr = &ap->ioaddr;
 
 	tf->command = ata_check_status(ap);
-	tf->feature = ata_chk_err(ap);
+	tf->feature = inb(ioaddr->error_addr);
 	tf->nsect = inb(ioaddr->nsect_addr);
 	tf->lbal = inb(ioaddr->lbal_addr);
 	tf->lbam = inb(ioaddr->lbam_addr);
@@ -405,7 +406,7 @@ static void ata_tf_read_mmio(struct ata_port *ap, struct ata_taskfile *tf)
 	struct ata_ioports *ioaddr = &ap->ioaddr;
 
 	tf->command = ata_check_status(ap);
-	tf->feature = ata_chk_err(ap);
+	tf->feature = readb((void __iomem *)ioaddr->error_addr);
 	tf->nsect = readb((void __iomem *)ioaddr->nsect_addr);
 	tf->lbal = readb((void __iomem *)ioaddr->lbal_addr);
 	tf->lbam = readb((void __iomem *)ioaddr->lbam_addr);
@@ -525,30 +526,6 @@ u8 ata_altstatus(struct ata_port *ap)
 }
 
 
-/**
- *	ata_chk_err - Read device error reg
- *	@ap: port where the device is
- *
- *	Reads ATA taskfile error register for
- *	currently-selected device and return its value.
- *
- *	Note: may NOT be used as the check_err() entry in
- *	ata_port_operations.
- *
- *	LOCKING:
- *	Inherited from caller.
- */
-u8 ata_chk_err(struct ata_port *ap)
-{
-	if (ap->ops->check_err)
-		return ap->ops->check_err(ap);
-
-	if (ap->flags & ATA_FLAG_MMIO) {
-		return readb((void __iomem *) ap->ioaddr.error_addr);
-	}
-	return inb(ap->ioaddr.error_addr);
-}
-
 /**
  *	ata_tf_to_fis - Convert ATA taskfile to SATA FIS structure
  *	@tf: Taskfile to convert
@@ -901,8 +878,8 @@ static u8 ata_dev_try_classify(struct ata_port *ap, unsigned int device)
 
 	memset(&tf, 0, sizeof(tf));
 
-	err = ata_chk_err(ap);
 	ap->ops->tf_read(ap, &tf);
+	err = tf.feature;
 
 	dev->class = ATA_DEV_NONE;
 
@@ -1139,7 +1116,6 @@ static void ata_dev_identify(struct ata_port *ap, unsigned int device)
 	unsigned int major_version;
 	u16 tmp;
 	unsigned long xfer_modes;
-	u8 status;
 	unsigned int using_edd;
 	DECLARE_COMPLETION(wait);
 	struct ata_queued_cmd *qc;
@@ -1193,8 +1169,11 @@ retry:
 	else
 		wait_for_completion(&wait);
 
-	status = ata_chk_status(ap);
-	if (status & ATA_ERR) {
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+	ap->ops->tf_read(ap, &qc->tf);
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+
+	if (qc->tf.command & ATA_ERR) {
 		/*
 		 * arg!  EDD works for all test cases, but seems to return
 		 * the ATA signature for some ATAPI devices.  Until the
@@ -1207,7 +1186,7 @@ retry:
 		 * to have this problem.
 		 */
 		if ((using_edd) && (qc->tf.command == ATA_CMD_ID_ATA)) {
-			u8 err = ata_chk_err(ap);
+			u8 err = qc->tf.feature;
 			if (err & ATA_ABORTED) {
 				dev->class = ATA_DEV_ATAPI;
 				qc->cursg = 0;
@@ -2609,9 +2588,7 @@ void ata_sg_init_one(struct ata_queued_cmd *qc, void *buf, unsigned int buflen)
 	qc->buf_virt = buf;
 
 	sg = qc->__sg;
-	sg->page = virt_to_page(buf);
-	sg->offset = (unsigned long) buf & ~PAGE_MASK;
-	sg->length = buflen;
+	sg_init_one(sg, buf, buflen);
 }
 
 /**
@@ -4962,7 +4939,6 @@ EXPORT_SYMBOL_GPL(ata_tf_to_fis);
 EXPORT_SYMBOL_GPL(ata_tf_from_fis);
 EXPORT_SYMBOL_GPL(ata_check_status);
 EXPORT_SYMBOL_GPL(ata_altstatus);
-EXPORT_SYMBOL_GPL(ata_chk_err);
 EXPORT_SYMBOL_GPL(ata_exec_command);
 EXPORT_SYMBOL_GPL(ata_port_start);
 EXPORT_SYMBOL_GPL(ata_port_stop);
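
With ata_chk_err() and the ->check_err() hook gone, the error value is taken from the taskfile that ->tf_read() fills in: the PIO and MMIO readers above now load tf->feature straight from error_addr, so callers such as ata_dev_try_classify() read it back as below (sketch of the pattern only):

	struct ata_taskfile tf;

	memset(&tf, 0, sizeof(tf));
	ap->ops->tf_read(ap, &tf);	/* feature now mirrors the error register */
	err = tf.feature;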

+ 0 - 18
drivers/scsi/sata_mv.c

@@ -258,7 +258,6 @@ struct mv_host_priv {
 static void mv_irq_clear(struct ata_port *ap);
 static u32 mv_scr_read(struct ata_port *ap, unsigned int sc_reg_in);
 static void mv_scr_write(struct ata_port *ap, unsigned int sc_reg_in, u32 val);
-static u8 mv_check_err(struct ata_port *ap);
 static void mv_phy_reset(struct ata_port *ap);
 static void mv_host_stop(struct ata_host_set *host_set);
 static int mv_port_start(struct ata_port *ap);
@@ -296,7 +295,6 @@ static const struct ata_port_operations mv_ops = {
 	.tf_load		= ata_tf_load,
 	.tf_read		= ata_tf_read,
 	.check_status		= ata_check_status,
-	.check_err		= mv_check_err,
 	.exec_command		= ata_exec_command,
 	.dev_select		= ata_std_dev_select,
 
@@ -1185,22 +1183,6 @@ static irqreturn_t mv_interrupt(int irq, void *dev_instance,
 	return IRQ_RETVAL(handled);
 }
 
-/**
- *      mv_check_err - Return the error shadow register to caller.
- *      @ap: ATA channel to manipulate
- *
- *      Marvell requires DMA to be stopped before accessing shadow
- *      registers.  So we do that, then return the needed register.
- *
- *      LOCKING:
- *      Inherited from caller.  FIXME: protect mv_stop_dma with lock?
- */
-static u8 mv_check_err(struct ata_port *ap)
-{
-	mv_stop_dma(ap);		/* can't read shadow regs if DMA on */
-	return readb((void __iomem *) ap->ioaddr.error_addr);
-}
-
 /**
  *      mv_phy_reset - Perform eDMA reset followed by COMRESET
  *      @ap: ATA channel to manipulate

+ 0 - 8
drivers/scsi/sata_sil24.c

@@ -225,7 +225,6 @@ struct sil24_host_priv {
 };
 
 static u8 sil24_check_status(struct ata_port *ap);
-static u8 sil24_check_err(struct ata_port *ap);
 static u32 sil24_scr_read(struct ata_port *ap, unsigned sc_reg);
 static void sil24_scr_write(struct ata_port *ap, unsigned sc_reg, u32 val);
 static void sil24_tf_read(struct ata_port *ap, struct ata_taskfile *tf);
@@ -280,7 +279,6 @@ static const struct ata_port_operations sil24_ops = {
 
 	.check_status		= sil24_check_status,
 	.check_altstatus	= sil24_check_status,
-	.check_err		= sil24_check_err,
 	.dev_select		= ata_noop_dev_select,
 
 	.tf_read		= sil24_tf_read,
@@ -363,12 +361,6 @@ static u8 sil24_check_status(struct ata_port *ap)
 	return pp->tf.command;
 }
 
-static u8 sil24_check_err(struct ata_port *ap)
-{
-	struct sil24_port_priv *pp = ap->private_data;
-	return pp->tf.feature;
-}
-
 static int sil24_scr_map[] = {
 	[SCR_CONTROL]	= 0,
 	[SCR_STATUS]	= 1,

+ 10 - 7
drivers/scsi/sg.c

@@ -49,6 +49,7 @@ static int sg_version_num = 30533;	/* 2 digits for each component */
 #include <linux/seq_file.h>
 #include <linux/blkdev.h>
 #include <linux/delay.h>
+#include <linux/scatterlist.h>
 
 #include "scsi.h"
 #include <scsi/scsi_dbg.h>
@@ -1886,13 +1887,17 @@ st_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_pages,
 	int i;
 
 	for (i=0; i < nr_pages; i++) {
-		if (dirtied && !PageReserved(sgl[i].page))
-			SetPageDirty(sgl[i].page);
-		/* unlock_page(sgl[i].page); */
+		struct page *page = sgl[i].page;
+
+		/* XXX: just for debug. Remove when PageReserved is removed */
+		BUG_ON(PageReserved(page));
+		if (dirtied)
+			SetPageDirty(page);
+		/* unlock_page(page); */
 		/* FIXME: cache flush missing for rw==READ
 		 * FIXME: call the correct reference counting function
 		 */
-		page_cache_release(sgl[i].page);
+		page_cache_release(page);
 	}
 
 	return 0;
@@ -1992,9 +1997,7 @@ sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size)
 				if (!p)
 					break;
 			}
-			sclp->page = virt_to_page(p);
-			sclp->offset = offset_in_page(p);
-			sclp->length = ret_sz;
+			sg_set_buf(sclp, p, ret_sz);
 
 			SCSI_LOG_TIMEOUT(5, printk("sg_build_build: k=%d, a=0x%p, len=%d\n",
 					  k, sg_scatg2virt(sclp), ret_sz));

+ 7 - 3
drivers/scsi/st.c

@@ -4526,12 +4526,16 @@ static int sgl_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_p
 	int i;
 
 	for (i=0; i < nr_pages; i++) {
-		if (dirtied && !PageReserved(sgl[i].page))
-			SetPageDirty(sgl[i].page);
+		struct page *page = sgl[i].page;
+
+		/* XXX: just for debug. Remove when PageReserved is removed */
+		BUG_ON(PageReserved(page));
+		if (dirtied)
+			SetPageDirty(page);
 		/* FIXME: cache flush missing for rw==READ
 		 * FIXME: call the correct reference counting function
 		 */
-		page_cache_release(sgl[i].page);
+		page_cache_release(page);
 	}
 
 	return 0;

+ 2 - 5
drivers/usb/misc/usbtest.c

@@ -9,7 +9,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 
 #include <linux/usb.h>
 
@@ -381,7 +381,6 @@ alloc_sglist (int nents, int max, int vary)
 	sg = kmalloc (nents * sizeof *sg, SLAB_KERNEL);
 	if (!sg)
 		return NULL;
-	memset (sg, 0, nents * sizeof *sg);
 
 	for (i = 0; i < nents; i++) {
 		char		*buf;
@@ -394,9 +393,7 @@ alloc_sglist (int nents, int max, int vary)
 		memset (buf, 0, size);
 
 		/* kmalloc pages are always physically contiguous! */
-		sg [i].page = virt_to_page (buf);
-		sg [i].offset = offset_in_page (buf);
-		sg [i].length = size;
+		sg_init_one(&sg[i], buf, size);
 
 		if (vary) {
 			size += vary;

+ 2 - 2
fs/afs/file.c

@@ -291,8 +291,8 @@ static int afs_file_releasepage(struct page *page, gfp_t gfp_flags)
 		cachefs_uncache_page(vnode->cache, page);
 #endif
 
-		pageio = (struct cachefs_page *) page->private;
-		page->private = 0;
+		pageio = (struct cachefs_page *) page_private(page);
+		set_page_private(page, 0);
 		ClearPagePrivate(page);
 
 		if (pageio)

+ 0 - 1
fs/binfmt_aout.c

@@ -318,7 +318,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->free_area_cache = current->mm->mmap_base;
 	current->mm->cached_hole_size = 0;
 
-	set_mm_counter(current->mm, rss, 0);
 	current->mm->mmap = NULL;
 	compute_creds(bprm);
  	current->flags &= ~PF_FORKNOEXEC;

+ 0 - 1
fs/binfmt_elf.c

@@ -773,7 +773,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
 	/* Do this so that we can load the interpreter, if need be.  We will
 	   change some of these later */
-	set_mm_counter(current->mm, rss, 0);
 	current->mm->free_area_cache = current->mm->mmap_base;
 	current->mm->cached_hole_size = 0;
 	retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),

+ 0 - 7
fs/binfmt_elf_fdpic.c

@@ -294,14 +294,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs
 				  &interp_params,
 				  &current->mm->start_stack,
 				  &current->mm->start_brk);
-#endif
-
-	/* do this so that we can load the interpreter, if need be
-	 * - we will change some of these later
-	 */
-	set_mm_counter(current->mm, rss, 0);
 
-#ifdef CONFIG_MMU
 	retval = setup_arg_pages(bprm, current->mm->start_stack, executable_stack);
 	if (retval < 0) {
 		send_sig(SIGKILL, current, 0);

+ 0 - 1
fs/binfmt_flat.c

@@ -650,7 +650,6 @@ static int load_flat_file(struct linux_binprm * bprm,
 		current->mm->start_brk = datapos + data_len + bss_len;
 		current->mm->brk = (current->mm->start_brk + 3) & ~3;
 		current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len;
-		set_mm_counter(current->mm, rss, 0);
 	}
 
 	if (flags & FLAT_FLAG_KTRACE)

+ 0 - 1
fs/binfmt_som.c

@@ -259,7 +259,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	create_som_tables(bprm);
 
 	current->mm->start_stack = bprm->p;
-	set_mm_counter(current->mm, rss, 0);
 
 #if 0
 	printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk);

+ 1 - 1
fs/buffer.c

@@ -96,7 +96,7 @@ static void
 __clear_page_buffers(struct page *page)
 {
 	ClearPagePrivate(page);
-	page->private = 0;
+	set_page_private(page, 0);
 	page_cache_release(page);
 }
 

+ 0 - 1
fs/compat.c

@@ -1490,7 +1490,6 @@ int compat_do_execve(char * filename,
 		/* execve success */
 		security_bprm_free(bprm);
 		acct_update_integrals(current);
-		update_mem_hiwater(current);
 		kfree(bprm);
 		return retval;
 	}

+ 3 - 1
fs/direct-io.c

@@ -162,6 +162,7 @@ static int dio_refill_pages(struct dio *dio)
 	up_read(&current->mm->mmap_sem);
 
 	if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
+		struct page *page = ZERO_PAGE(dio->curr_user_address);
 		/*
 		 * A memory fault, but the filesystem has some outstanding
 		 * mapped blocks.  We need to use those blocks up to avoid
@@ -169,7 +170,8 @@ static int dio_refill_pages(struct dio *dio)
 		 */
 		if (dio->page_errors == 0)
 			dio->page_errors = ret;
-		dio->pages[0] = ZERO_PAGE(dio->curr_user_address);
+		page_cache_get(page);
+		dio->pages[0] = page;
 		dio->head = 0;
 		dio->tail = 1;
 		ret = 0;

+ 6 - 11
fs/exec.c

@@ -309,40 +309,36 @@ void install_arg_page(struct vm_area_struct *vma,
 	pud_t * pud;
 	pmd_t * pmd;
 	pte_t * pte;
+	spinlock_t *ptl;
 
 	if (unlikely(anon_vma_prepare(vma)))
-		goto out_sig;
+		goto out;
 
 	flush_dcache_page(page);
 	pgd = pgd_offset(mm, address);
-
-	spin_lock(&mm->page_table_lock);
 	pud = pud_alloc(mm, pgd, address);
 	if (!pud)
 		goto out;
 	pmd = pmd_alloc(mm, pud, address);
 	if (!pmd)
 		goto out;
-	pte = pte_alloc_map(mm, pmd, address);
+	pte = pte_alloc_map_lock(mm, pmd, address, &ptl);
 	if (!pte)
 		goto out;
 	if (!pte_none(*pte)) {
-		pte_unmap(pte);
+		pte_unmap_unlock(pte, ptl);
 		goto out;
 	}
-	inc_mm_counter(mm, rss);
+	inc_mm_counter(mm, anon_rss);
 	lru_cache_add_active(page);
 	set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
 					page, vma->vm_page_prot))));
 	page_add_anon_rmap(page, vma, address);
-	pte_unmap(pte);
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(pte, ptl);
 
 	/* no need for flush_tlb */
 	return;
 out:
-	spin_unlock(&mm->page_table_lock);
-out_sig:
 	__free_page(page);
 	force_sig(SIGKILL, current);
 }
@@ -1207,7 +1203,6 @@ int do_execve(char * filename,
 		/* execve success */
 		security_bprm_free(bprm);
 		acct_update_integrals(current);
-		update_mem_hiwater(current);
 		kfree(bprm);
 		return retval;
 	}
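
install_arg_page() above shows the allocation-side counterpart of the fault-path change: pte_alloc_map_lock() replaces the spin_lock(&mm->page_table_lock) plus pte_alloc_map() pair, and pte_unmap_unlock() undoes the map and the lock together. Condensed shape of that sequence (error paths trimmed, sketch only):

	spinlock_t *ptl;
	pte_t *pte;

	pte = pte_alloc_map_lock(mm, pmd, address, &ptl);	/* alloc if needed, map, lock */
	if (pte && pte_none(*pte)) {
		inc_mm_counter(mm, anon_rss);
		set_pte_at(mm, address, pte,
			   pte_mkdirty(pte_mkwrite(mk_pte(page, vma->vm_page_prot))));
		page_add_anon_rmap(page, vma, address);
	}
	if (pte)
		pte_unmap_unlock(pte, ptl);			/* kunmap + unlock in one call */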

+ 120 - 86
fs/hugetlbfs/inode.c

@@ -45,10 +45,58 @@ static struct backing_dev_info hugetlbfs_backing_dev_info = {
 
 int sysctl_hugetlb_shm_group;
 
+static void huge_pagevec_release(struct pagevec *pvec)
+{
+	int i;
+
+	for (i = 0; i < pagevec_count(pvec); ++i)
+		put_page(pvec->pages[i]);
+
+	pagevec_reinit(pvec);
+}
+
+/*
+ * huge_pages_needed tries to determine the number of new huge pages that
+ * will be required to fully populate this VMA.  This will be equal to
+ * the size of the VMA in huge pages minus the number of huge pages
+ * (covered by this VMA) that are found in the page cache.
+ *
+ * Result is in bytes to be compatible with is_hugepage_mem_enough()
+ */
+unsigned long
+huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma)
+{
+	int i;
+	struct pagevec pvec;
+	unsigned long start = vma->vm_start;
+	unsigned long end = vma->vm_end;
+	unsigned long hugepages = (end - start) >> HPAGE_SHIFT;
+	pgoff_t next = vma->vm_pgoff;
+	pgoff_t endpg = next + ((end - start) >> PAGE_SHIFT);
+
+	pagevec_init(&pvec, 0);
+	while (next < endpg) {
+		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE))
+			break;
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+			if (page->index > next)
+				next = page->index;
+			if (page->index >= endpg)
+				break;
+			next++;
+			hugepages--;
+		}
+		huge_pagevec_release(&pvec);
+	}
+	return hugepages << HPAGE_SHIFT;
+}
+
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct address_space *mapping = inode->i_mapping;
+	unsigned long bytes;
 	loff_t len, vma_len;
 	int ret;
 
@@ -67,6 +115,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
 		return -EINVAL;
 
+	bytes = huge_pages_needed(mapping, vma);
+	if (!is_hugepage_mem_enough(bytes))
+		return -ENOMEM;
+
 	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
 
 	down(&inode->i_sem);
@@ -79,10 +131,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
 		goto out;
 
-	ret = hugetlb_prefault(mapping, vma);
-	if (ret)
-		goto out;
-
+	ret = 0;
+	hugetlb_prefault_arch_hook(vma->vm_mm);
 	if (inode->i_size < len)
 		inode->i_size = len;
 out:
@@ -92,7 +142,7 @@ out:
 }
 
 /*
- * Called under down_write(mmap_sem), page_table_lock is not held
+ * Called under down_write(mmap_sem).
  */
 
 #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
@@ -171,16 +221,6 @@ static int hugetlbfs_commit_write(struct file *file,
 	return -EINVAL;
 }
 
-static void huge_pagevec_release(struct pagevec *pvec)
-{
-	int i;
-
-	for (i = 0; i < pagevec_count(pvec); ++i)
-		put_page(pvec->pages[i]);
-
-	pagevec_reinit(pvec);
-}
-
 static void truncate_huge_page(struct page *page)
 {
 	clear_page_dirty(page);
@@ -224,52 +264,35 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart)
 
 static void hugetlbfs_delete_inode(struct inode *inode)
 {
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(inode->i_sb);
-
-	hlist_del_init(&inode->i_hash);
-	list_del_init(&inode->i_list);
-	list_del_init(&inode->i_sb_list);
-	inode->i_state |= I_FREEING;
-	inodes_stat.nr_inodes--;
-	spin_unlock(&inode_lock);
-
 	if (inode->i_data.nrpages)
 		truncate_hugepages(&inode->i_data, 0);
-
-	security_inode_delete(inode);
-
-	if (sbinfo->free_inodes >= 0) {
-		spin_lock(&sbinfo->stat_lock);
-		sbinfo->free_inodes++;
-		spin_unlock(&sbinfo->stat_lock);
-	}
-
 	clear_inode(inode);
-	destroy_inode(inode);
 }
 
 static void hugetlbfs_forget_inode(struct inode *inode)
 {
-	struct super_block *super_block = inode->i_sb;
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(super_block);
+	struct super_block *sb = inode->i_sb;
 
-	if (hlist_unhashed(&inode->i_hash))
-		goto out_truncate;
-
-	if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
-		list_del(&inode->i_list);
-		list_add(&inode->i_list, &inode_unused);
-	}
-	inodes_stat.nr_unused++;
-	if (!super_block || (super_block->s_flags & MS_ACTIVE)) {
+	if (!hlist_unhashed(&inode->i_hash)) {
+		if (!(inode->i_state & (I_DIRTY|I_LOCK)))
+			list_move(&inode->i_list, &inode_unused);
+		inodes_stat.nr_unused++;
+		if (!sb || (sb->s_flags & MS_ACTIVE)) {
+			spin_unlock(&inode_lock);
+			return;
+		}
+		inode->i_state |= I_WILL_FREE;
 		spin_unlock(&inode_lock);
-		return;
+		/*
+		 * write_inode_now is a noop as we set BDI_CAP_NO_WRITEBACK
+		 * in our backing_dev_info.
+		 */
+		write_inode_now(inode, 1);
+		spin_lock(&inode_lock);
+		inode->i_state &= ~I_WILL_FREE;
+		inodes_stat.nr_unused--;
+		hlist_del_init(&inode->i_hash);
 	}
-
-	/* write_inode_now() ? */
-	inodes_stat.nr_unused--;
-	hlist_del_init(&inode->i_hash);
-out_truncate:
 	list_del_init(&inode->i_list);
 	list_del_init(&inode->i_sb_list);
 	inode->i_state |= I_FREEING;
@@ -277,13 +300,6 @@ out_truncate:
 	spin_unlock(&inode_lock);
 	if (inode->i_data.nrpages)
 		truncate_hugepages(&inode->i_data, 0);
-
-	if (sbinfo->free_inodes >= 0) {
-		spin_lock(&sbinfo->stat_lock);
-		sbinfo->free_inodes++;
-		spin_unlock(&sbinfo->stat_lock);
-	}
-
 	clear_inode(inode);
 	destroy_inode(inode);
 }
@@ -291,7 +307,7 @@ out_truncate:
 static void hugetlbfs_drop_inode(struct inode *inode)
 {
 	if (!inode->i_nlink)
-		hugetlbfs_delete_inode(inode);
+		generic_delete_inode(inode);
 	else
 		hugetlbfs_forget_inode(inode);
 }
@@ -308,7 +324,6 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
 
 	vma_prio_tree_foreach(vma, &iter, root, h_pgoff, ULONG_MAX) {
 		unsigned long h_vm_pgoff;
-		unsigned long v_length;
 		unsigned long v_offset;
 
 		h_vm_pgoff = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);
@@ -319,11 +334,8 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
 		if (h_vm_pgoff >= h_pgoff)
 			v_offset = 0;
 
-		v_length = vma->vm_end - vma->vm_start;
-
-		zap_hugepage_range(vma,
-				vma->vm_start + v_offset,
-				v_length - v_offset);
+		unmap_hugepage_range(vma,
+				vma->vm_start + v_offset, vma->vm_end);
 	}
 }
 
@@ -379,17 +391,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
 					gid_t gid, int mode, dev_t dev)
 {
 	struct inode *inode;
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
-
-	if (sbinfo->free_inodes >= 0) {
-		spin_lock(&sbinfo->stat_lock);
-		if (!sbinfo->free_inodes) {
-			spin_unlock(&sbinfo->stat_lock);
-			return NULL;
-		}
-		sbinfo->free_inodes--;
-		spin_unlock(&sbinfo->stat_lock);
-	}
 
 	inode = new_inode(sb);
 	if (inode) {
@@ -531,29 +532,51 @@ static void hugetlbfs_put_super(struct super_block *sb)
 	}
 }
 
+static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
+{
+	if (sbinfo->free_inodes >= 0) {
+		spin_lock(&sbinfo->stat_lock);
+		if (unlikely(!sbinfo->free_inodes)) {
+			spin_unlock(&sbinfo->stat_lock);
+			return 0;
+		}
+		sbinfo->free_inodes--;
+		spin_unlock(&sbinfo->stat_lock);
+	}
+
+	return 1;
+}
+
+static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
+{
+	if (sbinfo->free_inodes >= 0) {
+		spin_lock(&sbinfo->stat_lock);
+		sbinfo->free_inodes++;
+		spin_unlock(&sbinfo->stat_lock);
+	}
+}
+
+
 static kmem_cache_t *hugetlbfs_inode_cachep;
 
 static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
 {
+	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
 	struct hugetlbfs_inode_info *p;
 
+	if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
+		return NULL;
 	p = kmem_cache_alloc(hugetlbfs_inode_cachep, SLAB_KERNEL);
-	if (!p)
+	if (unlikely(!p)) {
+		hugetlbfs_inc_free_inodes(sbinfo);
 		return NULL;
+	}
 	return &p->vfs_inode;
 }
 
-static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
-{
-	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
-
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(&ei->vfs_inode);
-}
-
 static void hugetlbfs_destroy_inode(struct inode *inode)
 {
+	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
 	mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
 	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
 }
@@ -565,6 +588,16 @@ static struct address_space_operations hugetlbfs_aops = {
 	.set_page_dirty	= hugetlbfs_set_page_dirty,
 };
 
+
+static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+{
+	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
+
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR)
+		inode_init_once(&ei->vfs_inode);
+}
+
 struct file_operations hugetlbfs_file_operations = {
 	.mmap			= hugetlbfs_file_mmap,
 	.fsync			= simple_sync_file,
@@ -592,6 +625,7 @@ static struct super_operations hugetlbfs_ops = {
 	.alloc_inode    = hugetlbfs_alloc_inode,
 	.destroy_inode  = hugetlbfs_destroy_inode,
 	.statfs		= hugetlbfs_statfs,
+	.delete_inode	= hugetlbfs_delete_inode,
 	.drop_inode	= hugetlbfs_drop_inode,
 	.put_super	= hugetlbfs_put_super,
 };
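
The comment added above describes huge_pages_needed() as: the size of the VMA in huge pages, minus the huge pages already present in the page cache for that range, returned in bytes so it can be handed to is_hugepage_mem_enough(). The sketch below is a simplified user-space model of that description, not the kernel's pagevec walk; the shift value, the cached[] array and the assumption that the mapping starts at file offset 0 are all made up for illustration.

/* Sketch: simplified model of the huge_pages_needed() idea -- (VMA size in
 * huge pages) minus (huge pages already cached), returned in bytes.
 * HPAGE_SHIFT_DEMO and cached[] are invented; mapping assumed at offset 0. */
#include <stdio.h>

#define HPAGE_SHIFT_DEMO 21			/* e.g. 2 MB huge pages */
#define HPAGE_SIZE_DEMO  (1UL << HPAGE_SHIFT_DEMO)

static unsigned long pages_needed(unsigned long start, unsigned long end,
				  const unsigned long *cached, int ncached)
{
	unsigned long hugepages = (end - start) >> HPAGE_SHIFT_DEMO;
	int i;

	for (i = 0; i < ncached; i++) {
		unsigned long off = cached[i];	/* huge-page index in the file */

		if (off >= (start >> HPAGE_SHIFT_DEMO) &&
		    off <  (end   >> HPAGE_SHIFT_DEMO))
			hugepages--;		/* already in the page cache */
	}
	return hugepages << HPAGE_SHIFT_DEMO;	/* bytes, like the kernel helper */
}

int main(void)
{
	/* A 4-hugepage mapping with one huge page already cached at index 1. */
	unsigned long cached[] = { 1 };
	unsigned long need = pages_needed(0, 4 * HPAGE_SIZE_DEMO, cached, 1);

	printf("still need %lu bytes (%lu huge pages)\n",
	       need, need >> HPAGE_SHIFT_DEMO);
	return 0;
}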

+ 6 - 6
fs/jfs/jfs_metapage.c

@@ -86,7 +86,7 @@ struct meta_anchor {
 	atomic_t io_count;
 	struct metapage *mp[MPS_PER_PAGE];
 };
-#define mp_anchor(page) ((struct meta_anchor *)page->private)
+#define mp_anchor(page) ((struct meta_anchor *)page_private(page))
 
 static inline struct metapage *page_to_mp(struct page *page, uint offset)
 {
@@ -108,7 +108,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp)
 		if (!a)
 			return -ENOMEM;
 		memset(a, 0, sizeof(struct meta_anchor));
-		page->private = (unsigned long)a;
+		set_page_private(page, (unsigned long)a);
 		SetPagePrivate(page);
 		kmap(page);
 	}
@@ -136,7 +136,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp)
 	a->mp[index] = NULL;
 	if (--a->mp_count == 0) {
 		kfree(a);
-		page->private = 0;
+		set_page_private(page, 0);
 		ClearPagePrivate(page);
 		kunmap(page);
 	}
@@ -156,13 +156,13 @@ static inline void dec_io(struct page *page, void (*handler) (struct page *))
 #else
 static inline struct metapage *page_to_mp(struct page *page, uint offset)
 {
-	return PagePrivate(page) ? (struct metapage *)page->private : NULL;
+	return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL;
 }
 
 static inline int insert_metapage(struct page *page, struct metapage *mp)
 {
 	if (mp) {
-		page->private = (unsigned long)mp;
+		set_page_private(page, (unsigned long)mp);
 		SetPagePrivate(page);
 		kmap(page);
 	}
@@ -171,7 +171,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp)
 
 static inline void remove_metapage(struct page *page, struct metapage *mp)
 {
-	page->private = 0;
+	set_page_private(page, 0);
 	ClearPagePrivate(page);
 	kunmap(page);
 }
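
The jfs hunks are a mechanical conversion from open-coded page->private accesses to the page_private()/set_page_private() accessors, so the field's representation can later change without touching every caller. A small user-space sketch of the same accessor pattern follows; the struct and type names are stand-ins, not the kernel's definitions.

/* Sketch: the accessor pattern behind page_private()/set_page_private().
 * struct demo_page is a stand-in, not the kernel's struct page. */
#include <stdio.h>

struct demo_page {
	unsigned long private;		/* opaque per-page cookie */
};

#define page_private(page)		((page)->private)
#define set_page_private(page, v)	((page)->private = (v))

struct metapage_demo { int id; };

int main(void)
{
	struct demo_page pg = { 0 };
	struct metapage_demo mp = { 7 };

	/* callers go through the accessors, never touch ->private directly */
	set_page_private(&pg, (unsigned long)&mp);
	printf("stored id %d\n",
	       ((struct metapage_demo *)page_private(&pg))->id);
	set_page_private(&pg, 0);
	return 0;
}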

+ 1 - 1
fs/proc/array.c

@@ -438,7 +438,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
 		jiffies_to_clock_t(it_real_value),
 		start_time,
 		vsize,
-		mm ? get_mm_counter(mm, rss) : 0, /* you might want to shift this left 3 */
+		mm ? get_mm_rss(mm) : 0,
 	        rsslim,
 		mm ? mm->start_code : 0,
 		mm ? mm->end_code : 0,
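
get_mm_rss() here replaces the read of the old single rss counter; as the task_statm() hunk in fs/proc/task_mmu.c further down shows, rss is now split into file_rss and anon_rss and the resident total is their sum. A hedged sketch of what such a helper amounts to, with an invented struct layout rather than the real mm_struct:

/* Sketch: rss split into file-backed and anonymous counters, with the total
 * recomputed on demand.  Field names mirror the idea, not mm_struct. */
#include <stdio.h>

struct mm_demo {
	unsigned long file_rss;		/* pages backed by files */
	unsigned long anon_rss;		/* anonymous pages */
};

static unsigned long get_mm_rss_demo(const struct mm_demo *mm)
{
	return mm->file_rss + mm->anon_rss;
}

int main(void)
{
	struct mm_demo mm = { .file_rss = 120, .anon_rss = 80 };

	printf("rss = %lu pages\n", get_mm_rss_demo(&mm));
	return 0;
}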

+ 31 - 20
fs/proc/task_mmu.c

@@ -14,22 +14,41 @@
 char *task_mem(struct mm_struct *mm, char *buffer)
 {
 	unsigned long data, text, lib;
+	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
+
+	/*
+	 * Note: to minimize their overhead, mm maintains hiwater_vm and
+	 * hiwater_rss only when about to *lower* total_vm or rss.  Any
+	 * collector of these hiwater stats must therefore get total_vm
+	 * and rss too, which will usually be the higher.  Barriers? not
+	 * worth the effort, such snapshots can always be inconsistent.
+	 */
+	hiwater_vm = total_vm = mm->total_vm;
+	if (hiwater_vm < mm->hiwater_vm)
+		hiwater_vm = mm->hiwater_vm;
+	hiwater_rss = total_rss = get_mm_rss(mm);
+	if (hiwater_rss < mm->hiwater_rss)
+		hiwater_rss = mm->hiwater_rss;
 
 	data = mm->total_vm - mm->shared_vm - mm->stack_vm;
 	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
 	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
 	buffer += sprintf(buffer,
+		"VmPeak:\t%8lu kB\n"
 		"VmSize:\t%8lu kB\n"
 		"VmLck:\t%8lu kB\n"
+		"VmHWM:\t%8lu kB\n"
 		"VmRSS:\t%8lu kB\n"
 		"VmData:\t%8lu kB\n"
 		"VmStk:\t%8lu kB\n"
 		"VmExe:\t%8lu kB\n"
 		"VmLib:\t%8lu kB\n"
 		"VmPTE:\t%8lu kB\n",
-		(mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
+		hiwater_vm << (PAGE_SHIFT-10),
+		(total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
 		mm->locked_vm << (PAGE_SHIFT-10),
-		get_mm_counter(mm, rss) << (PAGE_SHIFT-10),
+		hiwater_rss << (PAGE_SHIFT-10),
+		total_rss << (PAGE_SHIFT-10),
 		data << (PAGE_SHIFT-10),
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
 		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
@@ -44,13 +63,11 @@ unsigned long task_vsize(struct mm_struct *mm)
 int task_statm(struct mm_struct *mm, int *shared, int *text,
 	       int *data, int *resident)
 {
-	int rss = get_mm_counter(mm, rss);
-
-	*shared = rss - get_mm_counter(mm, anon_rss);
+	*shared = get_mm_counter(mm, file_rss);
 	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
 								>> PAGE_SHIFT;
 	*data = mm->total_vm - mm->shared_vm;
-	*resident = rss;
+	*resident = *shared + get_mm_counter(mm, anon_rss);
 	return mm->total_vm;
 }
 
@@ -186,13 +203,14 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				struct mem_size_stats *mss)
 {
 	pte_t *pte, ptent;
+	spinlock_t *ptl;
 	unsigned long pfn;
 	struct page *page;
 
-	pte = pte_offset_map(pmd, addr);
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
 		ptent = *pte;
-		if (pte_none(ptent) || !pte_present(ptent))
+		if (!pte_present(ptent))
 			continue;
 
 		mss->resident += PAGE_SIZE;
@@ -213,8 +231,8 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				mss->private_clean += PAGE_SIZE;
 		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap(pte - 1);
-	cond_resched_lock(&vma->vm_mm->page_table_lock);
+	pte_unmap_unlock(pte - 1, ptl);
+	cond_resched();
 }
 
 static inline void smaps_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -268,17 +286,11 @@ static inline void smaps_pgd_range(struct vm_area_struct *vma,
 static int show_smap(struct seq_file *m, void *v)
 {
 	struct vm_area_struct *vma = v;
-	struct mm_struct *mm = vma->vm_mm;
 	struct mem_size_stats mss;
 
 	memset(&mss, 0, sizeof mss);
-
-	if (mm) {
-		spin_lock(&mm->page_table_lock);
+	if (vma->vm_mm)
 		smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss);
-		spin_unlock(&mm->page_table_lock);
-	}
-
 	return show_map_internal(m, v, &mss);
 }
 
@@ -407,7 +419,6 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
 	for_each_node(i)
 		md->node[i] =0;
 
-	spin_lock(&mm->page_table_lock);
  	for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
 		page = follow_page(mm, vaddr, 0);
 		if (page) {
@@ -422,8 +433,8 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
 				md->anon++;
 			md->node[page_to_nid(page)]++;
 		}
+		cond_resched();
 	}
-	spin_unlock(&mm->page_table_lock);
 	return md;
 }
 
@@ -469,7 +480,7 @@ static int show_numa_map(struct seq_file *m, void *v)
 		seq_printf(m, " interleave={");
 		first = 1;
 		for_each_node(n) {
-			if (test_bit(n, pol->v.nodes)) {
+			if (node_isset(n, pol->v.nodes)) {
 				if (!first)
 					seq_putc(m,',');
 				else
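
The task_mem() comment above explains the hiwater scheme: mm->hiwater_vm and mm->hiwater_rss are refreshed only when total_vm or rss is about to drop, so any reader must take the maximum of the stored high-water mark and the current value. A user-space model of that "update only when shrinking" accounting follows; the struct and helper names are illustrative.

/* Sketch: lazy high-water-mark accounting and the reader-side max() that
 * the task_mem() comment calls for.  Names are invented for the model. */
#include <stdio.h>

struct mm_demo {
	unsigned long total_vm;		/* current size, in pages */
	unsigned long hiwater_vm;	/* only refreshed before shrinking */
};

static void demo_shrink(struct mm_demo *mm, unsigned long pages)
{
	if (mm->total_vm > mm->hiwater_vm)	/* fold in the peak lazily */
		mm->hiwater_vm = mm->total_vm;
	mm->total_vm -= pages;
}

static unsigned long demo_peak(const struct mm_demo *mm)
{
	/* readers take the max, as the task_mem() hunk does */
	return mm->hiwater_vm > mm->total_vm ? mm->hiwater_vm : mm->total_vm;
}

int main(void)
{
	struct mm_demo mm = { .total_vm = 100, .hiwater_vm = 0 };

	mm.total_vm += 50;			/* grow: hiwater not touched */
	printf("peak while grown: %lu\n", demo_peak(&mm));	/* 150 */
	demo_shrink(&mm, 120);
	printf("peak after shrink: %lu\n", demo_peak(&mm));	/* 150 */
	return 0;
}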

+ 4 - 3
fs/xfs/linux-2.6/xfs_buf.c

@@ -181,8 +181,9 @@ set_page_region(
 	size_t		offset,
 	size_t		length)
 {
-	page->private |= page_region_mask(offset, length);
-	if (page->private == ~0UL)
+	set_page_private(page,
+		page_private(page) | page_region_mask(offset, length));
+	if (page_private(page) == ~0UL)
 		SetPageUptodate(page);
 }
 
@@ -194,7 +195,7 @@ test_page_region(
 {
 	unsigned long	mask = page_region_mask(offset, length);
 
-	return (mask && (page->private & mask) == mask);
+	return (mask && (page_private(page) & mask) == mask);
 }
 
 /*
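
set_page_region()/test_page_region() keep a per-page bitmask in page_private(): each bit covers one chunk of the page, and once every bit is set the whole page can be marked uptodate. In the kernel hunk the mask spans the full unsigned long, hence the comparison against ~0UL; page_region_mask() itself is not shown, so the 512-byte chunk size and helper names in the sketch below are assumptions, and the model uses only eight bits for brevity.

/* Sketch: a per-page region bitmask like the xfs_buf.c hunk uses, one bit
 * per 512-byte chunk of a 4 KB page.  Chunk size and names are assumed. */
#include <stdio.h>

#define PAGE_SIZE_DEMO	4096UL
#define CHUNK_DEMO	512UL
#define NCHUNKS		(PAGE_SIZE_DEMO / CHUNK_DEMO)	/* 8 bits used here */

static unsigned long region_mask(unsigned long offset, unsigned long length)
{
	unsigned long first = offset / CHUNK_DEMO;
	unsigned long last  = (offset + length - 1) / CHUNK_DEMO;
	unsigned long mask = 0, i;

	for (i = first; i <= last; i++)
		mask |= 1UL << i;
	return mask;
}

int main(void)
{
	unsigned long private = 0;

	private |= region_mask(0, 2048);	/* first half valid */
	private |= region_mask(2048, 2048);	/* second half valid */

	/* all NCHUNKS bits set: the whole page is now covered */
	if (private == ((1UL << NCHUNKS) - 1))
		printf("page fully valid, would SetPageUptodate()\n");
	return 0;
}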

+ 2 - 0
include/asm-alpha/barrier.h

@@ -1,6 +1,8 @@
 #ifndef __BARRIER_H
 #define __BARRIER_H
 
+#include <asm/compiler.h>
+
 #define mb() \
 __asm__ __volatile__("mb": : :"memory")
 

+ 5 - 0
include/asm-alpha/rwsem.h

@@ -262,5 +262,10 @@ static inline long rwsem_atomic_update(long val, struct rw_semaphore *sem)
 #endif
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+	return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _ALPHA_RWSEM_H */

+ 3 - 20
include/asm-arm/tlb.h

@@ -27,11 +27,7 @@
  */
 struct mmu_gather {
 	struct mm_struct	*mm;
-	unsigned int		freed;
 	unsigned int		fullmm;
-
-	unsigned int		flushes;
-	unsigned int		avoided_flushes;
 };
 
 DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -39,11 +35,9 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
 static inline struct mmu_gather *
 tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-	int cpu = smp_processor_id();
-	struct mmu_gather *tlb = &per_cpu(mmu_gathers, cpu);
+	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
 
 	tlb->mm = mm;
-	tlb->freed = 0;
 	tlb->fullmm = full_mm_flush;
 
 	return tlb;
@@ -52,24 +46,13 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 static inline void
 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
-	struct mm_struct *mm = tlb->mm;
-	unsigned long freed = tlb->freed;
-	int rss = get_mm_counter(mm, rss);
-
-	if (rss < freed)
-		freed = rss;
-	add_mm_counter(mm, rss, -freed);
-
 	if (tlb->fullmm)
-		flush_tlb_mm(mm);
+		flush_tlb_mm(tlb->mm);
 
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
-}
 
-static inline unsigned int tlb_is_full_mm(struct mmu_gather *tlb)
-{
-	return tlb->fullmm;
+	put_cpu_var(mmu_gathers);
 }
 
 #define tlb_remove_tlb_entry(tlb,ptep,address)	do { } while (0)
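
tlb_gather_mmu() now takes its per-CPU mmu_gather with get_cpu_var(), which also disables preemption, and tlb_finish_mmu() releases it with put_cpu_var(); the old smp_processor_id() indexing offered no protection against being migrated while the gather was in use. The sketch below is only a user-space model of that bracketed access: the "preemption" counter and the per-CPU array are simulations, not kernel calls.

/* Sketch: the get_cpu_var()/put_cpu_var() bracket modelled in user space.
 * "Preemption" is just a counter; the per-CPU data is a plain array. */
#include <assert.h>
#include <stdio.h>

#define NR_CPUS_DEMO 4

struct gather_demo { int in_use; };

static struct gather_demo per_cpu_gather[NR_CPUS_DEMO];
static int preempt_count_demo;
static int current_cpu_demo;		/* stands in for smp_processor_id() */

static struct gather_demo *get_cpu_gather(void)
{
	preempt_count_demo++;		/* like preempt_disable() */
	return &per_cpu_gather[current_cpu_demo];
}

static void put_cpu_gather(void)
{
	assert(preempt_count_demo > 0);
	preempt_count_demo--;		/* like preempt_enable() */
}

int main(void)
{
	struct gather_demo *tlb = get_cpu_gather();

	tlb->in_use = 1;		/* safe: no "migration" while held */
	tlb->in_use = 0;
	put_cpu_gather();
	printf("preempt count back to %d\n", preempt_count_demo);
	return 0;
}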

+ 16 - 31
include/asm-arm26/tlb.h

@@ -10,24 +10,20 @@
  */
 struct mmu_gather {
         struct mm_struct        *mm;
-        unsigned int            freed;
-	unsigned int            fullmm;
-
-        unsigned int            flushes;
-        unsigned int            avoided_flushes;
+        unsigned int            need_flush;
+        unsigned int            fullmm;
 };
 
-extern struct mmu_gather mmu_gathers[NR_CPUS];
+DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
 
 static inline struct mmu_gather *
 tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-        int cpu = smp_processor_id();
-        struct mmu_gather *tlb = &mmu_gathers[cpu];
+        struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
 
         tlb->mm = mm;
-        tlb->freed = 0;
-	tlb->fullmm = full_mm_flush;
+        tlb->need_flush = 0;
+        tlb->fullmm = full_mm_flush;
 
         return tlb;
 }
@@ -35,30 +31,13 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 static inline void
 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
-        struct mm_struct *mm = tlb->mm;
-        unsigned long freed = tlb->freed;
-        int rss = get_mm_counter(mm, rss);
-
-        if (rss < freed)
-                freed = rss;
-        add_mm_counter(mm, rss, -freed);
-
-        if (freed) {
-                flush_tlb_mm(mm);
-                tlb->flushes++;
-        } else {
-                tlb->avoided_flushes++;
-        }
+        if (tlb->need_flush)
+                flush_tlb_mm(tlb->mm);
 
         /* keep the page table cache within bounds */
         check_pgt_cache();
-}
-
 
-static inline unsigned int
-tlb_is_full_mm(struct mmu_gather *tlb)
-{
-     return tlb->fullmm;
+        put_cpu_var(mmu_gathers);
 }
 
 #define tlb_remove_tlb_entry(tlb,ptep,address)  do { } while (0)
@@ -71,7 +50,13 @@ tlb_is_full_mm(struct mmu_gather *tlb)
         } while (0)
 #define tlb_end_vma(tlb,vma)                    do { } while (0)
 
-#define tlb_remove_page(tlb,page)       free_page_and_swap_cache(page)
+static inline void
+tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+{
+        tlb->need_flush = 1;
+        free_page_and_swap_cache(page);
+}
+
 #define pte_free_tlb(tlb,ptep)          pte_free(ptep)
 #define pmd_free_tlb(tlb,pmdp)          pmd_free(pmdp)
 

+ 3 - 8
include/asm-generic/4level-fixup.h

@@ -10,14 +10,9 @@
 
 #define pud_t				pgd_t
 
-#define pmd_alloc(mm, pud, address)			\
-({	pmd_t *ret;					\
-	if (pgd_none(*pud))				\
- 		ret = __pmd_alloc(mm, pud, address);	\
- 	else						\
-		ret = pmd_offset(pud, address);		\
- 	ret;						\
-})
+#define pmd_alloc(mm, pud, address) \
+	((unlikely(pgd_none(*(pud))) && __pmd_alloc(mm, pud, address))? \
+ 		NULL: pmd_offset(pud, address))
 
 #define pud_alloc(mm, pgd, address)	(pgd)
 #define pud_offset(pgd, start)		(pgd)
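
The pmd_alloc() rewrite folds the old statement-expression into a single conditional: only when the pud entry is still empty (pgd_none()) is __pmd_alloc() called, only a failed allocation yields NULL, and in every other case the existing table is simply offset into. The sketch below models that short-circuit shape with an ordinary pointer table; the table, allocator and names are made up for illustration.

/* Sketch: the "(empty && alloc fails) ? NULL : offset" shape used by the
 * new pmd_alloc() macro, modelled with a plain pointer table. */
#include <stdio.h>
#include <stdlib.h>

#define ENTRIES 4

static int *table[ENTRIES];		/* NULL means "not allocated yet" */

static int slot_alloc(int idx)		/* returns non-zero on failure */
{
	table[idx] = calloc(ENTRIES, sizeof(int));
	return table[idx] == NULL;
}

static int *slot_get(int idx, int offset)
{
	/* allocate only if empty; report NULL only if that allocation fails */
	return (table[idx] == NULL && slot_alloc(idx)) ?
		NULL : &table[idx][offset];
}

int main(void)
{
	int *p = slot_get(1, 2);	/* first use: allocates the table */
	int *q = slot_get(1, 3);	/* second use: reuses it */

	if (p && q) {
		*p = 5;
		*q = 6;
		printf("%d %d\n", table[1][2], table[1][3]);
	}
	free(table[1]);
	return 0;
}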

Some files were not shown because too many files were changed in this diff