Browse Source

Merge commit '3ff195b011d7decf501a4d55aeed312731094796' into for-linus

Conflicts:
	drivers/md/md.c

- Resolved conflict in md_update_sb
- Added extra 'NULL' arg to new instance of sysfs_get_dirent.

Signed-off-by: NeilBrown <neilb@suse.de>
NeilBrown 15 years ago
parent
commit
19fdb9eefb
100 changed files with 3006 additions and 1441 deletions
  1. 12 8
      .gitignore
  2. 2 0
      Documentation/00-INDEX
  3. 31 0
      Documentation/ABI/obsolete/sysfs-bus-usb
  4. 29 0
      Documentation/ABI/obsolete/sysfs-class-rfkill
  5. 67 0
      Documentation/ABI/stable/sysfs-class-rfkill
  6. 1 29
      Documentation/ABI/testing/sysfs-bus-usb
  7. 1 1
      Documentation/ABI/testing/sysfs-devices-memory
  8. 9 0
      Documentation/ABI/testing/sysfs-devices-platform-_UDC_-gadget
  9. 1 1
      Documentation/Changes
  10. 758 0
      Documentation/DMA-API-HOWTO.txt
  11. 36 86
      Documentation/DMA-API.txt
  12. 29 36
      Documentation/DocBook/libata.tmpl
  13. 11 0
      Documentation/DocBook/media-entities.tmpl
  14. 3 3
      Documentation/DocBook/mtdnand.tmpl
  15. 7 3
      Documentation/DocBook/sh.tmpl
  16. 13 0
      Documentation/DocBook/tracepoint.tmpl
  17. 1 1
      Documentation/DocBook/v4l/common.xml
  18. 69 57
      Documentation/DocBook/v4l/compat.xml
  19. 34 2
      Documentation/DocBook/v4l/controls.xml
  20. 31 0
      Documentation/DocBook/v4l/dev-event.xml
  21. 14 4
      Documentation/DocBook/v4l/io.xml
  22. 12 0
      Documentation/DocBook/v4l/pixfmt.xml
  23. 3 0
      Documentation/DocBook/v4l/v4l2.xml
  24. 10 0
      Documentation/DocBook/v4l/videodev2.h.xml
  25. 131 0
      Documentation/DocBook/v4l/vidioc-dqevent.xml
  26. 1 1
      Documentation/DocBook/v4l/vidioc-enuminput.xml
  27. 1 1
      Documentation/DocBook/v4l/vidioc-g-parm.xml
  28. 12 2
      Documentation/DocBook/v4l/vidioc-qbuf.xml
  29. 1 1
      Documentation/DocBook/v4l/vidioc-queryctrl.xml
  30. 1 1
      Documentation/DocBook/v4l/vidioc-reqbufs.xml
  31. 133 0
      Documentation/DocBook/v4l/vidioc-subscribe-event.xml
  32. 17 10
      Documentation/DocBook/writing-an-alsa-driver.tmpl
  33. 1 1
      Documentation/DocBook/writing_usb_driver.tmpl
  34. 37 78
      Documentation/HOWTO
  35. 12 0
      Documentation/IPMI.txt
  36. 2 2
      Documentation/Makefile
  37. 0 766
      Documentation/PCI/PCI-DMA-mapping.txt
  38. 2 2
      Documentation/PCI/pci-error-recovery.txt
  39. 22 17
      Documentation/RCU/NMI-RCU.txt
  40. 4 3
      Documentation/RCU/checklist.txt
  41. 26 2
      Documentation/RCU/lockdep.txt
  42. 71 23
      Documentation/RCU/stallwarn.txt
  43. 0 10
      Documentation/RCU/torture.txt
  44. 19 16
      Documentation/RCU/trace.txt
  45. 6 0
      Documentation/RCU/whatisRCU.txt
  46. 1 1
      Documentation/Smack.txt
  47. 6 2
      Documentation/SubmitChecklist
  48. 2 0
      Documentation/arm/00-INDEX
  49. 1 1
      Documentation/arm/SA1100/ADSBitsy
  50. 60 0
      Documentation/arm/SPEAr/overview.txt
  51. 2 2
      Documentation/arm/Samsung-S3C24XX/CPUfreq.txt
  52. 86 0
      Documentation/arm/Samsung/Overview.txt
  53. 167 0
      Documentation/arm/Samsung/clksrc-change-registers.awk
  54. 1 1
      Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen
  55. 1 1
      Documentation/atomic_ops.txt
  56. 1 1
      Documentation/blackfin/bfin-gpio-notes.txt
  57. 2 2
      Documentation/block/biodoc.txt
  58. 3 3
      Documentation/cachetlb.txt
  59. 110 0
      Documentation/cgroups/cgroup_event_listener.c
  60. 39 3
      Documentation/cgroups/cgroups.txt
  61. 82 79
      Documentation/cgroups/cpusets.txt
  62. 44 5
      Documentation/cgroups/memcg_test.txt
  63. 79 5
      Documentation/cgroups/memory.txt
  64. 234 0
      Documentation/circular-buffers.txt
  65. 1 0
      Documentation/connector/cn_test.c
  66. 1 1
      Documentation/connector/connector.txt
  67. 1 1
      Documentation/console/console.txt
  68. 5 9
      Documentation/credentials.txt
  69. 1 1
      Documentation/driver-model/platform.txt
  70. 1 1
      Documentation/dvb/ci.txt
  71. 1 1
      Documentation/dvb/contributors.txt
  72. 1 1
      Documentation/eisa.txt
  73. 9 21
      Documentation/email-clients.txt
  74. 31 0
      Documentation/fb/efifb.txt
  75. 0 31
      Documentation/fb/imacfb.txt
  76. 91 27
      Documentation/feature-removal-schedule.txt
  77. 4 0
      Documentation/filesystems/00-INDEX
  78. 16 2
      Documentation/filesystems/9p.txt
  79. 2 2
      Documentation/filesystems/Locking
  80. 8 0
      Documentation/filesystems/Makefile
  81. 1 1
      Documentation/filesystems/autofs4-mount-control.txt
  82. 140 0
      Documentation/filesystems/ceph.txt
  83. 1 1
      Documentation/filesystems/dlmfs.txt
  84. 5 34
      Documentation/filesystems/dnotify.txt
  85. 34 0
      Documentation/filesystems/dnotify_test.c
  86. 6 6
      Documentation/filesystems/fiemap.txt
  87. 2 2
      Documentation/filesystems/fuse.txt
  88. 6 6
      Documentation/filesystems/gfs2.txt
  89. 1 1
      Documentation/filesystems/hpfs.txt
  90. 4 4
      Documentation/filesystems/logfs.txt
  91. 1 1
      Documentation/filesystems/nfs/nfs41-server.txt
  92. 1 1
      Documentation/filesystems/nfs/rpc-cache.txt
  93. 2 2
      Documentation/filesystems/nilfs2.txt
  94. 7 0
      Documentation/filesystems/ocfs2.txt
  95. 9 6
      Documentation/filesystems/proc.txt
  96. 1 1
      Documentation/filesystems/smbfs.txt
  97. 5 1
      Documentation/filesystems/tmpfs.txt
  98. 1 1
      Documentation/filesystems/vfs.txt
  99. 1 1
      Documentation/hwmon/abituguru
  100. 1 1
      Documentation/hwmon/lm85

+ 12 - 8
.gitignore

@@ -34,14 +34,18 @@ modules.builtin
 #
 # Top-level generic files
 #
-tags
-TAGS
-linux
-vmlinux
-vmlinuz
-System.map
-Module.markers
-Module.symvers
+/tags
+/TAGS
+/linux
+/vmlinux
+/vmlinuz
+/System.map
+/Module.markers
+/Module.symvers
+
+#
+# git files that we don't want to ignore even it they are dot-files
+#
 !.gitignore
 !.mailmap
 

+ 2 - 0
Documentation/00-INDEX

@@ -250,6 +250,8 @@ numastat.txt
 	- info on how to read Numa policy hit/miss statistics in sysfs.
 oops-tracing.txt
 	- how to decode those nasty internal kernel error dump messages.
+padata.txt
+	- An introduction to the "padata" parallel execution API
 parisc/
 	- directory with info on using Linux on PA-RISC architecture.
 parport.txt

+ 31 - 0
Documentation/ABI/obsolete/sysfs-bus-usb

@@ -0,0 +1,31 @@
+What:		/sys/bus/usb/devices/.../power/level
+Date:		March 2007
+KernelVersion:	2.6.21
+Contact:	Alan Stern <stern@rowland.harvard.edu>
+Description:
+		Each USB device directory will contain a file named
+		power/level.  This file holds a power-level setting for
+		the device, either "on" or "auto".
+
+		"on" means that the device is not allowed to autosuspend,
+		although normal suspends for system sleep will still
+		be honored.  "auto" means the device will autosuspend
+		and autoresume in the usual manner, according to the
+		capabilities of its driver.
+
+		During normal use, devices should be left in the "auto"
+		level.  The "on" level is meant for administrative uses.
+		If you want to suspend a device immediately but leave it
+		free to wake up in response to I/O requests, you should
+		write "0" to power/autosuspend.
+
+		Device not capable of proper suspend and resume should be
+		left in the "on" level.  Although the USB spec requires
+		devices to support suspend/resume, many of them do not.
+		In fact so many don't that by default, the USB core
+		initializes all non-hub devices in the "on" level.  Some
+		drivers may change this setting when they are bound.
+
+		This file is deprecated and will be removed after 2010.
+		Use the power/control file instead; it does exactly the
+		same thing.

+ 29 - 0
Documentation/ABI/obsolete/sysfs-class-rfkill

@@ -0,0 +1,29 @@
+rfkill - radio frequency (RF) connector kill switch support
+
+For details to this subsystem look at Documentation/rfkill.txt.
+
+What:		/sys/class/rfkill/rfkill[0-9]+/state
+Date:		09-Jul-2007
+KernelVersion	v2.6.22
+Contact:	linux-wireless@vger.kernel.org
+Description: 	Current state of the transmitter.
+		This file is deprecated and sheduled to be removed in 2014,
+		because its not possible to express the 'soft and hard block'
+		state of the rfkill driver.
+Values: 	A numeric value.
+		0: RFKILL_STATE_SOFT_BLOCKED
+			transmitter is turned off by software
+		1: RFKILL_STATE_UNBLOCKED
+			transmitter is (potentially) active
+		2: RFKILL_STATE_HARD_BLOCKED
+			transmitter is forced off by something outside of
+			the driver's control.
+
+What:		/sys/class/rfkill/rfkill[0-9]+/claim
+Date:		09-Jul-2007
+KernelVersion	v2.6.22
+Contact:	linux-wireless@vger.kernel.org
+Description:	This file is deprecated because there no longer is a way to
+		claim just control over a single rfkill instance.
+		This file is scheduled to be removed in 2012.
+Values: 	0: Kernel handles events

+ 67 - 0
Documentation/ABI/stable/sysfs-class-rfkill

@@ -0,0 +1,67 @@
+rfkill - radio frequency (RF) connector kill switch support
+
+For details to this subsystem look at Documentation/rfkill.txt.
+
+For the deprecated /sys/class/rfkill/*/state and
+/sys/class/rfkill/*/claim knobs of this interface look in
+Documentation/ABI/obsolete/sysfs-class-rfkill.
+
+What: 		/sys/class/rfkill
+Date:		09-Jul-2007
+KernelVersion:	v2.6.22
+Contact:	linux-wireless@vger.kernel.org,
+Description: 	The rfkill class subsystem folder.
+		Each registered rfkill driver is represented by an rfkillX
+		subfolder (X being an integer > 0).
+
+
+What:		/sys/class/rfkill/rfkill[0-9]+/name
+Date:		09-Jul-2007
+KernelVersion	v2.6.22
+Contact:	linux-wireless@vger.kernel.org
+Description: 	Name assigned by driver to this key (interface or driver name).
+Values: 	arbitrary string.
+
+
+What: 		/sys/class/rfkill/rfkill[0-9]+/type
+Date:		09-Jul-2007
+KernelVersion	v2.6.22
+Contact:	linux-wireless@vger.kernel.org
+Description: 	Driver type string ("wlan", "bluetooth", etc).
+Values: 	See include/linux/rfkill.h.
+
+
+What:		/sys/class/rfkill/rfkill[0-9]+/persistent
+Date:		09-Jul-2007
+KernelVersion	v2.6.22
+Contact:	linux-wireless@vger.kernel.org
+Description: 	Whether the soft blocked state is initialised from non-volatile
+		storage at startup.
+Values: 	A numeric value.
+		0: false
+		1: true
+
+
+What:		/sys/class/rfkill/rfkill[0-9]+/hard
+Date:		12-March-2010
+KernelVersion	v2.6.34
+Contact:	linux-wireless@vger.kernel.org
+Description: 	Current hardblock state. This file is read only.
+Values: 	A numeric value.
+		0: inactive
+			The transmitter is (potentially) active.
+		1: active
+			The transmitter is forced off by something outside of
+			the driver's control.
+
+
+What:		/sys/class/rfkill/rfkill[0-9]+/soft
+Date:		12-March-2010
+KernelVersion	v2.6.34
+Contact:	linux-wireless@vger.kernel.org
+Description:	Current softblock state. This file is read and write.
+Values: 	A numeric value.
+		0: inactive
+			The transmitter is (potentially) active.
+		1: active
+			The transmitter is turned off by software.

+ 1 - 29
Documentation/ABI/testing/sysfs-bus-usb

@@ -14,34 +14,6 @@ Description:
 		The autosuspend delay for newly-created devices is set to
 		the value of the usbcore.autosuspend module parameter.
 
-What:		/sys/bus/usb/devices/.../power/level
-Date:		March 2007
-KernelVersion:	2.6.21
-Contact:	Alan Stern <stern@rowland.harvard.edu>
-Description:
-		Each USB device directory will contain a file named
-		power/level.  This file holds a power-level setting for
-		the device, either "on" or "auto".
-
-		"on" means that the device is not allowed to autosuspend,
-		although normal suspends for system sleep will still
-		be honored.  "auto" means the device will autosuspend
-		and autoresume in the usual manner, according to the
-		capabilities of its driver.
-
-		During normal use, devices should be left in the "auto"
-		level.  The "on" level is meant for administrative uses.
-		If you want to suspend a device immediately but leave it
-		free to wake up in response to I/O requests, you should
-		write "0" to power/autosuspend.
-
-		Device not capable of proper suspend and resume should be
-		left in the "on" level.  Although the USB spec requires
-		devices to support suspend/resume, many of them do not.
-		In fact so many don't that by default, the USB core
-		initializes all non-hub devices in the "on" level.  Some
-		drivers may change this setting when they are bound.
-
 What:		/sys/bus/usb/devices/.../power/persist
 Date:		May 2007
 KernelVersion:	2.6.23
@@ -160,7 +132,7 @@ Description:
 		match the driver to the device.  For example:
 		# echo "046d c315" > /sys/bus/usb/drivers/foo/remove_id
 
-What:		/sys/bus/usb/device/.../avoid_reset
+What:		/sys/bus/usb/device/.../avoid_reset_quirk
 Date:		December 2009
 Contact:	Oliver Neukum <oliver@neukum.org>
 Description:

+ 1 - 1
Documentation/ABI/testing/sysfs-devices-memory

@@ -43,7 +43,7 @@ Date:		September 2008
 Contact:	Badari Pulavarty <pbadari@us.ibm.com>
 Description:
 		The file /sys/devices/system/memory/memoryX/state
-		is read-write.  When read, it's contents show the
+		is read-write.  When read, its contents show the
 		online/offline state of the memory section.  When written,
 		root can toggle the the online/offline state of a removable
 		memory section (see removable file description above)

+ 9 - 0
Documentation/ABI/testing/sysfs-devices-platform-_UDC_-gadget

@@ -0,0 +1,9 @@
+What:		/sys/devices/platform/_UDC_/gadget/suspended
+Date:		April 2010
+Contact:	Fabien Chouteau <fabien.chouteau@barco.com>
+Description:
+		Show the suspend state of an USB composite gadget.
+		1 -> suspended
+		0 -> resumed
+
+		(_UDC_ is the name of the USB Device Controller driver)

+ 1 - 1
Documentation/Changes

@@ -49,7 +49,7 @@ o  oprofile               0.9                     # oprofiled --version
 o  udev                   081                     # udevinfo -V
 o  grub                   0.93                    # grub --version
 o  mcelog		  0.6
-o  iptables               1.4.1                   # iptables -V
+o  iptables               1.4.2                   # iptables -V
 
 
 Kernel compilation

+ 758 - 0
Documentation/DMA-API-HOWTO.txt

@@ -0,0 +1,758 @@
+		     Dynamic DMA mapping Guide
+		     =========================
+
+		 David S. Miller <davem@redhat.com>
+		 Richard Henderson <rth@cygnus.com>
+		  Jakub Jelinek <jakub@redhat.com>
+
+This is a guide to device driver writers on how to use the DMA API
+with example pseudo-code.  For a concise description of the API, see
+DMA-API.txt.
+
+Most of the 64bit platforms have special hardware that translates bus
+addresses (DMA addresses) into physical addresses.  This is similar to
+how page tables and/or a TLB translates virtual addresses to physical
+addresses on a CPU.  This is needed so that e.g. PCI devices can
+access with a Single Address Cycle (32bit DMA address) any page in the
+64bit physical address space.  Previously in Linux those 64bit
+platforms had to set artificial limits on the maximum RAM size in the
+system, so that the virt_to_bus() static scheme works (the DMA address
+translation tables were simply filled on bootup to map each bus
+address to the physical page __pa(bus_to_virt())).
+
+So that Linux can use the dynamic DMA mapping, it needs some help from the
+drivers, namely it has to take into account that DMA addresses should be
+mapped only for the time they are actually used and unmapped after the DMA
+transfer.
+
+The following API will work of course even on platforms where no such
+hardware exists.
+
+Note that the DMA API works with any bus independent of the underlying
+microprocessor architecture. You should use the DMA API rather than
+the bus specific DMA API (e.g. pci_dma_*).
+
+First of all, you should make sure
+
+#include <linux/dma-mapping.h>
+
+is in your driver. This file will obtain for you the definition of the
+dma_addr_t (which can hold any valid DMA address for the platform)
+type which should be used everywhere you hold a DMA (bus) address
+returned from the DMA mapping functions.
+
+			 What memory is DMA'able?
+
+The first piece of information you must know is what kernel memory can
+be used with the DMA mapping facilities.  There has been an unwritten
+set of rules regarding this, and this text is an attempt to finally
+write them down.
+
+If you acquired your memory via the page allocator
+(i.e. __get_free_page*()) or the generic memory allocators
+(i.e. kmalloc() or kmem_cache_alloc()) then you may DMA to/from
+that memory using the addresses returned from those routines.
+
+This means specifically that you may _not_ use the memory/addresses
+returned from vmalloc() for DMA.  It is possible to DMA to the
+_underlying_ memory mapped into a vmalloc() area, but this requires
+walking page tables to get the physical addresses, and then
+translating each of those pages back to a kernel address using
+something like __va().  [ EDIT: Update this when we integrate
+Gerd Knorr's generic code which does this. ]
+
+This rule also means that you may use neither kernel image addresses
+(items in data/text/bss segments), nor module image addresses, nor
+stack addresses for DMA.  These could all be mapped somewhere entirely
+different than the rest of physical memory.  Even if those classes of
+memory could physically work with DMA, you'd need to ensure the I/O
+buffers were cacheline-aligned.  Without that, you'd see cacheline
+sharing problems (data corruption) on CPUs with DMA-incoherent caches.
+(The CPU could write to one word, DMA would write to a different one
+in the same cache line, and one of them could be overwritten.)
+
+Also, this means that you cannot take the return of a kmap()
+call and DMA to/from that.  This is similar to vmalloc().
+
+What about block I/O and networking buffers?  The block I/O and
+networking subsystems make sure that the buffers they use are valid
+for you to DMA from/to.
+
+			DMA addressing limitations
+
+Does your device have any DMA addressing limitations?  For example, is
+your device only capable of driving the low order 24-bits of address?
+If so, you need to inform the kernel of this fact.
+
+By default, the kernel assumes that your device can address the full
+32-bits.  For a 64-bit capable device, this needs to be increased.
+And for a device with limitations, as discussed in the previous
+paragraph, it needs to be decreased.
+
+Special note about PCI: PCI-X specification requires PCI-X devices to
+support 64-bit addressing (DAC) for all transactions.  And at least
+one platform (SGI SN2) requires 64-bit consistent allocations to
+operate correctly when the IO bus is in PCI-X mode.
+
+For correct operation, you must interrogate the kernel in your device
+probe routine to see if the DMA controller on the machine can properly
+support the DMA addressing limitation your device has.  It is good
+style to do this even if your device holds the default setting,
+because this shows that you did think about these issues wrt. your
+device.
+
+The query is performed via a call to dma_set_mask():
+
+	int dma_set_mask(struct device *dev, u64 mask);
+
+The query for consistent allocations is performed via a call to
+dma_set_coherent_mask():
+
+	int dma_set_coherent_mask(struct device *dev, u64 mask);
+
+Here, dev is a pointer to the device struct of your device, and mask
+is a bit mask describing which bits of an address your device
+supports.  It returns zero if your card can perform DMA properly on
+the machine given the address mask you provided.  In general, the
+device struct of your device is embedded in the bus specific device
+struct of your device.  For example, a pointer to the device struct of
+your PCI device is pdev->dev (pdev is a pointer to the PCI device
+struct of your device).
+
+If it returns non-zero, your device cannot perform DMA properly on
+this platform, and attempting to do so will result in undefined
+behavior.  You must either use a different mask, or not use DMA.
+
+This means that in the failure case, you have three options:
+
+1) Use another DMA mask, if possible (see below).
+2) Use some non-DMA mode for data transfer, if possible.
+3) Ignore this device and do not initialize it.
+
+It is recommended that your driver print a kernel KERN_WARNING message
+when you end up performing either #2 or #3.  In this manner, if a user
+of your driver reports that performance is bad or that the device is not
+even detected, you can ask them for the kernel messages to find out
+exactly why.
+
+The standard 32-bit addressing device would do something like this:
+
+	if (dma_set_mask(dev, DMA_BIT_MASK(32))) {
+		printk(KERN_WARNING
+		       "mydev: No suitable DMA available.\n");
+		goto ignore_this_device;
+	}
+
+Another common scenario is a 64-bit capable device.  The approach here
+is to try for 64-bit addressing, but back down to a 32-bit mask that
+should not fail.  The kernel may fail the 64-bit mask not because the
+platform is not capable of 64-bit addressing.  Rather, it may fail in
+this case simply because 32-bit addressing is done more efficiently
+than 64-bit addressing.  For example, Sparc64 PCI SAC addressing is
+more efficient than DAC addressing.
+
+Here is how you would handle a 64-bit capable device which can drive
+all 64-bits when accessing streaming DMA:
+
+	int using_dac;
+
+	if (!dma_set_mask(dev, DMA_BIT_MASK(64))) {
+		using_dac = 1;
+	} else if (!dma_set_mask(dev, DMA_BIT_MASK(32))) {
+		using_dac = 0;
+	} else {
+		printk(KERN_WARNING
+		       "mydev: No suitable DMA available.\n");
+		goto ignore_this_device;
+	}
+
+If a card is capable of using 64-bit consistent allocations as well,
+the case would look like this:
+
+	int using_dac, consistent_using_dac;
+
+	if (!dma_set_mask(dev, DMA_BIT_MASK(64))) {
+		using_dac = 1;
+	   	consistent_using_dac = 1;
+		dma_set_coherent_mask(dev, DMA_BIT_MASK(64));
+	} else if (!dma_set_mask(dev, DMA_BIT_MASK(32))) {
+		using_dac = 0;
+		consistent_using_dac = 0;
+		dma_set_coherent_mask(dev, DMA_BIT_MASK(32));
+	} else {
+		printk(KERN_WARNING
+		       "mydev: No suitable DMA available.\n");
+		goto ignore_this_device;
+	}
+
+dma_set_coherent_mask() will always be able to set the same or a
+smaller mask as dma_set_mask(). However for the rare case that a
+device driver only uses consistent allocations, one would have to
+check the return value from dma_set_coherent_mask().
+
+Finally, if your device can only drive the low 24-bits of
+address you might do something like:
+
+	if (dma_set_mask(dev, DMA_BIT_MASK(24))) {
+		printk(KERN_WARNING
+		       "mydev: 24-bit DMA addressing not available.\n");
+		goto ignore_this_device;
+	}
+
+When dma_set_mask() is successful, and returns zero, the kernel saves
+away this mask you have provided.  The kernel will use this
+information later when you make DMA mappings.
+
+There is a case which we are aware of at this time, which is worth
+mentioning in this documentation.  If your device supports multiple
+functions (for example a sound card provides playback and record
+functions) and the various different functions have _different_
+DMA addressing limitations, you may wish to probe each mask and
+only provide the functionality which the machine can handle.  It
+is important that the last call to dma_set_mask() be for the
+most specific mask.
+
+Here is pseudo-code showing how this might be done:
+
+	#define PLAYBACK_ADDRESS_BITS	DMA_BIT_MASK(32)
+	#define RECORD_ADDRESS_BITS	DMA_BIT_MASK(24)
+
+	struct my_sound_card *card;
+	struct device *dev;
+
+	...
+	if (!dma_set_mask(dev, PLAYBACK_ADDRESS_BITS)) {
+		card->playback_enabled = 1;
+	} else {
+		card->playback_enabled = 0;
+		printk(KERN_WARNING "%s: Playback disabled due to DMA limitations.\n",
+		       card->name);
+	}
+	if (!dma_set_mask(dev, RECORD_ADDRESS_BITS)) {
+		card->record_enabled = 1;
+	} else {
+		card->record_enabled = 0;
+		printk(KERN_WARNING "%s: Record disabled due to DMA limitations.\n",
+		       card->name);
+	}
+
+A sound card was used as an example here because this genre of PCI
+devices seems to be littered with ISA chips given a PCI front end,
+and thus retaining the 16MB DMA addressing limitations of ISA.
+
+			Types of DMA mappings
+
+There are two types of DMA mappings:
+
+- Consistent DMA mappings which are usually mapped at driver
+  initialization, unmapped at the end and for which the hardware should
+  guarantee that the device and the CPU can access the data
+  in parallel and will see updates made by each other without any
+  explicit software flushing.
+
+  Think of "consistent" as "synchronous" or "coherent".
+
+  The current default is to return consistent memory in the low 32
+  bits of the bus space.  However, for future compatibility you should
+  set the consistent mask even if this default is fine for your
+  driver.
+
+  Good examples of what to use consistent mappings for are:
+
+	- Network card DMA ring descriptors.
+	- SCSI adapter mailbox command data structures.
+	- Device firmware microcode executed out of
+	  main memory.
+
+  The invariant these examples all require is that any CPU store
+  to memory is immediately visible to the device, and vice
+  versa.  Consistent mappings guarantee this.
+
+  IMPORTANT: Consistent DMA memory does not preclude the usage of
+             proper memory barriers.  The CPU may reorder stores to
+	     consistent memory just as it may normal memory.  Example:
+	     if it is important for the device to see the first word
+	     of a descriptor updated before the second, you must do
+	     something like:
+
+		desc->word0 = address;
+		wmb();
+		desc->word1 = DESC_VALID;
+
+             in order to get correct behavior on all platforms.
+
+	     Also, on some platforms your driver may need to flush CPU write
+	     buffers in much the same way as it needs to flush write buffers
+	     found in PCI bridges (such as by reading a register's value
+	     after writing it).
+
+- Streaming DMA mappings which are usually mapped for one DMA
+  transfer, unmapped right after it (unless you use dma_sync_* below)
+  and for which hardware can optimize for sequential accesses.
+
+  This of "streaming" as "asynchronous" or "outside the coherency
+  domain".
+
+  Good examples of what to use streaming mappings for are:
+
+	- Networking buffers transmitted/received by a device.
+	- Filesystem buffers written/read by a SCSI device.
+
+  The interfaces for using this type of mapping were designed in
+  such a way that an implementation can make whatever performance
+  optimizations the hardware allows.  To this end, when using
+  such mappings you must be explicit about what you want to happen.
+
+Neither type of DMA mapping has alignment restrictions that come from
+the underlying bus, although some devices may have such restrictions.
+Also, systems with caches that aren't DMA-coherent will work better
+when the underlying buffers don't share cache lines with other data.
+
+
+		 Using Consistent DMA mappings.
+
+To allocate and map large (PAGE_SIZE or so) consistent DMA regions,
+you should do:
+
+	dma_addr_t dma_handle;
+
+	cpu_addr = dma_alloc_coherent(dev, size, &dma_handle, gfp);
+
+where device is a struct device *. This may be called in interrupt
+context with the GFP_ATOMIC flag.
+
+Size is the length of the region you want to allocate, in bytes.
+
+This routine will allocate RAM for that region, so it acts similarly to
+__get_free_pages (but takes size instead of a page order).  If your
+driver needs regions sized smaller than a page, you may prefer using
+the dma_pool interface, described below.
+
+The consistent DMA mapping interfaces, for non-NULL dev, will by
+default return a DMA address which is 32-bit addressable.  Even if the
+device indicates (via DMA mask) that it may address the upper 32-bits,
+consistent allocation will only return > 32-bit addresses for DMA if
+the consistent DMA mask has been explicitly changed via
+dma_set_coherent_mask().  This is true of the dma_pool interface as
+well.
+
+dma_alloc_coherent returns two values: the virtual address which you
+can use to access it from the CPU and dma_handle which you pass to the
+card.
+
+The cpu return address and the DMA bus master address are both
+guaranteed to be aligned to the smallest PAGE_SIZE order which
+is greater than or equal to the requested size.  This invariant
+exists (for example) to guarantee that if you allocate a chunk
+which is smaller than or equal to 64 kilobytes, the extent of the
+buffer you receive will not cross a 64K boundary.
+
+To unmap and free such a DMA region, you call:
+
+	dma_free_coherent(dev, size, cpu_addr, dma_handle);
+
+where dev, size are the same as in the above call and cpu_addr and
+dma_handle are the values dma_alloc_coherent returned to you.
+This function may not be called in interrupt context.
+
+If your driver needs lots of smaller memory regions, you can write
+custom code to subdivide pages returned by dma_alloc_coherent,
+or you can use the dma_pool API to do that.  A dma_pool is like
+a kmem_cache, but it uses dma_alloc_coherent not __get_free_pages.
+Also, it understands common hardware constraints for alignment,
+like queue heads needing to be aligned on N byte boundaries.
+
+Create a dma_pool like this:
+
+	struct dma_pool *pool;
+
+	pool = dma_pool_create(name, dev, size, align, alloc);
+
+The "name" is for diagnostics (like a kmem_cache name); dev and size
+are as above.  The device's hardware alignment requirement for this
+type of data is "align" (which is expressed in bytes, and must be a
+power of two).  If your device has no boundary crossing restrictions,
+pass 0 for alloc; passing 4096 says memory allocated from this pool
+must not cross 4KByte boundaries (but at that time it may be better to
+go for dma_alloc_coherent directly instead).
+
+Allocate memory from a dma pool like this:
+
+	cpu_addr = dma_pool_alloc(pool, flags, &dma_handle);
+
+flags are SLAB_KERNEL if blocking is permitted (not in_interrupt nor
+holding SMP locks), SLAB_ATOMIC otherwise.  Like dma_alloc_coherent,
+this returns two values, cpu_addr and dma_handle.
+
+Free memory that was allocated from a dma_pool like this:
+
+	dma_pool_free(pool, cpu_addr, dma_handle);
+
+where pool is what you passed to dma_pool_alloc, and cpu_addr and
+dma_handle are the values dma_pool_alloc returned. This function
+may be called in interrupt context.
+
+Destroy a dma_pool by calling:
+
+	dma_pool_destroy(pool);
+
+Make sure you've called dma_pool_free for all memory allocated
+from a pool before you destroy the pool. This function may not
+be called in interrupt context.
+
+			DMA Direction
+
+The interfaces described in subsequent portions of this document
+take a DMA direction argument, which is an integer and takes on
+one of the following values:
+
+ DMA_BIDIRECTIONAL
+ DMA_TO_DEVICE
+ DMA_FROM_DEVICE
+ DMA_NONE
+
+One should provide the exact DMA direction if you know it.
+
+DMA_TO_DEVICE means "from main memory to the device"
+DMA_FROM_DEVICE means "from the device to main memory"
+It is the direction in which the data moves during the DMA
+transfer.
+
+You are _strongly_ encouraged to specify this as precisely
+as you possibly can.
+
+If you absolutely cannot know the direction of the DMA transfer,
+specify DMA_BIDIRECTIONAL.  It means that the DMA can go in
+either direction.  The platform guarantees that you may legally
+specify this, and that it will work, but this may be at the
+cost of performance for example.
+
+The value DMA_NONE is to be used for debugging.  One can
+hold this in a data structure before you come to know the
+precise direction, and this will help catch cases where your
+direction tracking logic has failed to set things up properly.
+
+Another advantage of specifying this value precisely (outside of
+potential platform-specific optimizations of such) is for debugging.
+Some platforms actually have a write permission boolean which DMA
+mappings can be marked with, much like page protections in the user
+program address space.  Such platforms can and do report errors in the
+kernel logs when the DMA controller hardware detects violation of the
+permission setting.
+
+Only streaming mappings specify a direction, consistent mappings
+implicitly have a direction attribute setting of
+DMA_BIDIRECTIONAL.
+
+The SCSI subsystem tells you the direction to use in the
+'sc_data_direction' member of the SCSI command your driver is
+working on.
+
+For Networking drivers, it's a rather simple affair.  For transmit
+packets, map/unmap them with the DMA_TO_DEVICE direction
+specifier.  For receive packets, just the opposite, map/unmap them
+with the DMA_FROM_DEVICE direction specifier.
+
+		  Using Streaming DMA mappings
+
+The streaming DMA mapping routines can be called from interrupt
+context.  There are two versions of each map/unmap, one which will
+map/unmap a single memory region, and one which will map/unmap a
+scatterlist.
+
+To map a single region, you do:
+
+	struct device *dev = &my_dev->dev;
+	dma_addr_t dma_handle;
+	void *addr = buffer->ptr;
+	size_t size = buffer->len;
+
+	dma_handle = dma_map_single(dev, addr, size, direction);
+
+and to unmap it:
+
+	dma_unmap_single(dev, dma_handle, size, direction);
+
+You should call dma_unmap_single when the DMA activity is finished, e.g.
+from the interrupt which told you that the DMA transfer is done.
+
+Using cpu pointers like this for single mappings has a disadvantage,
+you cannot reference HIGHMEM memory in this way.  Thus, there is a
+map/unmap interface pair akin to dma_{map,unmap}_single.  These
+interfaces deal with page/offset pairs instead of cpu pointers.
+Specifically:
+
+	struct device *dev = &my_dev->dev;
+	dma_addr_t dma_handle;
+	struct page *page = buffer->page;
+	unsigned long offset = buffer->offset;
+	size_t size = buffer->len;
+
+	dma_handle = dma_map_page(dev, page, offset, size, direction);
+
+	...
+
+	dma_unmap_page(dev, dma_handle, size, direction);
+
+Here, "offset" means byte offset within the given page.
+
+With scatterlists, you map a region gathered from several regions by:
+
+	int i, count = dma_map_sg(dev, sglist, nents, direction);
+	struct scatterlist *sg;
+
+	for_each_sg(sglist, sg, count, i) {
+		hw_address[i] = sg_dma_address(sg);
+		hw_len[i] = sg_dma_len(sg);
+	}
+
+where nents is the number of entries in the sglist.
+
+The implementation is free to merge several consecutive sglist entries
+into one (e.g. if DMA mapping is done with PAGE_SIZE granularity, any
+consecutive sglist entries can be merged into one provided the first one
+ends and the second one starts on a page boundary - in fact this is a huge
+advantage for cards which either cannot do scatter-gather or have very
+limited number of scatter-gather entries) and returns the actual number
+of sg entries it mapped them to. On failure 0 is returned.
+
+Then you should loop count times (note: this can be less than nents times)
+and use sg_dma_address() and sg_dma_len() macros where you previously
+accessed sg->address and sg->length as shown above.
+
+To unmap a scatterlist, just call:
+
+	dma_unmap_sg(dev, sglist, nents, direction);
+
+Again, make sure DMA activity has already finished.
+
+PLEASE NOTE:  The 'nents' argument to the dma_unmap_sg call must be
+              the _same_ one you passed into the dma_map_sg call,
+	      it should _NOT_ be the 'count' value _returned_ from the
+              dma_map_sg call.
+
+Every dma_map_{single,sg} call should have its dma_unmap_{single,sg}
+counterpart, because the bus address space is a shared resource (although
+in some ports the mapping is per each BUS so less devices contend for the
+same bus address space) and you could render the machine unusable by eating
+all bus addresses.
+
+If you need to use the same streaming DMA region multiple times and touch
+the data in between the DMA transfers, the buffer needs to be synced
+properly in order for the cpu and device to see the most uptodate and
+correct copy of the DMA buffer.
+
+So, firstly, just map it with dma_map_{single,sg}, and after each DMA
+transfer call either:
+
+	dma_sync_single_for_cpu(dev, dma_handle, size, direction);
+
+or:
+
+	dma_sync_sg_for_cpu(dev, sglist, nents, direction);
+
+as appropriate.
+
+Then, if you wish to let the device get at the DMA area again,
+finish accessing the data with the cpu, and then before actually
+giving the buffer to the hardware call either:
+
+	dma_sync_single_for_device(dev, dma_handle, size, direction);
+
+or:
+
+	dma_sync_sg_for_device(dev, sglist, nents, direction);
+
+as appropriate.
+
+After the last DMA transfer call one of the DMA unmap routines
+dma_unmap_{single,sg}. If you don't touch the data from the first dma_map_*
+call till dma_unmap_*, then you don't have to call the dma_sync_*
+routines at all.
+
+Here is pseudo code which shows a situation in which you would need
+to use the dma_sync_*() interfaces.
+
+	my_card_setup_receive_buffer(struct my_card *cp, char *buffer, int len)
+	{
+		dma_addr_t mapping;
+
+		mapping = dma_map_single(cp->dev, buffer, len, DMA_FROM_DEVICE);
+
+		cp->rx_buf = buffer;
+		cp->rx_len = len;
+		cp->rx_dma = mapping;
+
+		give_rx_buf_to_card(cp);
+	}
+
+	...
+
+	my_card_interrupt_handler(int irq, void *devid, struct pt_regs *regs)
+	{
+		struct my_card *cp = devid;
+
+		...
+		if (read_card_status(cp) == RX_BUF_TRANSFERRED) {
+			struct my_card_header *hp;
+
+			/* Examine the header to see if we wish
+			 * to accept the data.  But synchronize
+			 * the DMA transfer with the CPU first
+			 * so that we see updated contents.
+			 */
+			dma_sync_single_for_cpu(&cp->dev, cp->rx_dma,
+						cp->rx_len,
+						DMA_FROM_DEVICE);
+
+			/* Now it is safe to examine the buffer. */
+			hp = (struct my_card_header *) cp->rx_buf;
+			if (header_is_ok(hp)) {
+				dma_unmap_single(&cp->dev, cp->rx_dma, cp->rx_len,
+						 DMA_FROM_DEVICE);
+				pass_to_upper_layers(cp->rx_buf);
+				make_and_setup_new_rx_buf(cp);
+			} else {
+				/* Just sync the buffer and give it back
+				 * to the card.
+				 */
+				dma_sync_single_for_device(&cp->dev,
+							   cp->rx_dma,
+							   cp->rx_len,
+							   DMA_FROM_DEVICE);
+				give_rx_buf_to_card(cp);
+			}
+		}
+	}
+
+Drivers converted fully to this interface should not use virt_to_bus any
+longer, nor should they use bus_to_virt. Some drivers have to be changed a
+little bit, because there is no longer an equivalent to bus_to_virt in the
+dynamic DMA mapping scheme - you have to always store the DMA addresses
+returned by the dma_alloc_coherent, dma_pool_alloc, and dma_map_single
+calls (dma_map_sg stores them in the scatterlist itself if the platform
+supports dynamic DMA mapping in hardware) in your driver structures and/or
+in the card registers.
+
+All drivers should be using these interfaces with no exceptions.  It
+is planned to completely remove virt_to_bus() and bus_to_virt() as
+they are entirely deprecated.  Some ports already do not provide these
+as it is impossible to correctly support them.
+
+		Optimizing Unmap State Space Consumption
+
+On many platforms, dma_unmap_{single,page}() is simply a nop.
+Therefore, keeping track of the mapping address and length is a waste
+of space.  Instead of filling your drivers up with ifdefs and the like
+to "work around" this (which would defeat the whole purpose of a
+portable API) the following facilities are provided.
+
+Actually, instead of describing the macros one by one, we'll
+transform some example code.
+
+1) Use DEFINE_DMA_UNMAP_{ADDR,LEN} in state saving structures.
+   Example, before:
+
+	struct ring_state {
+		struct sk_buff *skb;
+		dma_addr_t mapping;
+		__u32 len;
+	};
+
+   after:
+
+	struct ring_state {
+		struct sk_buff *skb;
+		DEFINE_DMA_UNMAP_ADDR(mapping);
+		DEFINE_DMA_UNMAP_LEN(len);
+	};
+
+2) Use dma_unmap_{addr,len}_set to set these values.
+   Example, before:
+
+	ringp->mapping = FOO;
+	ringp->len = BAR;
+
+   after:
+
+	dma_unmap_addr_set(ringp, mapping, FOO);
+	dma_unmap_len_set(ringp, len, BAR);
+
+3) Use dma_unmap_{addr,len} to access these values.
+   Example, before:
+
+	dma_unmap_single(dev, ringp->mapping, ringp->len,
+			 DMA_FROM_DEVICE);
+
+   after:
+
+	dma_unmap_single(dev,
+			 dma_unmap_addr(ringp, mapping),
+			 dma_unmap_len(ringp, len),
+			 DMA_FROM_DEVICE);
+
+It really should be self-explanatory.  We treat the ADDR and LEN
+separately, because it is possible for an implementation to only
+need the address in order to perform the unmap operation.
+
+			Platform Issues
+
+If you are just writing drivers for Linux and do not maintain
+an architecture port for the kernel, you can safely skip down
+to "Closing".
+
+1) Struct scatterlist requirements.
+
+   Struct scatterlist must contain, at a minimum, the following
+   members:
+
+	struct page *page;
+	unsigned int offset;
+	unsigned int length;
+
+   The base address is specified by a "page+offset" pair.
+
+   Previous versions of struct scatterlist contained a "void *address"
+   field that was sometimes used instead of page+offset.  As of Linux
+   2.5., page+offset is always used, and the "address" field has been
+   deleted.
+
+2) More to come...
+
+			Handling Errors
+
+DMA address space is limited on some architectures and an allocation
+failure can be determined by:
+
+- checking if dma_alloc_coherent returns NULL or dma_map_sg returns 0
+
+- checking the returned dma_addr_t of dma_map_single and dma_map_page
+  by using dma_mapping_error():
+
+	dma_addr_t dma_handle;
+
+	dma_handle = dma_map_single(dev, addr, size, direction);
+	if (dma_mapping_error(dev, dma_handle)) {
+		/*
+		 * reduce current DMA mapping usage,
+		 * delay and try again later or
+		 * reset driver.
+		 */
+	}
+
+			   Closing
+
+This document, and the API itself, would not be in its current
+form without the feedback and suggestions from numerous individuals.
+We would like to specifically mention, in no particular order, the
+following people:
+
+	Russell King <rmk@arm.linux.org.uk>
+	Leo Dagum <dagum@barrel.engr.sgi.com>
+	Ralf Baechle <ralf@oss.sgi.com>
+	Grant Grundler <grundler@cup.hp.com>
+	Jay Estabrook <Jay.Estabrook@compaq.com>
+	Thomas Sailer <sailer@ife.ee.ethz.ch>
+	Andrea Arcangeli <andrea@suse.de>
+	Jens Axboe <jens.axboe@oracle.com>
+	David Mosberger-Tang <davidm@hpl.hp.com>

+ 36 - 86
Documentation/DMA-API.txt

@@ -4,20 +4,18 @@
         James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
 
 This document describes the DMA API.  For a more gentle introduction
-phrased in terms of the pci_ equivalents (and actual examples) see
-Documentation/PCI/PCI-DMA-mapping.txt.
+of the API (and actual examples) see
+Documentation/DMA-API-HOWTO.txt.
 
-This API is split into two pieces.  Part I describes the API and the
-corresponding pci_ API.  Part II describes the extensions to the API
-for supporting non-consistent memory machines.  Unless you know that
-your driver absolutely has to support non-consistent platforms (this
-is usually only legacy platforms) you should only use the API
-described in part I.
+This API is split into two pieces.  Part I describes the API.  Part II
+describes the extensions to the API for supporting non-consistent
+memory machines.  Unless you know that your driver absolutely has to
+support non-consistent platforms (this is usually only legacy
+platforms) you should only use the API described in part I.
 
-Part I - pci_ and dma_ Equivalent API 
+Part I - dma_ API
 -------------------------------------
 
-To get the pci_ API, you must #include <linux/pci.h>
 To get the dma_ API, you must #include <linux/dma-mapping.h>
 
 
@@ -27,9 +25,6 @@ Part Ia - Using large dma-coherent buffers
 void *
 dma_alloc_coherent(struct device *dev, size_t size,
 			     dma_addr_t *dma_handle, gfp_t flag)
-void *
-pci_alloc_consistent(struct pci_dev *dev, size_t size,
-			     dma_addr_t *dma_handle)
 
 Consistent memory is memory for which a write by either the device or
 the processor can immediately be read by the processor or device
@@ -53,15 +48,11 @@ The simplest way to do that is to use the dma_pool calls (see below).
 The flag parameter (dma_alloc_coherent only) allows the caller to
 specify the GFP_ flags (see kmalloc) for the allocation (the
 implementation may choose to ignore flags that affect the location of
-the returned memory, like GFP_DMA).  For pci_alloc_consistent, you
-must assume GFP_ATOMIC behaviour.
+the returned memory, like GFP_DMA).
 
 void
 dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
 			   dma_addr_t dma_handle)
-void
-pci_free_consistent(struct pci_dev *dev, size_t size, void *cpu_addr,
-			   dma_addr_t dma_handle)
 
 Free the region of consistent memory you previously allocated.  dev,
 size and dma_handle must all be the same as those passed into the
@@ -89,10 +80,6 @@ for alignment, like queue heads needing to be aligned on N-byte boundaries.
 	dma_pool_create(const char *name, struct device *dev,
 			size_t size, size_t align, size_t alloc);
 
-	struct pci_pool *
-	pci_pool_create(const char *name, struct pci_device *dev,
-			size_t size, size_t align, size_t alloc);
-
 The pool create() routines initialize a pool of dma-coherent buffers
 for use with a given device.  It must be called in a context which
 can sleep.
@@ -108,9 +95,6 @@ from this pool must not cross 4KByte boundaries.
 	void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags,
 			dma_addr_t *dma_handle);
 
-	void *pci_pool_alloc(struct pci_pool *pool, gfp_t gfp_flags,
-			dma_addr_t *dma_handle);
-
 This allocates memory from the pool; the returned memory will meet the size
 and alignment requirements specified at creation time.  Pass GFP_ATOMIC to
 prevent blocking, or if it's permitted (not in_interrupt, not holding SMP locks),
@@ -122,9 +106,6 @@ pool's device.
 	void dma_pool_free(struct dma_pool *pool, void *vaddr,
 			dma_addr_t addr);
 
-	void pci_pool_free(struct pci_pool *pool, void *vaddr,
-			dma_addr_t addr);
-
 This puts memory back into the pool.  The pool is what was passed to
 the pool allocation routine; the cpu (vaddr) and dma addresses are what
 were returned when that routine allocated the memory being freed.
@@ -132,8 +113,6 @@ were returned when that routine allocated the memory being freed.
 
 	void dma_pool_destroy(struct dma_pool *pool);
 
-	void pci_pool_destroy(struct pci_pool *pool);
-
 The pool destroy() routines free the resources of the pool.  They must be
 called in a context which can sleep.  Make sure you've freed all allocated
 memory back to the pool before you destroy it.
@@ -144,8 +123,6 @@ Part Ic - DMA addressing limitations
 
 int
 dma_supported(struct device *dev, u64 mask)
-int
-pci_dma_supported(struct pci_dev *hwdev, u64 mask)
 
 Checks to see if the device can support DMA to the memory described by
 mask.
@@ -159,8 +136,14 @@ driver writers.
 
 int
 dma_set_mask(struct device *dev, u64 mask)
+
+Checks to see if the mask is possible and updates the device
+parameters if it is.
+
+Returns: 0 if successful and a negative error if not.
+
 int
-pci_set_dma_mask(struct pci_device *dev, u64 mask)
+dma_set_coherent_mask(struct device *dev, u64 mask)
 
 Checks to see if the mask is possible and updates the device
 parameters if it is.
@@ -187,9 +170,6 @@ Part Id - Streaming DMA mappings
 dma_addr_t
 dma_map_single(struct device *dev, void *cpu_addr, size_t size,
 		      enum dma_data_direction direction)
-dma_addr_t
-pci_map_single(struct pci_dev *hwdev, void *cpu_addr, size_t size,
-		      int direction)
 
 Maps a piece of processor virtual memory so it can be accessed by the
 device and returns the physical handle of the memory.
@@ -198,14 +178,10 @@ The direction for both api's may be converted freely by casting.
 However the dma_ API uses a strongly typed enumerator for its
 direction:
 
-DMA_NONE		= PCI_DMA_NONE		no direction (used for
-						debugging)
-DMA_TO_DEVICE		= PCI_DMA_TODEVICE	data is going from the
-						memory to the device
-DMA_FROM_DEVICE		= PCI_DMA_FROMDEVICE	data is coming from
-						the device to the
-						memory
-DMA_BIDIRECTIONAL	= PCI_DMA_BIDIRECTIONAL	direction isn't known
+DMA_NONE		no direction (used for debugging)
+DMA_TO_DEVICE		data is going from the memory to the device
+DMA_FROM_DEVICE		data is coming from the device to the memory
+DMA_BIDIRECTIONAL	direction isn't known
 
 Notes:  Not all memory regions in a machine can be mapped by this
 API.  Further, regions that appear to be physically contiguous in
@@ -268,9 +244,6 @@ cache lines are updated with data that the device may have changed).
 void
 dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
 		 enum dma_data_direction direction)
-void
-pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr,
-		 size_t size, int direction)
 
 Unmaps the region previously mapped.  All the parameters passed in
 must be identical to those passed in (and returned) by the mapping
@@ -280,15 +253,9 @@ dma_addr_t
 dma_map_page(struct device *dev, struct page *page,
 		    unsigned long offset, size_t size,
 		    enum dma_data_direction direction)
-dma_addr_t
-pci_map_page(struct pci_dev *hwdev, struct page *page,
-		    unsigned long offset, size_t size, int direction)
 void
 dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
 	       enum dma_data_direction direction)
-void
-pci_unmap_page(struct pci_dev *hwdev, dma_addr_t dma_address,
-	       size_t size, int direction)
 
 API for mapping and unmapping for pages.  All the notes and warnings
 for the other mapping APIs apply here.  Also, although the <offset>
@@ -299,9 +266,6 @@ cache width is.
 int
 dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 
-int
-pci_dma_mapping_error(struct pci_dev *hwdev, dma_addr_t dma_addr)
-
 In some circumstances dma_map_single and dma_map_page will fail to create
 a mapping. A driver can check for these errors by testing the returned
 dma address with dma_mapping_error(). A non-zero return value means the mapping
@@ -311,9 +275,6 @@ reduce current DMA mapping usage or delay and try again later).
 	int
 	dma_map_sg(struct device *dev, struct scatterlist *sg,
 		int nents, enum dma_data_direction direction)
-	int
-	pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg,
-		int nents, int direction)
 
 Returns: the number of physical segments mapped (this may be shorter
 than <nents> passed in if some elements of the scatter/gather list are
@@ -353,9 +314,6 @@ accessed sg->address and sg->length as shown above.
 	void
 	dma_unmap_sg(struct device *dev, struct scatterlist *sg,
 		int nhwentries, enum dma_data_direction direction)
-	void
-	pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg,
-		int nents, int direction)
 
 Unmap the previously mapped scatter/gather list.  All the parameters
 must be the same as those and passed in to the scatter/gather mapping
@@ -365,21 +323,23 @@ Note: <nents> must be the number you passed in, *not* the number of
 physical entries returned.
 
 void
-dma_sync_single(struct device *dev, dma_addr_t dma_handle, size_t size,
-		enum dma_data_direction direction)
+dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
+			enum dma_data_direction direction)
 void
-pci_dma_sync_single(struct pci_dev *hwdev, dma_addr_t dma_handle,
-			   size_t size, int direction)
+dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
+			   enum dma_data_direction direction)
 void
-dma_sync_sg(struct device *dev, struct scatterlist *sg, int nelems,
-			  enum dma_data_direction direction)
+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
+		    enum dma_data_direction direction)
 void
-pci_dma_sync_sg(struct pci_dev *hwdev, struct scatterlist *sg,
-		       int nelems, int direction)
+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
+		       enum dma_data_direction direction)
 
-Synchronise a single contiguous or scatter/gather mapping.  All the
-parameters must be the same as those passed into the single mapping
-API.
+Synchronise a single contiguous or scatter/gather mapping for the cpu
+and device. With the sync_sg API, all the parameters must be the same
+as those passed into the single mapping API. With the sync_single API,
+you can use dma_handle and size parameters that aren't identical to
+those passed into the single mapping API to do a partial sync.
 
 Notes:  You must do this:
 
@@ -461,9 +421,9 @@ void whizco_dma_map_sg_attrs(struct device *dev, dma_addr_t dma_addr,
 Part II - Advanced dma_ usage
 -----------------------------
 
-Warning: These pieces of the DMA API have no PCI equivalent.  They
-should also not be used in the majority of cases, since they cater for
-unlikely corner cases that don't belong in usual drivers.
+Warning: These pieces of the DMA API should not be used in the
+majority of cases, since they cater for unlikely corner cases that
+don't belong in usual drivers.
 
 If you don't understand how cache line coherency works between a
 processor and an I/O device, you should not be using this part of the
@@ -513,16 +473,6 @@ line, but it will guarantee that one or more cache lines fit exactly
 into the width returned by this call.  It will also always be a power
 of two for easy alignment.
 
-void
-dma_sync_single_range(struct device *dev, dma_addr_t dma_handle,
-		      unsigned long offset, size_t size,
-		      enum dma_data_direction direction)
-
-Does a partial sync, starting at offset and continuing for size.  You
-must be careful to observe the cache alignment and width when doing
-anything like this.  You must also be extra careful about accessing
-memory you intend to sync partially.
-
 void
 dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 	       enum dma_data_direction direction)

+ 29 - 36
Documentation/DocBook/libata.tmpl

@@ -81,16 +81,14 @@ void (*port_disable) (struct ata_port *);
 	</programlisting>
 
 	<para>
-	Called from ata_bus_probe() and ata_bus_reset() error paths,
-	as well as when unregistering from the SCSI module (rmmod, hot
-	unplug).
+	Called from ata_bus_probe() error path, as well as when
+	unregistering from the SCSI module (rmmod, hot unplug).
 	This function should do whatever needs to be done to take the
 	port out of use.  In most cases, ata_port_disable() can be used
 	as this hook.
 	</para>
 	<para>
 	Called from ata_bus_probe() on a failed probe.
-	Called from ata_bus_reset() on a failed bus reset.
 	Called from ata_scsi_release().
 	</para>
 
@@ -107,10 +105,6 @@ void (*dev_config) (struct ata_port *, struct ata_device *);
 	issue of SET FEATURES - XFER MODE, and prior to operation.
 	</para>
 	<para>
-	Called by ata_device_add() after ata_dev_identify() determines
-	a device is present.
-	</para>
-	<para>
 	This entry may be specified as NULL in ata_port_operations.
 	</para>
 
@@ -154,8 +148,8 @@ unsigned int (*mode_filter) (struct ata_port *, struct ata_device *, unsigned in
 
 	<sect2><title>Taskfile read/write</title>
 	<programlisting>
-void (*tf_load) (struct ata_port *ap, struct ata_taskfile *tf);
-void (*tf_read) (struct ata_port *ap, struct ata_taskfile *tf);
+void (*sff_tf_load) (struct ata_port *ap, struct ata_taskfile *tf);
+void (*sff_tf_read) (struct ata_port *ap, struct ata_taskfile *tf);
 	</programlisting>
 
 	<para>
@@ -164,36 +158,35 @@ void (*tf_read) (struct ata_port *ap, struct ata_taskfile *tf);
 	hardware registers / DMA buffers, to obtain the current set of
 	taskfile register values.
 	Most drivers for taskfile-based hardware (PIO or MMIO) use
-	ata_tf_load() and ata_tf_read() for these hooks.
+	ata_sff_tf_load() and ata_sff_tf_read() for these hooks.
 	</para>
 
 	</sect2>
 
 	<sect2><title>PIO data read/write</title>
 	<programlisting>
-void (*data_xfer) (struct ata_device *, unsigned char *, unsigned int, int);
+void (*sff_data_xfer) (struct ata_device *, unsigned char *, unsigned int, int);
 	</programlisting>
 
 	<para>
 All bmdma-style drivers must implement this hook.  This is the low-level
 operation that actually copies the data bytes during a PIO data
 transfer.
-Typically the driver
-will choose one of ata_pio_data_xfer_noirq(), ata_pio_data_xfer(), or
-ata_mmio_data_xfer().
+Typically the driver will choose one of ata_sff_data_xfer_noirq(),
+ata_sff_data_xfer(), or ata_sff_data_xfer32().
 	</para>
 
 	</sect2>
 
 	<sect2><title>ATA command execute</title>
 	<programlisting>
-void (*exec_command)(struct ata_port *ap, struct ata_taskfile *tf);
+void (*sff_exec_command)(struct ata_port *ap, struct ata_taskfile *tf);
 	</programlisting>
 
 	<para>
 	causes an ATA command, previously loaded with
 	->tf_load(), to be initiated in hardware.
-	Most drivers for taskfile-based hardware use ata_exec_command()
+	Most drivers for taskfile-based hardware use ata_sff_exec_command()
 	for this hook.
 	</para>
 
@@ -218,8 +211,8 @@ command.
 
 	<sect2><title>Read specific ATA shadow registers</title>
 	<programlisting>
-u8   (*check_status)(struct ata_port *ap);
-u8   (*check_altstatus)(struct ata_port *ap);
+u8   (*sff_check_status)(struct ata_port *ap);
+u8   (*sff_check_altstatus)(struct ata_port *ap);
 	</programlisting>
 
 	<para>
@@ -227,20 +220,26 @@ u8   (*check_altstatus)(struct ata_port *ap);
 	hardware.  On some hardware, reading the Status register has
 	the side effect of clearing the interrupt condition.
 	Most drivers for taskfile-based hardware use
-	ata_check_status() for this hook.
+	ata_sff_check_status() for this hook.
 	</para>
+
+	</sect2>
+
+	<sect2><title>Write specific ATA shadow register</title>
+	<programlisting>
+void (*sff_set_devctl)(struct ata_port *ap, u8 ctl);
+	</programlisting>
+
 	<para>
-	Note that because this is called from ata_device_add(), at
-	least a dummy function that clears device interrupts must be
-	provided for all drivers, even if the controller doesn't
-	actually have a taskfile status register.
+	Write the device control ATA shadow register to the hardware.
+	Most drivers don't need to define this.
 	</para>
 
 	</sect2>
 
 	<sect2><title>Select ATA device on bus</title>
 	<programlisting>
-void (*dev_select)(struct ata_port *ap, unsigned int device);
+void (*sff_dev_select)(struct ata_port *ap, unsigned int device);
 	</programlisting>
 
 	<para>
@@ -251,9 +250,7 @@ void (*dev_select)(struct ata_port *ap, unsigned int device);
 	</para>
 	<para>
 	Most drivers for taskfile-based hardware use
-	ata_std_dev_select() for this hook.  Controllers which do not
-	support second drives on a port (such as SATA contollers) will
-	use ata_noop_dev_select().
+	ata_sff_dev_select() for this hook.
 	</para>
 
 	</sect2>
@@ -441,13 +438,13 @@ void (*irq_clear) (struct ata_port *);
 	to struct ata_host_set.
 	</para>
 	<para>
-	Most legacy IDE drivers use ata_interrupt() for the
+	Most legacy IDE drivers use ata_sff_interrupt() for the
 	irq_handler hook, which scans all ports in the host_set,
 	determines which queued command was active (if any), and calls
-	ata_host_intr(ap,qc).
+	ata_sff_host_intr(ap,qc).
 	</para>
 	<para>
-	Most legacy IDE drivers use ata_bmdma_irq_clear() for the
+	Most legacy IDE drivers use ata_sff_irq_clear() for the
 	irq_clear() hook, which simply clears the interrupt and error
 	flags in the DMA status register.
 	</para>
@@ -490,16 +487,12 @@ void (*host_stop) (struct ata_host_set *host_set);
 	allocates space for a legacy IDE PRD table and returns.
 	</para>
 	<para>
-	->port_stop() is called after ->host_stop().  It's sole function
+	->port_stop() is called after ->host_stop().  Its sole function
 	is to release DMA/memory resources, now that they are no longer
 	actively being used.  Many drivers also free driver-private
 	data from port at this time.
 	</para>
 	<para>
-	Many drivers use ata_port_stop() as this hook, which frees the
-	PRD table.
-	</para>
-	<para>
 	->host_stop() is called after all ->port_stop() calls
 have completed.  The hook must finalize hardware shutdown, release DMA
 and other resources, etc.

+ 11 - 0
Documentation/DocBook/media-entities.tmpl

@@ -17,6 +17,7 @@
 <!ENTITY VIDIOC-DBG-G-REGISTER "<link linkend='vidioc-dbg-g-register'><constant>VIDIOC_DBG_G_REGISTER</constant></link>">
 <!ENTITY VIDIOC-DBG-S-REGISTER "<link linkend='vidioc-dbg-g-register'><constant>VIDIOC_DBG_S_REGISTER</constant></link>">
 <!ENTITY VIDIOC-DQBUF "<link linkend='vidioc-qbuf'><constant>VIDIOC_DQBUF</constant></link>">
+<!ENTITY VIDIOC-DQEVENT "<link linkend='vidioc-dqevent'><constant>VIDIOC_DQEVENT</constant></link>">
 <!ENTITY VIDIOC-ENCODER-CMD "<link linkend='vidioc-encoder-cmd'><constant>VIDIOC_ENCODER_CMD</constant></link>">
 <!ENTITY VIDIOC-ENUMAUDIO "<link linkend='vidioc-enumaudio'><constant>VIDIOC_ENUMAUDIO</constant></link>">
 <!ENTITY VIDIOC-ENUMAUDOUT "<link linkend='vidioc-enumaudioout'><constant>VIDIOC_ENUMAUDOUT</constant></link>">
@@ -60,6 +61,7 @@
 <!ENTITY VIDIOC-REQBUFS "<link linkend='vidioc-reqbufs'><constant>VIDIOC_REQBUFS</constant></link>">
 <!ENTITY VIDIOC-STREAMOFF "<link linkend='vidioc-streamon'><constant>VIDIOC_STREAMOFF</constant></link>">
 <!ENTITY VIDIOC-STREAMON "<link linkend='vidioc-streamon'><constant>VIDIOC_STREAMON</constant></link>">
+<!ENTITY VIDIOC-SUBSCRIBE-EVENT "<link linkend='vidioc-subscribe-event'><constant>VIDIOC_SUBSCRIBE_EVENT</constant></link>">
 <!ENTITY VIDIOC-S-AUDIO "<link linkend='vidioc-g-audio'><constant>VIDIOC_S_AUDIO</constant></link>">
 <!ENTITY VIDIOC-S-AUDOUT "<link linkend='vidioc-g-audioout'><constant>VIDIOC_S_AUDOUT</constant></link>">
 <!ENTITY VIDIOC-S-CROP "<link linkend='vidioc-g-crop'><constant>VIDIOC_S_CROP</constant></link>">
@@ -83,6 +85,7 @@
 <!ENTITY VIDIOC-TRY-ENCODER-CMD "<link linkend='vidioc-encoder-cmd'><constant>VIDIOC_TRY_ENCODER_CMD</constant></link>">
 <!ENTITY VIDIOC-TRY-EXT-CTRLS "<link linkend='vidioc-g-ext-ctrls'><constant>VIDIOC_TRY_EXT_CTRLS</constant></link>">
 <!ENTITY VIDIOC-TRY-FMT "<link linkend='vidioc-g-fmt'><constant>VIDIOC_TRY_FMT</constant></link>">
+<!ENTITY VIDIOC-UNSUBSCRIBE-EVENT "<link linkend='vidioc-subscribe-event'><constant>VIDIOC_UNSUBSCRIBE_EVENT</constant></link>">
 
 <!-- Types -->
 <!ENTITY v4l2-std-id "<link linkend='v4l2-std-id'>v4l2_std_id</link>">
@@ -141,6 +144,9 @@
 <!ENTITY v4l2-enc-idx "struct&nbsp;<link linkend='v4l2-enc-idx'>v4l2_enc_idx</link>">
 <!ENTITY v4l2-enc-idx-entry "struct&nbsp;<link linkend='v4l2-enc-idx-entry'>v4l2_enc_idx_entry</link>">
 <!ENTITY v4l2-encoder-cmd "struct&nbsp;<link linkend='v4l2-encoder-cmd'>v4l2_encoder_cmd</link>">
+<!ENTITY v4l2-event "struct&nbsp;<link linkend='v4l2-event'>v4l2_event</link>">
+<!ENTITY v4l2-event-subscription "struct&nbsp;<link linkend='v4l2-event-subscription'>v4l2_event_subscription</link>">
+<!ENTITY v4l2-event-vsync "struct&nbsp;<link linkend='v4l2-event-vsync'>v4l2_event_vsync</link>">
 <!ENTITY v4l2-ext-control "struct&nbsp;<link linkend='v4l2-ext-control'>v4l2_ext_control</link>">
 <!ENTITY v4l2-ext-controls "struct&nbsp;<link linkend='v4l2-ext-controls'>v4l2_ext_controls</link>">
 <!ENTITY v4l2-fmtdesc "struct&nbsp;<link linkend='v4l2-fmtdesc'>v4l2_fmtdesc</link>">
@@ -200,6 +206,7 @@
 <!ENTITY sub-controls SYSTEM "v4l/controls.xml">
 <!ENTITY sub-dev-capture SYSTEM "v4l/dev-capture.xml">
 <!ENTITY sub-dev-codec SYSTEM "v4l/dev-codec.xml">
+<!ENTITY sub-dev-event SYSTEM "v4l/dev-event.xml">
 <!ENTITY sub-dev-effect SYSTEM "v4l/dev-effect.xml">
 <!ENTITY sub-dev-osd SYSTEM "v4l/dev-osd.xml">
 <!ENTITY sub-dev-output SYSTEM "v4l/dev-output.xml">
@@ -292,6 +299,8 @@
 <!ENTITY sub-v4l2grab-c SYSTEM "v4l/v4l2grab.c.xml">
 <!ENTITY sub-videodev2-h SYSTEM "v4l/videodev2.h.xml">
 <!ENTITY sub-v4l2 SYSTEM "v4l/v4l2.xml">
+<!ENTITY sub-dqevent SYSTEM "v4l/vidioc-dqevent.xml">
+<!ENTITY sub-subscribe-event SYSTEM "v4l/vidioc-subscribe-event.xml">
 <!ENTITY sub-intro SYSTEM "dvb/intro.xml">
 <!ENTITY sub-frontend SYSTEM "dvb/frontend.xml">
 <!ENTITY sub-dvbproperty SYSTEM "dvb/dvbproperty.xml">
@@ -381,3 +390,5 @@
 <!ENTITY reqbufs SYSTEM "v4l/vidioc-reqbufs.xml">
 <!ENTITY s-hw-freq-seek SYSTEM "v4l/vidioc-s-hw-freq-seek.xml">
 <!ENTITY streamon SYSTEM "v4l/vidioc-streamon.xml">
+<!ENTITY dqevent SYSTEM "v4l/vidioc-dqevent.xml">
+<!ENTITY subscribe_event SYSTEM "v4l/vidioc-subscribe-event.xml">

+ 3 - 3
Documentation/DocBook/mtdnand.tmpl

@@ -488,7 +488,7 @@ static void board_select_chip (struct mtd_info *mtd, int chip)
 				The ECC bytes must be placed immidiately after the data
 				bytes in order to make the syndrome generator work. This
 				is contrary to the usual layout used by software ECC. The
-				seperation of data and out of band area is not longer
+				separation of data and out of band area is not longer
 				possible. The nand driver code handles this layout and
 				the remaining free bytes in the oob area are managed by 
 				the autoplacement code. Provide a matching oob-layout
@@ -560,7 +560,7 @@ static void board_select_chip (struct mtd_info *mtd, int chip)
 				bad blocks. They have factory marked good blocks. The marker pattern
 				is erased when the block is erased to be reused. So in case of
 				powerloss before writing the pattern back to the chip this block 
-				would be lost and added to the bad blocks. Therefor we scan the 
+				would be lost and added to the bad blocks. Therefore we scan the 
 				chip(s) when we detect them the first time for good blocks and 
 				store this information in a bad block table before erasing any 
 				of the blocks.
@@ -1094,7 +1094,7 @@ in this page</entry>
 		manufacturers specifications. This applies similar to the spare area. 
 	</para>
 	<para>
-		Therefor NAND aware filesystems must either write in page size chunks
+		Therefore NAND aware filesystems must either write in page size chunks
 		or hold a writebuffer to collect smaller writes until they sum up to 
 		pagesize. Available NAND aware filesystems: JFFS2, YAFFS. 		
 	</para>

+ 7 - 3
Documentation/DocBook/sh.tmpl

@@ -19,13 +19,17 @@
   </authorgroup>
 
   <copyright>
-   <year>2008</year>
+   <year>2008-2010</year>
    <holder>Paul Mundt</holder>
   </copyright>
   <copyright>
-   <year>2008</year>
+   <year>2008-2010</year>
    <holder>Renesas Technology Corp.</holder>
   </copyright>
+  <copyright>
+   <year>2010</year>
+   <holder>Renesas Electronics Corp.</holder>
+  </copyright>
 
   <legalnotice>
    <para>
@@ -77,7 +81,7 @@
   </chapter>
   <chapter id="clk">
     <title>Clock Framework Extensions</title>
-!Iarch/sh/include/asm/clock.h
+!Iinclude/linux/sh_clk.h
   </chapter>
   <chapter id="mach">
     <title>Machine Specific Interfaces</title>

+ 13 - 0
Documentation/DocBook/tracepoint.tmpl

@@ -16,6 +16,15 @@
      </address>
     </affiliation>
    </author>
+   <author>
+    <firstname>William</firstname>
+    <surname>Cohen</surname>
+    <affiliation>
+     <address>
+      <email>wcohen@redhat.com</email>
+     </address>
+    </affiliation>
+   </author>
   </authorgroup>
 
   <legalnotice>
@@ -91,4 +100,8 @@
 !Iinclude/trace/events/signal.h
   </chapter>
 
+  <chapter id="block">
+   <title>Block IO</title>
+!Iinclude/trace/events/block.h
+  </chapter>
 </book>

+ 1 - 1
Documentation/DocBook/v4l/common.xml

@@ -1170,7 +1170,7 @@ frames per second. If less than this number of frames is to be
 captured or output, applications can request frame skipping or
 duplicating on the driver side. This is especially useful when using
 the &func-read; or &func-write;, which are not augmented by timestamps
-or sequence counters, and to avoid unneccessary data copying.</para>
+or sequence counters, and to avoid unnecessary data copying.</para>
 
     <para>Finally these ioctls can be used to determine the number of
 buffers used internally by a driver in read/write mode. For

+ 69 - 57
Documentation/DocBook/v4l/compat.xml

@@ -2332,15 +2332,26 @@ more information.</para>
 	</listitem>
       </orderedlist>
     </section>
-   </section>
+    <section>
+      <title>V4L2 in Linux 2.6.34</title>
+      <orderedlist>
+	<listitem>
+	  <para>Added
+<constant>V4L2_CID_IRIS_ABSOLUTE</constant> and
+<constant>V4L2_CID_IRIS_RELATIVE</constant> controls to the
+	    <link linkend="camera-controls">Camera controls class</link>.
+	  </para>
+	</listitem>
+      </orderedlist>
+    </section>
 
-   <section id="other">
-     <title>Relation of V4L2 to other Linux multimedia APIs</title>
+    <section id="other">
+      <title>Relation of V4L2 to other Linux multimedia APIs</title>
 
-    <section id="xvideo">
-      <title>X Video Extension</title>
+      <section id="xvideo">
+        <title>X Video Extension</title>
 
-      <para>The X Video Extension (abbreviated XVideo or just Xv) is
+        <para>The X Video Extension (abbreviated XVideo or just Xv) is
 an extension of the X Window system, implemented for example by the
 XFree86 project. Its scope is similar to V4L2, an API to video capture
 and output devices for X clients. Xv allows applications to display
@@ -2351,7 +2362,7 @@ capture or output still images in XPixmaps<footnote>
 extension available across many operating systems and
 architectures.</para>
 
-      <para>Because the driver is embedded into the X server Xv has a
+        <para>Because the driver is embedded into the X server Xv has a
 number of advantages over the V4L2 <link linkend="overlay">video
 overlay interface</link>. The driver can easily determine the overlay
 target, &ie; visible graphics memory or off-screen buffers for a
@@ -2360,16 +2371,16 @@ overlay, scaling or color-keying, or the clipping functions of the
 video capture hardware, always in sync with drawing operations or
 windows moving or changing their stacking order.</para>
 
-      <para>To combine the advantages of Xv and V4L a special Xv
+        <para>To combine the advantages of Xv and V4L a special Xv
 driver exists in XFree86 and XOrg, just programming any overlay capable
 Video4Linux device it finds. To enable it
 <filename>/etc/X11/XF86Config</filename> must contain these lines:</para>
-      <para><screen>
+        <para><screen>
 Section "Module"
     Load "v4l"
 EndSection</screen></para>
 
-      <para>As of XFree86 4.2 this driver still supports only V4L
+        <para>As of XFree86 4.2 this driver still supports only V4L
 ioctls, however it should work just fine with all V4L2 devices through
 the V4L2 backward-compatibility layer. Since V4L2 permits multiple
 opens it is possible (if supported by the V4L2 driver) to capture
@@ -2377,83 +2388,84 @@ video while an X client requested video overlay. Restrictions of
 simultaneous capturing and overlay are discussed in <xref
 	  linkend="overlay" /> apply.</para>
 
-      <para>Only marginally related to V4L2, XFree86 extended Xv to
+        <para>Only marginally related to V4L2, XFree86 extended Xv to
 support hardware YUV to RGB conversion and scaling for faster video
 playback, and added an interface to MPEG-2 decoding hardware. This API
 is useful to display images captured with V4L2 devices.</para>
-    </section>
+      </section>
 
-    <section>
-      <title>Digital Video</title>
+      <section>
+        <title>Digital Video</title>
 
-      <para>V4L2 does not support digital terrestrial, cable or
+        <para>V4L2 does not support digital terrestrial, cable or
 satellite broadcast. A separate project aiming at digital receivers
 exists. You can find its homepage at <ulink
 url="http://linuxtv.org">http://linuxtv.org</ulink>. The Linux DVB API
 has no connection to the V4L2 API except that drivers for hybrid
 hardware may support both.</para>
-    </section>
+      </section>
 
-    <section>
-      <title>Audio Interfaces</title>
+      <section>
+        <title>Audio Interfaces</title>
 
-      <para>[to do - OSS/ALSA]</para>
+        <para>[to do - OSS/ALSA]</para>
+      </section>
     </section>
-  </section>
 
-  <section id="experimental">
-    <title>Experimental API Elements</title>
+    <section id="experimental">
+      <title>Experimental API Elements</title>
 
-    <para>The following V4L2 API elements are currently experimental
+      <para>The following V4L2 API elements are currently experimental
 and may change in the future.</para>
 
-    <itemizedlist>
-      <listitem>
-	<para>Video Output Overlay (OSD) Interface, <xref
+      <itemizedlist>
+        <listitem>
+	  <para>Video Output Overlay (OSD) Interface, <xref
 	    linkend="osd" />.</para>
-      </listitem>
+        </listitem>
 	<listitem>
-	<para><constant>V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY</constant>,
+	  <para><constant>V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY</constant>,
 	&v4l2-buf-type;, <xref linkend="v4l2-buf-type" />.</para>
-      </listitem>
-      <listitem>
-	<para><constant>V4L2_CAP_VIDEO_OUTPUT_OVERLAY</constant>,
+        </listitem>
+        <listitem>
+	  <para><constant>V4L2_CAP_VIDEO_OUTPUT_OVERLAY</constant>,
 &VIDIOC-QUERYCAP; ioctl, <xref linkend="device-capabilities" />.</para>
-      </listitem>
-      <listitem>
-	<para>&VIDIOC-ENUM-FRAMESIZES; and
+        </listitem>
+        <listitem>
+	  <para>&VIDIOC-ENUM-FRAMESIZES; and
 &VIDIOC-ENUM-FRAMEINTERVALS; ioctls.</para>
-      </listitem>
-      <listitem>
-	<para>&VIDIOC-G-ENC-INDEX; ioctl.</para>
-      </listitem>
-      <listitem>
-	<para>&VIDIOC-ENCODER-CMD; and &VIDIOC-TRY-ENCODER-CMD;
+        </listitem>
+        <listitem>
+	  <para>&VIDIOC-G-ENC-INDEX; ioctl.</para>
+        </listitem>
+        <listitem>
+	  <para>&VIDIOC-ENCODER-CMD; and &VIDIOC-TRY-ENCODER-CMD;
 ioctls.</para>
-      </listitem>
-      <listitem>
-	<para>&VIDIOC-DBG-G-REGISTER; and &VIDIOC-DBG-S-REGISTER;
+        </listitem>
+        <listitem>
+	  <para>&VIDIOC-DBG-G-REGISTER; and &VIDIOC-DBG-S-REGISTER;
 ioctls.</para>
-      </listitem>
-      <listitem>
-	<para>&VIDIOC-DBG-G-CHIP-IDENT; ioctl.</para>
-      </listitem>
-    </itemizedlist>
-  </section>
+        </listitem>
+        <listitem>
+	  <para>&VIDIOC-DBG-G-CHIP-IDENT; ioctl.</para>
+        </listitem>
+      </itemizedlist>
+    </section>
 
-  <section id="obsolete">
-    <title>Obsolete API Elements</title>
+    <section id="obsolete">
+      <title>Obsolete API Elements</title>
 
-    <para>The following V4L2 API elements were superseded by new
+      <para>The following V4L2 API elements were superseded by new
 interfaces and should not be implemented in new drivers.</para>
 
-    <itemizedlist>
-      <listitem>
-	<para><constant>VIDIOC_G_MPEGCOMP</constant> and
+      <itemizedlist>
+        <listitem>
+	  <para><constant>VIDIOC_G_MPEGCOMP</constant> and
 <constant>VIDIOC_S_MPEGCOMP</constant> ioctls. Use Extended Controls,
 <xref linkend="extended-controls" />.</para>
-      </listitem>
-    </itemizedlist>
+        </listitem>
+      </itemizedlist>
+    </section>
   </section>
 
   <!--

+ 34 - 2
Documentation/DocBook/v4l/controls.xml

@@ -266,6 +266,12 @@ minimum value disables backlight compensation.</entry>
 	    <entry>boolean</entry>
 	    <entry>Chroma automatic gain control.</entry>
 	  </row>
+	  <row>
+	    <entry><constant>V4L2_CID_CHROMA_GAIN</constant></entry>
+	    <entry>integer</entry>
+	    <entry>Adjusts the Chroma gain control (for use when chroma AGC
+	    is disabled).</entry>
+	  </row>
 	  <row>
 	    <entry><constant>V4L2_CID_COLOR_KILLER</constant></entry>
 	    <entry>boolean</entry>
@@ -277,8 +283,15 @@ minimum value disables backlight compensation.</entry>
 	    <entry>Selects a color effect. Possible values for
 <constant>enum v4l2_colorfx</constant> are:
 <constant>V4L2_COLORFX_NONE</constant> (0),
-<constant>V4L2_COLORFX_BW</constant> (1) and
-<constant>V4L2_COLORFX_SEPIA</constant> (2).</entry>
+<constant>V4L2_COLORFX_BW</constant> (1),
+<constant>V4L2_COLORFX_SEPIA</constant> (2),
+<constant>V4L2_COLORFX_NEGATIVE</constant> (3),
+<constant>V4L2_COLORFX_EMBOSS</constant> (4),
+<constant>V4L2_COLORFX_SKETCH</constant> (5),
+<constant>V4L2_COLORFX_SKY_BLUE</constant> (6),
+<constant>V4L2_COLORFX_GRASS_GREEN</constant> (7),
+<constant>V4L2_COLORFX_SKIN_WHITEN</constant> (8) and
+<constant>V4L2_COLORFX_VIVID</constant> (9).</entry>
 	  </row>
 	  <row>
 	    <entry><constant>V4L2_CID_ROTATE</constant></entry>
@@ -1824,6 +1837,25 @@ wide-angle direction. The zoom speed unit is driver-specific.</entry>
 	  </row>
 	  <row><entry></entry></row>
 
+	  <row>
+	    <entry spanname="id"><constant>V4L2_CID_IRIS_ABSOLUTE</constant>&nbsp;</entry>
+	    <entry>integer</entry>
+	  </row><row><entry spanname="descr">This control sets the
+camera's aperture to the specified value. The unit is undefined.
+Larger values open the iris wider, smaller values close it.</entry>
+	  </row>
+	  <row><entry></entry></row>
+
+	  <row>
+	    <entry spanname="id"><constant>V4L2_CID_IRIS_RELATIVE</constant>&nbsp;</entry>
+	    <entry>integer</entry>
+	  </row><row><entry spanname="descr">This control modifies the
+camera's aperture by the specified amount. The unit is undefined.
+Positive values open the iris one step further, negative values close
+it one step further. This is a write-only control.</entry>
+	  </row>
+	  <row><entry></entry></row>
+
 	  <row>
 	    <entry spanname="id"><constant>V4L2_CID_PRIVACY</constant>&nbsp;</entry>
 	    <entry>boolean</entry>

+ 31 - 0
Documentation/DocBook/v4l/dev-event.xml

@@ -0,0 +1,31 @@
+  <title>Event Interface</title>
+
+  <para>The V4L2 event interface provides means for user to get
+  immediately notified on certain conditions taking place on a device.
+  This might include start of frame or loss of signal events, for
+  example.
+  </para>
+
+  <para>To receive events, the events the user is interested in first must
+  be subscribed using the &VIDIOC-SUBSCRIBE-EVENT; ioctl. Once an event is
+  subscribed, the events of subscribed types are dequeueable using the
+  &VIDIOC-DQEVENT; ioctl. Events may be unsubscribed using
+  VIDIOC_UNSUBSCRIBE_EVENT ioctl. The special event type V4L2_EVENT_ALL may
+  be used to unsubscribe all the events the driver supports.</para>
+
+  <para>The event subscriptions and event queues are specific to file
+  handles. Subscribing an event on one file handle does not affect
+  other file handles.
+  </para>
+
+  <para>The information on dequeueable events is obtained by using select or
+  poll system calls on video devices. The V4L2 events use POLLPRI events on
+  poll system call and exceptions on select system call.  </para>
+
+  <!--
+Local Variables:
+mode: sgml
+sgml-parent-document: "v4l2.sgml"
+indent-tabs-mode: nil
+End:
+  -->

+ 14 - 4
Documentation/DocBook/v4l/io.xml

@@ -701,6 +701,16 @@ buffer cannot be on both queues at the same time, the
 They can be both cleared however, then the buffer is in "dequeued"
 state, in the application domain to say so.</entry>
 	  </row>
+	  <row>
+	    <entry><constant>V4L2_BUF_FLAG_ERROR</constant></entry>
+	    <entry>0x0040</entry>
+	    <entry>When this flag is set, the buffer has been dequeued
+	    successfully, although the data might have been corrupted.
+	    This is recoverable, streaming may continue as normal and
+	    the buffer may be reused normally.
+	    Drivers set this flag when the <constant>VIDIOC_DQBUF</constant>
+	    ioctl is called.</entry>
+	  </row>
 	  <row>
 	    <entry><constant>V4L2_BUF_FLAG_KEYFRAME</constant></entry>
 	    <entry>0x0008</entry>
@@ -918,8 +928,8 @@ order</emphasis>.</para>
 
     <para>When the driver provides or accepts images field by field
 rather than interleaved, it is also important applications understand
-how the fields combine to frames. We distinguish between top and
-bottom fields, the <emphasis>spatial order</emphasis>: The first line
+how the fields combine to frames. We distinguish between top (aka odd) and
+bottom (aka even) fields, the <emphasis>spatial order</emphasis>: The first line
 of the top field is the first line of an interlaced frame, the first
 line of the bottom field is the second line of that frame.</para>
 
@@ -972,12 +982,12 @@ between <constant>V4L2_FIELD_TOP</constant> and
 	  <row>
 	    <entry><constant>V4L2_FIELD_TOP</constant></entry>
 	    <entry>2</entry>
-	    <entry>Images consist of the top field only.</entry>
+	    <entry>Images consist of the top (aka odd) field only.</entry>
 	  </row>
 	  <row>
 	    <entry><constant>V4L2_FIELD_BOTTOM</constant></entry>
 	    <entry>3</entry>
-	    <entry>Images consist of the bottom field only.
+	    <entry>Images consist of the bottom (aka even) field only.
 Applications may wish to prevent a device from capturing interlaced
 images because they will have "comb" or "feathering" artefacts around
 moving objects.</entry>

+ 12 - 0
Documentation/DocBook/v4l/pixfmt.xml

@@ -792,6 +792,18 @@ http://www.thedirks.org/winnov/</ulink></para></entry>
 	    <entry>'YYUV'</entry>
 	    <entry>unknown</entry>
 	  </row>
+	  <row id="V4L2-PIX-FMT-Y4">
+	    <entry><constant>V4L2_PIX_FMT_Y4</constant></entry>
+	    <entry>'Y04 '</entry>
+	    <entry>Old 4-bit greyscale format. Only the least significant 4 bits of each byte are used,
+the other bits are set to 0.</entry>
+	  </row>
+	  <row id="V4L2-PIX-FMT-Y6">
+	    <entry><constant>V4L2_PIX_FMT_Y6</constant></entry>
+	    <entry>'Y06 '</entry>
+	    <entry>Old 6-bit greyscale format. Only the least significant 6 bits of each byte are used,
+the other bits are set to 0.</entry>
+	  </row>
 	</tbody>
       </tgroup>
     </table>

+ 3 - 0
Documentation/DocBook/v4l/v4l2.xml

@@ -401,6 +401,7 @@ and discussions on the V4L mailing list.</revremark>
     <section id="ttx"> &sub-dev-teletext; </section>
     <section id="radio"> &sub-dev-radio; </section>
     <section id="rds"> &sub-dev-rds; </section>
+    <section id="event"> &sub-dev-event; </section>
   </chapter>
 
   <chapter id="driver">
@@ -426,6 +427,7 @@ and discussions on the V4L mailing list.</revremark>
     &sub-cropcap;
     &sub-dbg-g-chip-ident;
     &sub-dbg-g-register;
+    &sub-dqevent;
     &sub-encoder-cmd;
     &sub-enumaudio;
     &sub-enumaudioout;
@@ -467,6 +469,7 @@ and discussions on the V4L mailing list.</revremark>
     &sub-reqbufs;
     &sub-s-hw-freq-seek;
     &sub-streamon;
+    &sub-subscribe-event;
     <!-- End of ioctls. -->
     &sub-mmap;
     &sub-munmap;

+ 10 - 0
Documentation/DocBook/v4l/videodev2.h.xml

@@ -1018,6 +1018,13 @@ enum <link linkend="v4l2-colorfx">v4l2_colorfx</link> {
         V4L2_COLORFX_NONE       = 0,
         V4L2_COLORFX_BW         = 1,
         V4L2_COLORFX_SEPIA      = 2,
+        V4L2_COLORFX_NEGATIVE   = 3,
+        V4L2_COLORFX_EMBOSS     = 4,
+        V4L2_COLORFX_SKETCH     = 5,
+        V4L2_COLORFX_SKY_BLUE   = 6,
+        V4L2_COLORFX_GRASS_GREEN = 7,
+        V4L2_COLORFX_SKIN_WHITEN = 8,
+        V4L2_COLORFX_VIVID      = 9.
 };
 #define V4L2_CID_AUTOBRIGHTNESS                 (V4L2_CID_BASE+32)
 #define V4L2_CID_BAND_STOP_FILTER               (V4L2_CID_BASE+33)
@@ -1271,6 +1278,9 @@ enum  <link linkend="v4l2-exposure-auto-type">v4l2_exposure_auto_type</link> {
 
 #define V4L2_CID_PRIVACY                        (V4L2_CID_CAMERA_CLASS_BASE+16)
 
+#define V4L2_CID_IRIS_ABSOLUTE                  (V4L2_CID_CAMERA_CLASS_BASE+17)
+#define V4L2_CID_IRIS_RELATIVE                  (V4L2_CID_CAMERA_CLASS_BASE+18)
+
 /* FM Modulator class control IDs */
 #define V4L2_CID_FM_TX_CLASS_BASE               (V4L2_CTRL_CLASS_FM_TX | 0x900)
 #define V4L2_CID_FM_TX_CLASS                    (V4L2_CTRL_CLASS_FM_TX | 1)

+ 131 - 0
Documentation/DocBook/v4l/vidioc-dqevent.xml

@@ -0,0 +1,131 @@
+<refentry id="vidioc-dqevent">
+  <refmeta>
+    <refentrytitle>ioctl VIDIOC_DQEVENT</refentrytitle>
+    &manvol;
+  </refmeta>
+
+  <refnamediv>
+    <refname>VIDIOC_DQEVENT</refname>
+    <refpurpose>Dequeue event</refpurpose>
+  </refnamediv>
+
+  <refsynopsisdiv>
+    <funcsynopsis>
+      <funcprototype>
+	<funcdef>int <function>ioctl</function></funcdef>
+	<paramdef>int <parameter>fd</parameter></paramdef>
+	<paramdef>int <parameter>request</parameter></paramdef>
+	<paramdef>struct v4l2_event
+*<parameter>argp</parameter></paramdef>
+      </funcprototype>
+    </funcsynopsis>
+  </refsynopsisdiv>
+
+  <refsect1>
+    <title>Arguments</title>
+
+    <variablelist>
+      <varlistentry>
+	<term><parameter>fd</parameter></term>
+	<listitem>
+	  <para>&fd;</para>
+	</listitem>
+      </varlistentry>
+      <varlistentry>
+	<term><parameter>request</parameter></term>
+	<listitem>
+	  <para>VIDIOC_DQEVENT</para>
+	</listitem>
+      </varlistentry>
+      <varlistentry>
+	<term><parameter>argp</parameter></term>
+	<listitem>
+	  <para></para>
+	</listitem>
+      </varlistentry>
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>Description</title>
+
+    <para>Dequeue an event from a video device. No input is required
+    for this ioctl. All the fields of the &v4l2-event; structure are
+    filled by the driver. The file handle will also receive exceptions
+    which the application may get by e.g. using the select system
+    call.</para>
+
+    <table frame="none" pgwide="1" id="v4l2-event">
+      <title>struct <structname>v4l2_event</structname></title>
+      <tgroup cols="4">
+	&cs-str;
+	<tbody valign="top">
+	  <row>
+	    <entry>__u32</entry>
+	    <entry><structfield>type</structfield></entry>
+            <entry></entry>
+	    <entry>Type of the event.</entry>
+	  </row>
+	  <row>
+	    <entry>union</entry>
+	    <entry><structfield>u</structfield></entry>
+            <entry></entry>
+	    <entry></entry>
+	  </row>
+	  <row>
+	    <entry></entry>
+	    <entry>&v4l2-event-vsync;</entry>
+            <entry><structfield>vsync</structfield></entry>
+	    <entry>Event data for event V4L2_EVENT_VSYNC.
+            </entry>
+	  </row>
+	  <row>
+	    <entry></entry>
+	    <entry>__u8</entry>
+            <entry><structfield>data</structfield>[64]</entry>
+	    <entry>Event data. Defined by the event type. The union
+            should be used to define easily accessible type for
+            events.</entry>
+	  </row>
+	  <row>
+	    <entry>__u32</entry>
+	    <entry><structfield>pending</structfield></entry>
+            <entry></entry>
+	    <entry>Number of pending events excluding this one.</entry>
+	  </row>
+	  <row>
+	    <entry>__u32</entry>
+	    <entry><structfield>sequence</structfield></entry>
+            <entry></entry>
+	    <entry>Event sequence number. The sequence number is
+	    incremented for every subscribed event that takes place.
+	    If sequence numbers are not contiguous it means that
+	    events have been lost.
+	    </entry>
+	  </row>
+	  <row>
+	    <entry>struct timespec</entry>
+	    <entry><structfield>timestamp</structfield></entry>
+            <entry></entry>
+	    <entry>Event timestamp.</entry>
+	  </row>
+	  <row>
+	    <entry>__u32</entry>
+	    <entry><structfield>reserved</structfield>[9]</entry>
+            <entry></entry>
+	    <entry>Reserved for future extensions. Drivers must set
+	    the array to zero.</entry>
+	  </row>
+	</tbody>
+      </tgroup>
+    </table>
+
+  </refsect1>
+</refentry>
+<!--
+Local Variables:
+mode: sgml
+sgml-parent-document: "v4l2.sgml"
+indent-tabs-mode: nil
+End:
+-->

+ 1 - 1
Documentation/DocBook/v4l/vidioc-enuminput.xml

@@ -283,7 +283,7 @@ input/output interface to linux-media@vger.kernel.org on 19 Oct 2009.
 	    <entry>This input supports setting DV presets by using VIDIOC_S_DV_PRESET.</entry>
 	  </row>
 	  <row>
-	    <entry><constant>V4L2_OUT_CAP_CUSTOM_TIMINGS</constant></entry>
+	    <entry><constant>V4L2_IN_CAP_CUSTOM_TIMINGS</constant></entry>
 	    <entry>0x00000002</entry>
 	    <entry>This input supports setting custom video timings by using VIDIOC_S_DV_TIMINGS.</entry>
 	  </row>

+ 1 - 1
Documentation/DocBook/v4l/vidioc-g-parm.xml

@@ -55,7 +55,7 @@ captured or output, applications can request frame skipping or
 duplicating on the driver side. This is especially useful when using
 the <function>read()</function> or <function>write()</function>, which
 are not augmented by timestamps or sequence counters, and to avoid
-unneccessary data copying.</para>
+unnecessary data copying.</para>
 
     <para>Further these ioctls can be used to determine the number of
 buffers used internally by a driver in read/write mode. For

+ 12 - 2
Documentation/DocBook/v4l/vidioc-qbuf.xml

@@ -111,7 +111,11 @@ from the driver's outgoing queue. They just set the
 and <structfield>reserved</structfield>
 fields of a &v4l2-buffer; as above, when <constant>VIDIOC_DQBUF</constant>
 is called with a pointer to this structure the driver fills the
-remaining fields or returns an error code.</para>
+remaining fields or returns an error code. The driver may also set
+<constant>V4L2_BUF_FLAG_ERROR</constant> in the <structfield>flags</structfield>
+field. It indicates a non-critical (recoverable) streaming error. In such case
+the application may continue as normal, but should be aware that data in the
+dequeued buffer might be corrupted.</para>
 
     <para>By default <constant>VIDIOC_DQBUF</constant> blocks when no
 buffer is in the outgoing queue. When the
@@ -158,7 +162,13 @@ enqueue a user pointer buffer.</para>
 	  <para><constant>VIDIOC_DQBUF</constant> failed due to an
 internal error. Can also indicate temporary problems like signal
 loss. Note the driver might dequeue an (empty) buffer despite
-returning an error, or even stop capturing.</para>
+returning an error, or even stop capturing. Reusing such buffer may be unsafe
+though and its details (e.g. <structfield>index</structfield>) may not be
+returned either. It is recommended that drivers indicate recoverable errors
+by setting the <constant>V4L2_BUF_FLAG_ERROR</constant> and returning 0 instead.
+In that case the application should be able to safely reuse the buffer and
+continue streaming.
+	</para>
 	</listitem>
       </varlistentry>
     </variablelist>

+ 1 - 1
Documentation/DocBook/v4l/vidioc-queryctrl.xml

@@ -325,7 +325,7 @@ should be part of the control documentation.</entry>
 	    <entry>n/a</entry>
 	    <entry>This is not a control. When
 <constant>VIDIOC_QUERYCTRL</constant> is called with a control ID
-equal to a control class code (see <xref linkend="ctrl-class" />), the
+equal to a control class code (see <xref linkend="ctrl-class" />) + 1, the
 ioctl returns the name of the control class and this control type.
 Older drivers which do not support this feature return an
 &EINVAL;.</entry>

+ 1 - 1
Documentation/DocBook/v4l/vidioc-reqbufs.xml

@@ -61,7 +61,7 @@ fields of the <structname>v4l2_requestbuffers</structname> structure.
 They set the <structfield>type</structfield> field to the respective
 stream or buffer type, the <structfield>count</structfield> field to
 the desired number of buffers, <structfield>memory</structfield>
-must be set to the requested I/O method and the reserved array
+must be set to the requested I/O method and the <structfield>reserved</structfield> array
 must be zeroed. When the ioctl
 is called with a pointer to this structure the driver will attempt to allocate
 the requested number of buffers and it stores the actual number

+ 133 - 0
Documentation/DocBook/v4l/vidioc-subscribe-event.xml

@@ -0,0 +1,133 @@
+<refentry id="vidioc-subscribe-event">
+  <refmeta>
+    <refentrytitle>ioctl VIDIOC_SUBSCRIBE_EVENT, VIDIOC_UNSUBSCRIBE_EVENT</refentrytitle>
+    &manvol;
+  </refmeta>
+
+  <refnamediv>
+    <refname>VIDIOC_SUBSCRIBE_EVENT, VIDIOC_UNSUBSCRIBE_EVENT</refname>
+    <refpurpose>Subscribe or unsubscribe event</refpurpose>
+  </refnamediv>
+
+  <refsynopsisdiv>
+    <funcsynopsis>
+      <funcprototype>
+	<funcdef>int <function>ioctl</function></funcdef>
+	<paramdef>int <parameter>fd</parameter></paramdef>
+	<paramdef>int <parameter>request</parameter></paramdef>
+	<paramdef>struct v4l2_event_subscription
+*<parameter>argp</parameter></paramdef>
+      </funcprototype>
+    </funcsynopsis>
+  </refsynopsisdiv>
+
+  <refsect1>
+    <title>Arguments</title>
+
+    <variablelist>
+      <varlistentry>
+	<term><parameter>fd</parameter></term>
+	<listitem>
+	  <para>&fd;</para>
+	</listitem>
+      </varlistentry>
+      <varlistentry>
+	<term><parameter>request</parameter></term>
+	<listitem>
+	  <para>VIDIOC_SUBSCRIBE_EVENT, VIDIOC_UNSUBSCRIBE_EVENT</para>
+	</listitem>
+      </varlistentry>
+      <varlistentry>
+	<term><parameter>argp</parameter></term>
+	<listitem>
+	  <para></para>
+	</listitem>
+      </varlistentry>
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>Description</title>
+
+    <para>Subscribe or unsubscribe V4L2 event. Subscribed events are
+    dequeued by using the &VIDIOC-DQEVENT; ioctl.</para>
+
+    <table frame="none" pgwide="1" id="v4l2-event-subscription">
+      <title>struct <structname>v4l2_event_subscription</structname></title>
+      <tgroup cols="3">
+	&cs-str;
+	<tbody valign="top">
+	  <row>
+	    <entry>__u32</entry>
+	    <entry><structfield>type</structfield></entry>
+	    <entry>Type of the event.</entry>
+	  </row>
+	  <row>
+	    <entry>__u32</entry>
+	    <entry><structfield>reserved</structfield>[7]</entry>
+	    <entry>Reserved for future extensions. Drivers and applications
+	    must set the array to zero.</entry>
+	  </row>
+	</tbody>
+      </tgroup>
+    </table>
+
+    <table frame="none" pgwide="1" id="event-type">
+      <title>Event Types</title>
+      <tgroup cols="3">
+	&cs-def;
+	<tbody valign="top">
+	  <row>
+	    <entry><constant>V4L2_EVENT_ALL</constant></entry>
+	    <entry>0</entry>
+	    <entry>All events. V4L2_EVENT_ALL is valid only for
+	    VIDIOC_UNSUBSCRIBE_EVENT for unsubscribing all events at once.
+	    </entry>
+	  </row>
+	  <row>
+	    <entry><constant>V4L2_EVENT_VSYNC</constant></entry>
+	    <entry>1</entry>
+	    <entry>This event is triggered on the vertical sync.
+	    This event has &v4l2-event-vsync; associated with it.
+	    </entry>
+	  </row>
+	  <row>
+	    <entry><constant>V4L2_EVENT_EOS</constant></entry>
+	    <entry>2</entry>
+	    <entry>This event is triggered when the end of a stream is reached.
+	    This is typically used with MPEG decoders to report to the application
+	    when the last of the MPEG stream has been decoded.
+	    </entry>
+	  </row>
+	  <row>
+	    <entry><constant>V4L2_EVENT_PRIVATE_START</constant></entry>
+	    <entry>0x08000000</entry>
+	    <entry>Base event number for driver-private events.</entry>
+	  </row>
+	</tbody>
+      </tgroup>
+    </table>
+
+    <table frame="none" pgwide="1" id="v4l2-event-vsync">
+      <title>struct <structname>v4l2_event_vsync</structname></title>
+      <tgroup cols="3">
+	&cs-str;
+	<tbody valign="top">
+	  <row>
+	    <entry>__u8</entry>
+	    <entry><structfield>field</structfield></entry>
+	    <entry>The upcoming field. See &v4l2-field;.</entry>
+	  </row>
+	</tbody>
+      </tgroup>
+    </table>
+
+  </refsect1>
+</refentry>
+<!--
+Local Variables:
+mode: sgml
+sgml-parent-document: "v4l2.sgml"
+indent-tabs-mode: nil
+End:
+-->

+ 17 - 10
Documentation/DocBook/writing-an-alsa-driver.tmpl

@@ -5518,34 +5518,41 @@ struct _snd_pcm_runtime {
 ]]>
         </programlisting>
       </informalexample>
+
+      For the raw data, <structfield>size</structfield> field must be
+      set properly.  This specifies the maximum size of the proc file access.
     </para>
 
     <para>
-      The callback is much more complicated than the text-file
-      version. You need to use a low-level I/O functions such as
+      The read/write callbacks of raw mode are more direct than the text mode.
+      You need to use a low-level I/O functions such as
       <function>copy_from/to_user()</function> to transfer the
       data.
 
       <informalexample>
         <programlisting>
 <![CDATA[
-  static long my_file_io_read(struct snd_info_entry *entry,
+  static ssize_t my_file_io_read(struct snd_info_entry *entry,
                               void *file_private_data,
                               struct file *file,
                               char *buf,
-                              unsigned long count,
-                              unsigned long pos)
+                              size_t count,
+                              loff_t pos)
   {
-          long size = count;
-          if (pos + size > local_max_size)
-                  size = local_max_size - pos;
-          if (copy_to_user(buf, local_data + pos, size))
+          if (copy_to_user(buf, local_data + pos, count))
                   return -EFAULT;
-          return size;
+          return count;
   }
 ]]>
         </programlisting>
       </informalexample>
+
+      If the size of the info entry has been set up properly,
+      <structfield>count</structfield> and <structfield>pos</structfield> are
+      guaranteed to fit within 0 and the given size.
+      You don't have to check the range in the callbacks unless any
+      other condition is required.
+
     </para>
 
   </chapter>

+ 1 - 1
Documentation/DocBook/writing_usb_driver.tmpl

@@ -342,7 +342,7 @@ static inline void skel_delete (struct usb_skel *dev)
 {
     kfree (dev->bulk_in_buffer);
     if (dev->bulk_out_buffer != NULL)
-        usb_buffer_free (dev->udev, dev->bulk_out_size,
+        usb_free_coherent (dev->udev, dev->bulk_out_size,
             dev->bulk_out_buffer,
             dev->write_urb->transfer_dma);
     usb_free_urb (dev->write_urb);

+ 37 - 78
Documentation/HOWTO

@@ -221,8 +221,8 @@ branches.  These different branches are:
   - main 2.6.x kernel tree
   - 2.6.x.y -stable kernel tree
   - 2.6.x -git kernel patches
-  - 2.6.x -mm kernel patches
   - subsystem specific kernel trees and patches
+  - the 2.6.x -next kernel tree for integration tests
 
 2.6.x kernel tree
 -----------------
@@ -232,9 +232,9 @@ process is as follows:
   - As soon as a new kernel is released a two weeks window is open,
     during this period of time maintainers can submit big diffs to
     Linus, usually the patches that have already been included in the
-    -mm kernel for a few weeks.  The preferred way to submit big changes
+    -next kernel for a few weeks.  The preferred way to submit big changes
     is using git (the kernel's source management tool, more information
-    can be found at http://git.or.cz/) but plain patches are also just
+    can be found at http://git-scm.com/) but plain patches are also just
     fine.
   - After two weeks a -rc1 kernel is released it is now possible to push
     only patches that do not include new features that could affect the
@@ -293,84 +293,43 @@ daily and represent the current state of Linus' tree.  They are more
 experimental than -rc kernels since they are generated automatically
 without even a cursory glance to see if they are sane.
 
-2.6.x -mm kernel patches
-------------------------
-These are experimental kernel patches released by Andrew Morton.  Andrew
-takes all of the different subsystem kernel trees and patches and mushes
-them together, along with a lot of patches that have been plucked from
-the linux-kernel mailing list.  This tree serves as a proving ground for
-new features and patches.  Once a patch has proved its worth in -mm for
-a while Andrew or the subsystem maintainer pushes it on to Linus for
-inclusion in mainline.
-
-It is heavily encouraged that all new patches get tested in the -mm tree
-before they are sent to Linus for inclusion in the main kernel tree.  Code
-which does not make an appearance in -mm before the opening of the merge
-window will prove hard to merge into the mainline.
-
-These kernels are not appropriate for use on systems that are supposed
-to be stable and they are more risky to run than any of the other
-branches.
-
-If you wish to help out with the kernel development process, please test
-and use these kernel releases and provide feedback to the linux-kernel
-mailing list if you have any problems, and if everything works properly.
-
-In addition to all the other experimental patches, these kernels usually
-also contain any changes in the mainline -git kernels available at the
-time of release.
-
-The -mm kernels are not released on a fixed schedule, but usually a few
--mm kernels are released in between each -rc kernel (1 to 3 is common).
-
 Subsystem Specific kernel trees and patches
 -------------------------------------------
-A number of the different kernel subsystem developers expose their
-development trees so that others can see what is happening in the
-different areas of the kernel.  These trees are pulled into the -mm
-kernel releases as described above.
-
-Here is a list of some of the different kernel trees available:
-  git trees:
-    - Kbuild development tree, Sam Ravnborg <sam@ravnborg.org>
-	git.kernel.org:/pub/scm/linux/kernel/git/sam/kbuild.git
-
-    - ACPI development tree, Len Brown <len.brown@intel.com>
-	git.kernel.org:/pub/scm/linux/kernel/git/lenb/linux-acpi-2.6.git
-
-    - Block development tree, Jens Axboe <jens.axboe@oracle.com>
-	git.kernel.org:/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git
-
-    - DRM development tree, Dave Airlie <airlied@linux.ie>
-	git.kernel.org:/pub/scm/linux/kernel/git/airlied/drm-2.6.git
-
-    - ia64 development tree, Tony Luck <tony.luck@intel.com>
-	git.kernel.org:/pub/scm/linux/kernel/git/aegl/linux-2.6.git
-
-    - infiniband, Roland Dreier <rolandd@cisco.com>
-	git.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git
-
-    - libata, Jeff Garzik <jgarzik@pobox.com>
-	git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/libata-dev.git
-
-    - network drivers, Jeff Garzik <jgarzik@pobox.com>
-	git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/netdev-2.6.git
-
-    - pcmcia, Dominik Brodowski <linux@dominikbrodowski.net>
-	git.kernel.org:/pub/scm/linux/kernel/git/brodo/pcmcia-2.6.git
-
-    - SCSI, James Bottomley <James.Bottomley@hansenpartnership.com>
-	git.kernel.org:/pub/scm/linux/kernel/git/jejb/scsi-misc-2.6.git
-
-    - x86, Ingo Molnar <mingo@elte.hu>
-	git://git.kernel.org/pub/scm/linux/kernel/git/x86/linux-2.6-x86.git
-
-  quilt trees:
-    - USB, Driver Core, and I2C, Greg Kroah-Hartman <gregkh@suse.de>
-	kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/
+The maintainers of the various kernel subsystems --- and also many
+kernel subsystem developers --- expose their current state of
+development in source repositories.  That way, others can see what is
+happening in the different areas of the kernel.  In areas where
+development is rapid, a developer may be asked to base his submissions
+onto such a subsystem kernel tree so that conflicts between the
+submission and other already ongoing work are avoided.
+
+Most of these repositories are git trees, but there are also other SCMs
+in use, or patch queues being published as quilt series.  Addresses of
+these subsystem repositories are listed in the MAINTAINERS file.  Many
+of them can be browsed at http://git.kernel.org/.
+
+Before a proposed patch is committed to such a subsystem tree, it is
+subject to review which primarily happens on mailing lists (see the
+respective section below).  For several kernel subsystems, this review
+process is tracked with the tool patchwork.  Patchwork offers a web
+interface which shows patch postings, any comments on a patch or
+revisions to it, and maintainers can mark patches as under review,
+accepted, or rejected.  Most of these patchwork sites are listed at
+http://patchwork.kernel.org/ or http://patchwork.ozlabs.org/.
+
+2.6.x -next kernel tree for integration tests
+---------------------------------------------
+Before updates from subsystem trees are merged into the mainline 2.6.x
+tree, they need to be integration-tested.  For this purpose, a special
+testing repository exists into which virtually all subsystem trees are
+pulled on an almost daily basis:
+	http://git.kernel.org/?p=linux/kernel/git/sfr/linux-next.git
+	http://linux.f-seidel.de/linux-next/pmwiki/
+
+This way, the -next kernel gives a summary outlook onto what will be
+expected to go into the mainline kernel at the next merge period.
+Adventurous testers are very welcome to runtime-test the -next kernel.
 
-  Other kernel trees can be found listed at http://git.kernel.org/ and in
-  the MAINTAINERS file.
 
 Bug Reporting
 -------------

+ 12 - 0
Documentation/IPMI.txt

@@ -365,6 +365,7 @@ You can change this at module load time (for a module) with:
        regshifts=<shift1>,<shift2>,...
        slave_addrs=<addr1>,<addr2>,...
        force_kipmid=<enable1>,<enable2>,...
+       kipmid_max_busy_us=<ustime1>,<ustime2>,...
        unload_when_empty=[0|1]
 
 Each of these except si_trydefaults is a list, the first item for the
@@ -433,6 +434,7 @@ kernel command line as:
        ipmi_si.regshifts=<shift1>,<shift2>,...
        ipmi_si.slave_addrs=<addr1>,<addr2>,...
        ipmi_si.force_kipmid=<enable1>,<enable2>,...
+       ipmi_si.kipmid_max_busy_us=<ustime1>,<ustime2>,...
 
 It works the same as the module parameters of the same names.
 
@@ -450,6 +452,16 @@ force this thread on or off.  If you force it off and don't have
 interrupts, the driver will run VERY slowly.  Don't blame me,
 these interfaces suck.
 
+Unfortunately, this thread can use a lot of CPU depending on the
+interface's performance.  This can waste a lot of CPU and cause
+various issues with detecting idle CPU and using extra power.  To
+avoid this, the kipmid_max_busy_us sets the maximum amount of time, in
+microseconds, that kipmid will spin before sleeping for a tick.  This
+value sets a balance between performance and CPU waste and needs to be
+tuned to your needs.  Maybe, someday, auto-tuning will be added, but
+that's not a simple thing and even the auto-tuning would need to be
+tuned to the user's desired performance.
+
 The driver supports a hot add and remove of interfaces.  This way,
 interfaces can be added or removed after the kernel is up and running.
 This is done using /sys/modules/ipmi_si/parameters/hotmod, which is a

+ 2 - 2
Documentation/Makefile

@@ -1,3 +1,3 @@
 obj-m := DocBook/ accounting/ auxdisplay/ connector/ \
-	filesystems/configfs/ ia64/ networking/ \
-	pcmcia/ spi/ video4linux/ vm/ watchdog/src/
+	filesystems/ filesystems/configfs/ ia64/ laptops/ networking/ \
+	pcmcia/ spi/ timers/ video4linux/ vm/ watchdog/src/

+ 0 - 766
Documentation/PCI/PCI-DMA-mapping.txt

@@ -1,766 +0,0 @@
-			Dynamic DMA mapping
-			===================
-
-		 David S. Miller <davem@redhat.com>
-		 Richard Henderson <rth@cygnus.com>
-		  Jakub Jelinek <jakub@redhat.com>
-
-This document describes the DMA mapping system in terms of the pci_
-API.  For a similar API that works for generic devices, see
-DMA-API.txt.
-
-Most of the 64bit platforms have special hardware that translates bus
-addresses (DMA addresses) into physical addresses.  This is similar to
-how page tables and/or a TLB translates virtual addresses to physical
-addresses on a CPU.  This is needed so that e.g. PCI devices can
-access with a Single Address Cycle (32bit DMA address) any page in the
-64bit physical address space.  Previously in Linux those 64bit
-platforms had to set artificial limits on the maximum RAM size in the
-system, so that the virt_to_bus() static scheme works (the DMA address
-translation tables were simply filled on bootup to map each bus
-address to the physical page __pa(bus_to_virt())).
-
-So that Linux can use the dynamic DMA mapping, it needs some help from the
-drivers, namely it has to take into account that DMA addresses should be
-mapped only for the time they are actually used and unmapped after the DMA
-transfer.
-
-The following API will work of course even on platforms where no such
-hardware exists, see e.g. arch/x86/include/asm/pci.h for how it is implemented on
-top of the virt_to_bus interface.
-
-First of all, you should make sure
-
-#include <linux/pci.h>
-
-is in your driver. This file will obtain for you the definition of the
-dma_addr_t (which can hold any valid DMA address for the platform)
-type which should be used everywhere you hold a DMA (bus) address
-returned from the DMA mapping functions.
-
-			 What memory is DMA'able?
-
-The first piece of information you must know is what kernel memory can
-be used with the DMA mapping facilities.  There has been an unwritten
-set of rules regarding this, and this text is an attempt to finally
-write them down.
-
-If you acquired your memory via the page allocator
-(i.e. __get_free_page*()) or the generic memory allocators
-(i.e. kmalloc() or kmem_cache_alloc()) then you may DMA to/from
-that memory using the addresses returned from those routines.
-
-This means specifically that you may _not_ use the memory/addresses
-returned from vmalloc() for DMA.  It is possible to DMA to the
-_underlying_ memory mapped into a vmalloc() area, but this requires
-walking page tables to get the physical addresses, and then
-translating each of those pages back to a kernel address using
-something like __va().  [ EDIT: Update this when we integrate
-Gerd Knorr's generic code which does this. ]
-
-This rule also means that you may use neither kernel image addresses
-(items in data/text/bss segments), nor module image addresses, nor
-stack addresses for DMA.  These could all be mapped somewhere entirely
-different than the rest of physical memory.  Even if those classes of
-memory could physically work with DMA, you'd need to ensure the I/O
-buffers were cacheline-aligned.  Without that, you'd see cacheline
-sharing problems (data corruption) on CPUs with DMA-incoherent caches.
-(The CPU could write to one word, DMA would write to a different one
-in the same cache line, and one of them could be overwritten.)
-
-Also, this means that you cannot take the return of a kmap()
-call and DMA to/from that.  This is similar to vmalloc().
-
-What about block I/O and networking buffers?  The block I/O and
-networking subsystems make sure that the buffers they use are valid
-for you to DMA from/to.
-
-			DMA addressing limitations
-
-Does your device have any DMA addressing limitations?  For example, is
-your device only capable of driving the low order 24-bits of address
-on the PCI bus for SAC DMA transfers?  If so, you need to inform the
-PCI layer of this fact.
-
-By default, the kernel assumes that your device can address the full
-32-bits in a SAC cycle.  For a 64-bit DAC capable device, this needs
-to be increased.  And for a device with limitations, as discussed in
-the previous paragraph, it needs to be decreased.
-
-pci_alloc_consistent() by default will return 32-bit DMA addresses.
-PCI-X specification requires PCI-X devices to support 64-bit
-addressing (DAC) for all transactions. And at least one platform (SGI
-SN2) requires 64-bit consistent allocations to operate correctly when
-the IO bus is in PCI-X mode. Therefore, like with pci_set_dma_mask(),
-it's good practice to call pci_set_consistent_dma_mask() to set the
-appropriate mask even if your device only supports 32-bit DMA
-(default) and especially if it's a PCI-X device.
-
-For correct operation, you must interrogate the PCI layer in your
-device probe routine to see if the PCI controller on the machine can
-properly support the DMA addressing limitation your device has.  It is
-good style to do this even if your device holds the default setting,
-because this shows that you did think about these issues wrt. your
-device.
-
-The query is performed via a call to pci_set_dma_mask():
-
-	int pci_set_dma_mask(struct pci_dev *pdev, u64 device_mask);
-
-The query for consistent allocations is performed via a call to
-pci_set_consistent_dma_mask():
-
-	int pci_set_consistent_dma_mask(struct pci_dev *pdev, u64 device_mask);
-
-Here, pdev is a pointer to the PCI device struct of your device, and
-device_mask is a bit mask describing which bits of a PCI address your
-device supports.  It returns zero if your card can perform DMA
-properly on the machine given the address mask you provided.
-
-If it returns non-zero, your device cannot perform DMA properly on
-this platform, and attempting to do so will result in undefined
-behavior.  You must either use a different mask, or not use DMA.
-
-This means that in the failure case, you have three options:
-
-1) Use another DMA mask, if possible (see below).
-2) Use some non-DMA mode for data transfer, if possible.
-3) Ignore this device and do not initialize it.
-
-It is recommended that your driver print a kernel KERN_WARNING message
-when you end up performing either #2 or #3.  In this manner, if a user
-of your driver reports that performance is bad or that the device is not
-even detected, you can ask them for the kernel messages to find out
-exactly why.
-
-The standard 32-bit addressing PCI device would do something like
-this:
-
-	if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) {
-		printk(KERN_WARNING
-		       "mydev: No suitable DMA available.\n");
-		goto ignore_this_device;
-	}
-
-Another common scenario is a 64-bit capable device.  The approach
-here is to try for 64-bit DAC addressing, but back down to a
-32-bit mask should that fail.  The PCI platform code may fail the
-64-bit mask not because the platform is not capable of 64-bit
-addressing.  Rather, it may fail in this case simply because
-32-bit SAC addressing is done more efficiently than DAC addressing.
-Sparc64 is one platform which behaves in this way.
-
-Here is how you would handle a 64-bit capable device which can drive
-all 64-bits when accessing streaming DMA:
-
-	int using_dac;
-
-	if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
-		using_dac = 1;
-	} else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) {
-		using_dac = 0;
-	} else {
-		printk(KERN_WARNING
-		       "mydev: No suitable DMA available.\n");
-		goto ignore_this_device;
-	}
-
-If a card is capable of using 64-bit consistent allocations as well,
-the case would look like this:
-
-	int using_dac, consistent_using_dac;
-
-	if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
-		using_dac = 1;
-	   	consistent_using_dac = 1;
-		pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
-	} else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) {
-		using_dac = 0;
-		consistent_using_dac = 0;
-		pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
-	} else {
-		printk(KERN_WARNING
-		       "mydev: No suitable DMA available.\n");
-		goto ignore_this_device;
-	}
-
-pci_set_consistent_dma_mask() will always be able to set the same or a
-smaller mask as pci_set_dma_mask(). However for the rare case that a
-device driver only uses consistent allocations, one would have to
-check the return value from pci_set_consistent_dma_mask().
-
-Finally, if your device can only drive the low 24-bits of
-address during PCI bus mastering you might do something like:
-
-	if (pci_set_dma_mask(pdev, DMA_BIT_MASK(24))) {
-		printk(KERN_WARNING
-		       "mydev: 24-bit DMA addressing not available.\n");
-		goto ignore_this_device;
-	}
-
-When pci_set_dma_mask() is successful, and returns zero, the PCI layer
-saves away this mask you have provided.  The PCI layer will use this
-information later when you make DMA mappings.
-
-There is a case which we are aware of at this time, which is worth
-mentioning in this documentation.  If your device supports multiple
-functions (for example a sound card provides playback and record
-functions) and the various different functions have _different_
-DMA addressing limitations, you may wish to probe each mask and
-only provide the functionality which the machine can handle.  It
-is important that the last call to pci_set_dma_mask() be for the
-most specific mask.
-
-Here is pseudo-code showing how this might be done:
-
-	#define PLAYBACK_ADDRESS_BITS	DMA_BIT_MASK(32)
-	#define RECORD_ADDRESS_BITS	DMA_BIT_MASK(24)
-
-	struct my_sound_card *card;
-	struct pci_dev *pdev;
-
-	...
-	if (!pci_set_dma_mask(pdev, PLAYBACK_ADDRESS_BITS)) {
-		card->playback_enabled = 1;
-	} else {
-		card->playback_enabled = 0;
-		printk(KERN_WARNING "%s: Playback disabled due to DMA limitations.\n",
-		       card->name);
-	}
-	if (!pci_set_dma_mask(pdev, RECORD_ADDRESS_BITS)) {
-		card->record_enabled = 1;
-	} else {
-		card->record_enabled = 0;
-		printk(KERN_WARNING "%s: Record disabled due to DMA limitations.\n",
-		       card->name);
-	}
-
-A sound card was used as an example here because this genre of PCI
-devices seems to be littered with ISA chips given a PCI front end,
-and thus retaining the 16MB DMA addressing limitations of ISA.
-
-			Types of DMA mappings
-
-There are two types of DMA mappings:
-
-- Consistent DMA mappings which are usually mapped at driver
-  initialization, unmapped at the end and for which the hardware should
-  guarantee that the device and the CPU can access the data
-  in parallel and will see updates made by each other without any
-  explicit software flushing.
-
-  Think of "consistent" as "synchronous" or "coherent".
-
-  The current default is to return consistent memory in the low 32
-  bits of the PCI bus space.  However, for future compatibility you
-  should set the consistent mask even if this default is fine for your
-  driver.
-
-  Good examples of what to use consistent mappings for are:
-
-	- Network card DMA ring descriptors.
-	- SCSI adapter mailbox command data structures.
-	- Device firmware microcode executed out of
-	  main memory.
-
-  The invariant these examples all require is that any CPU store
-  to memory is immediately visible to the device, and vice
-  versa.  Consistent mappings guarantee this.
-
-  IMPORTANT: Consistent DMA memory does not preclude the usage of
-             proper memory barriers.  The CPU may reorder stores to
-	     consistent memory just as it may normal memory.  Example:
-	     if it is important for the device to see the first word
-	     of a descriptor updated before the second, you must do
-	     something like:
-
-		desc->word0 = address;
-		wmb();
-		desc->word1 = DESC_VALID;
-
-             in order to get correct behavior on all platforms.
-
-	     Also, on some platforms your driver may need to flush CPU write
-	     buffers in much the same way as it needs to flush write buffers
-	     found in PCI bridges (such as by reading a register's value
-	     after writing it).
-
-- Streaming DMA mappings which are usually mapped for one DMA transfer,
-  unmapped right after it (unless you use pci_dma_sync_* below) and for which
-  hardware can optimize for sequential accesses.
-
-  This of "streaming" as "asynchronous" or "outside the coherency
-  domain".
-
-  Good examples of what to use streaming mappings for are:
-
-	- Networking buffers transmitted/received by a device.
-	- Filesystem buffers written/read by a SCSI device.
-
-  The interfaces for using this type of mapping were designed in
-  such a way that an implementation can make whatever performance
-  optimizations the hardware allows.  To this end, when using
-  such mappings you must be explicit about what you want to happen.
-
-Neither type of DMA mapping has alignment restrictions that come
-from PCI, although some devices may have such restrictions.
-Also, systems with caches that aren't DMA-coherent will work better
-when the underlying buffers don't share cache lines with other data.
-
-
-		 Using Consistent DMA mappings.
-
-To allocate and map large (PAGE_SIZE or so) consistent DMA regions,
-you should do:
-
-	dma_addr_t dma_handle;
-
-	cpu_addr = pci_alloc_consistent(pdev, size, &dma_handle);
-
-where pdev is a struct pci_dev *. This may be called in interrupt context.
-You should use dma_alloc_coherent (see DMA-API.txt) for buses
-where devices don't have struct pci_dev (like ISA, EISA).
-
-This argument is needed because the DMA translations may be bus
-specific (and often is private to the bus which the device is attached
-to).
-
-Size is the length of the region you want to allocate, in bytes.
-
-This routine will allocate RAM for that region, so it acts similarly to
-__get_free_pages (but takes size instead of a page order).  If your
-driver needs regions sized smaller than a page, you may prefer using
-the pci_pool interface, described below.
-
-The consistent DMA mapping interfaces, for non-NULL pdev, will by
-default return a DMA address which is SAC (Single Address Cycle)
-addressable.  Even if the device indicates (via PCI dma mask) that it
-may address the upper 32-bits and thus perform DAC cycles, consistent
-allocation will only return > 32-bit PCI addresses for DMA if the
-consistent dma mask has been explicitly changed via
-pci_set_consistent_dma_mask().  This is true of the pci_pool interface
-as well.
-
-pci_alloc_consistent returns two values: the virtual address which you
-can use to access it from the CPU and dma_handle which you pass to the
-card.
-
-The cpu return address and the DMA bus master address are both
-guaranteed to be aligned to the smallest PAGE_SIZE order which
-is greater than or equal to the requested size.  This invariant
-exists (for example) to guarantee that if you allocate a chunk
-which is smaller than or equal to 64 kilobytes, the extent of the
-buffer you receive will not cross a 64K boundary.
-
-To unmap and free such a DMA region, you call:
-
-	pci_free_consistent(pdev, size, cpu_addr, dma_handle);
-
-where pdev, size are the same as in the above call and cpu_addr and
-dma_handle are the values pci_alloc_consistent returned to you.
-This function may not be called in interrupt context.
-
-If your driver needs lots of smaller memory regions, you can write
-custom code to subdivide pages returned by pci_alloc_consistent,
-or you can use the pci_pool API to do that.  A pci_pool is like
-a kmem_cache, but it uses pci_alloc_consistent not __get_free_pages.
-Also, it understands common hardware constraints for alignment,
-like queue heads needing to be aligned on N byte boundaries.
-
-Create a pci_pool like this:
-
-	struct pci_pool *pool;
-
-	pool = pci_pool_create(name, pdev, size, align, alloc);
-
-The "name" is for diagnostics (like a kmem_cache name); pdev and size
-are as above.  The device's hardware alignment requirement for this
-type of data is "align" (which is expressed in bytes, and must be a
-power of two).  If your device has no boundary crossing restrictions,
-pass 0 for alloc; passing 4096 says memory allocated from this pool
-must not cross 4KByte boundaries (but at that time it may be better to
-go for pci_alloc_consistent directly instead).
-
-Allocate memory from a pci pool like this:
-
-	cpu_addr = pci_pool_alloc(pool, flags, &dma_handle);
-
-flags are SLAB_KERNEL if blocking is permitted (not in_interrupt nor
-holding SMP locks), SLAB_ATOMIC otherwise.  Like pci_alloc_consistent,
-this returns two values, cpu_addr and dma_handle.
-
-Free memory that was allocated from a pci_pool like this:
-
-	pci_pool_free(pool, cpu_addr, dma_handle);
-
-where pool is what you passed to pci_pool_alloc, and cpu_addr and
-dma_handle are the values pci_pool_alloc returned. This function
-may be called in interrupt context.
-
-Destroy a pci_pool by calling:
-
-	pci_pool_destroy(pool);
-
-Make sure you've called pci_pool_free for all memory allocated
-from a pool before you destroy the pool. This function may not
-be called in interrupt context.
-
-			DMA Direction
-
-The interfaces described in subsequent portions of this document
-take a DMA direction argument, which is an integer and takes on
-one of the following values:
-
- PCI_DMA_BIDIRECTIONAL
- PCI_DMA_TODEVICE
- PCI_DMA_FROMDEVICE
- PCI_DMA_NONE
-
-One should provide the exact DMA direction if you know it.
-
-PCI_DMA_TODEVICE means "from main memory to the PCI device"
-PCI_DMA_FROMDEVICE means "from the PCI device to main memory"
-It is the direction in which the data moves during the DMA
-transfer.
-
-You are _strongly_ encouraged to specify this as precisely
-as you possibly can.
-
-If you absolutely cannot know the direction of the DMA transfer,
-specify PCI_DMA_BIDIRECTIONAL.  It means that the DMA can go in
-either direction.  The platform guarantees that you may legally
-specify this, and that it will work, but this may be at the
-cost of performance for example.
-
-The value PCI_DMA_NONE is to be used for debugging.  One can
-hold this in a data structure before you come to know the
-precise direction, and this will help catch cases where your
-direction tracking logic has failed to set things up properly.
-
-Another advantage of specifying this value precisely (outside of
-potential platform-specific optimizations of such) is for debugging.
-Some platforms actually have a write permission boolean which DMA
-mappings can be marked with, much like page protections in the user
-program address space.  Such platforms can and do report errors in the
-kernel logs when the PCI controller hardware detects violation of the
-permission setting.
-
-Only streaming mappings specify a direction, consistent mappings
-implicitly have a direction attribute setting of
-PCI_DMA_BIDIRECTIONAL.
-
-The SCSI subsystem tells you the direction to use in the
-'sc_data_direction' member of the SCSI command your driver is
-working on.
-
-For Networking drivers, it's a rather simple affair.  For transmit
-packets, map/unmap them with the PCI_DMA_TODEVICE direction
-specifier.  For receive packets, just the opposite, map/unmap them
-with the PCI_DMA_FROMDEVICE direction specifier.
-
-		  Using Streaming DMA mappings
-
-The streaming DMA mapping routines can be called from interrupt
-context.  There are two versions of each map/unmap, one which will
-map/unmap a single memory region, and one which will map/unmap a
-scatterlist.
-
-To map a single region, you do:
-
-	struct pci_dev *pdev = mydev->pdev;
-	dma_addr_t dma_handle;
-	void *addr = buffer->ptr;
-	size_t size = buffer->len;
-
-	dma_handle = pci_map_single(pdev, addr, size, direction);
-
-and to unmap it:
-
-	pci_unmap_single(pdev, dma_handle, size, direction);
-
-You should call pci_unmap_single when the DMA activity is finished, e.g.
-from the interrupt which told you that the DMA transfer is done.
-
-Using cpu pointers like this for single mappings has a disadvantage,
-you cannot reference HIGHMEM memory in this way.  Thus, there is a
-map/unmap interface pair akin to pci_{map,unmap}_single.  These
-interfaces deal with page/offset pairs instead of cpu pointers.
-Specifically:
-
-	struct pci_dev *pdev = mydev->pdev;
-	dma_addr_t dma_handle;
-	struct page *page = buffer->page;
-	unsigned long offset = buffer->offset;
-	size_t size = buffer->len;
-
-	dma_handle = pci_map_page(pdev, page, offset, size, direction);
-
-	...
-
-	pci_unmap_page(pdev, dma_handle, size, direction);
-
-Here, "offset" means byte offset within the given page.
-
-With scatterlists, you map a region gathered from several regions by:
-
-	int i, count = pci_map_sg(pdev, sglist, nents, direction);
-	struct scatterlist *sg;
-
-	for_each_sg(sglist, sg, count, i) {
-		hw_address[i] = sg_dma_address(sg);
-		hw_len[i] = sg_dma_len(sg);
-	}
-
-where nents is the number of entries in the sglist.
-
-The implementation is free to merge several consecutive sglist entries
-into one (e.g. if DMA mapping is done with PAGE_SIZE granularity, any
-consecutive sglist entries can be merged into one provided the first one
-ends and the second one starts on a page boundary - in fact this is a huge
-advantage for cards which either cannot do scatter-gather or have very
-limited number of scatter-gather entries) and returns the actual number
-of sg entries it mapped them to. On failure 0 is returned.
-
-Then you should loop count times (note: this can be less than nents times)
-and use sg_dma_address() and sg_dma_len() macros where you previously
-accessed sg->address and sg->length as shown above.
-
-To unmap a scatterlist, just call:
-
-	pci_unmap_sg(pdev, sglist, nents, direction);
-
-Again, make sure DMA activity has already finished.
-
-PLEASE NOTE:  The 'nents' argument to the pci_unmap_sg call must be
-              the _same_ one you passed into the pci_map_sg call,
-	      it should _NOT_ be the 'count' value _returned_ from the
-              pci_map_sg call.
-
-Every pci_map_{single,sg} call should have its pci_unmap_{single,sg}
-counterpart, because the bus address space is a shared resource (although
-in some ports the mapping is per each BUS so less devices contend for the
-same bus address space) and you could render the machine unusable by eating
-all bus addresses.
-
-If you need to use the same streaming DMA region multiple times and touch
-the data in between the DMA transfers, the buffer needs to be synced
-properly in order for the cpu and device to see the most uptodate and
-correct copy of the DMA buffer.
-
-So, firstly, just map it with pci_map_{single,sg}, and after each DMA
-transfer call either:
-
-	pci_dma_sync_single_for_cpu(pdev, dma_handle, size, direction);
-
-or:
-
-	pci_dma_sync_sg_for_cpu(pdev, sglist, nents, direction);
-
-as appropriate.
-
-Then, if you wish to let the device get at the DMA area again,
-finish accessing the data with the cpu, and then before actually
-giving the buffer to the hardware call either:
-
-	pci_dma_sync_single_for_device(pdev, dma_handle, size, direction);
-
-or:
-
-	pci_dma_sync_sg_for_device(dev, sglist, nents, direction);
-
-as appropriate.
-
-After the last DMA transfer call one of the DMA unmap routines
-pci_unmap_{single,sg}. If you don't touch the data from the first pci_map_*
-call till pci_unmap_*, then you don't have to call the pci_dma_sync_*
-routines at all.
-
-Here is pseudo code which shows a situation in which you would need
-to use the pci_dma_sync_*() interfaces.
-
-	my_card_setup_receive_buffer(struct my_card *cp, char *buffer, int len)
-	{
-		dma_addr_t mapping;
-
-		mapping = pci_map_single(cp->pdev, buffer, len, PCI_DMA_FROMDEVICE);
-
-		cp->rx_buf = buffer;
-		cp->rx_len = len;
-		cp->rx_dma = mapping;
-
-		give_rx_buf_to_card(cp);
-	}
-
-	...
-
-	my_card_interrupt_handler(int irq, void *devid, struct pt_regs *regs)
-	{
-		struct my_card *cp = devid;
-
-		...
-		if (read_card_status(cp) == RX_BUF_TRANSFERRED) {
-			struct my_card_header *hp;
-
-			/* Examine the header to see if we wish
-			 * to accept the data.  But synchronize
-			 * the DMA transfer with the CPU first
-			 * so that we see updated contents.
-			 */
-			pci_dma_sync_single_for_cpu(cp->pdev, cp->rx_dma,
-						    cp->rx_len,
-						    PCI_DMA_FROMDEVICE);
-
-			/* Now it is safe to examine the buffer. */
-			hp = (struct my_card_header *) cp->rx_buf;
-			if (header_is_ok(hp)) {
-				pci_unmap_single(cp->pdev, cp->rx_dma, cp->rx_len,
-						 PCI_DMA_FROMDEVICE);
-				pass_to_upper_layers(cp->rx_buf);
-				make_and_setup_new_rx_buf(cp);
-			} else {
-				/* Just sync the buffer and give it back
-				 * to the card.
-				 */
-				pci_dma_sync_single_for_device(cp->pdev,
-							       cp->rx_dma,
-							       cp->rx_len,
-							       PCI_DMA_FROMDEVICE);
-				give_rx_buf_to_card(cp);
-			}
-		}
-	}
-
-Drivers converted fully to this interface should not use virt_to_bus any
-longer, nor should they use bus_to_virt. Some drivers have to be changed a
-little bit, because there is no longer an equivalent to bus_to_virt in the
-dynamic DMA mapping scheme - you have to always store the DMA addresses
-returned by the pci_alloc_consistent, pci_pool_alloc, and pci_map_single
-calls (pci_map_sg stores them in the scatterlist itself if the platform
-supports dynamic DMA mapping in hardware) in your driver structures and/or
-in the card registers.
-
-All PCI drivers should be using these interfaces with no exceptions.
-It is planned to completely remove virt_to_bus() and bus_to_virt() as
-they are entirely deprecated.  Some ports already do not provide these
-as it is impossible to correctly support them.
-
-		Optimizing Unmap State Space Consumption
-
-On many platforms, pci_unmap_{single,page}() is simply a nop.
-Therefore, keeping track of the mapping address and length is a waste
-of space.  Instead of filling your drivers up with ifdefs and the like
-to "work around" this (which would defeat the whole purpose of a
-portable API) the following facilities are provided.
-
-Actually, instead of describing the macros one by one, we'll
-transform some example code.
-
-1) Use DECLARE_PCI_UNMAP_{ADDR,LEN} in state saving structures.
-   Example, before:
-
-	struct ring_state {
-		struct sk_buff *skb;
-		dma_addr_t mapping;
-		__u32 len;
-	};
-
-   after:
-
-	struct ring_state {
-		struct sk_buff *skb;
-		DECLARE_PCI_UNMAP_ADDR(mapping)
-		DECLARE_PCI_UNMAP_LEN(len)
-	};
-
-   NOTE: DO NOT put a semicolon at the end of the DECLARE_*()
-         macro.
-
-2) Use pci_unmap_{addr,len}_set to set these values.
-   Example, before:
-
-	ringp->mapping = FOO;
-	ringp->len = BAR;
-
-   after:
-
-	pci_unmap_addr_set(ringp, mapping, FOO);
-	pci_unmap_len_set(ringp, len, BAR);
-
-3) Use pci_unmap_{addr,len} to access these values.
-   Example, before:
-
-	pci_unmap_single(pdev, ringp->mapping, ringp->len,
-			 PCI_DMA_FROMDEVICE);
-
-   after:
-
-	pci_unmap_single(pdev,
-			 pci_unmap_addr(ringp, mapping),
-			 pci_unmap_len(ringp, len),
-			 PCI_DMA_FROMDEVICE);
-
-It really should be self-explanatory.  We treat the ADDR and LEN
-separately, because it is possible for an implementation to only
-need the address in order to perform the unmap operation.
-
-			Platform Issues
-
-If you are just writing drivers for Linux and do not maintain
-an architecture port for the kernel, you can safely skip down
-to "Closing".
-
-1) Struct scatterlist requirements.
-
-   Struct scatterlist must contain, at a minimum, the following
-   members:
-
-	struct page *page;
-	unsigned int offset;
-	unsigned int length;
-
-   The base address is specified by a "page+offset" pair.
-
-   Previous versions of struct scatterlist contained a "void *address"
-   field that was sometimes used instead of page+offset.  As of Linux
-   2.5., page+offset is always used, and the "address" field has been
-   deleted.
-
-2) More to come...
-
-			Handling Errors
-
-DMA address space is limited on some architectures and an allocation
-failure can be determined by:
-
-- checking if pci_alloc_consistent returns NULL or pci_map_sg returns 0
-
-- checking the returned dma_addr_t of pci_map_single and pci_map_page
-  by using pci_dma_mapping_error():
-
-	dma_addr_t dma_handle;
-
-	dma_handle = pci_map_single(pdev, addr, size, direction);
-	if (pci_dma_mapping_error(pdev, dma_handle)) {
-		/*
-		 * reduce current DMA mapping usage,
-		 * delay and try again later or
-		 * reset driver.
-		 */
-	}
-
-			   Closing
-
-This document, and the API itself, would not be in it's current
-form without the feedback and suggestions from numerous individuals.
-We would like to specifically mention, in no particular order, the
-following people:
-
-	Russell King <rmk@arm.linux.org.uk>
-	Leo Dagum <dagum@barrel.engr.sgi.com>
-	Ralf Baechle <ralf@oss.sgi.com>
-	Grant Grundler <grundler@cup.hp.com>
-	Jay Estabrook <Jay.Estabrook@compaq.com>
-	Thomas Sailer <sailer@ife.ee.ethz.ch>
-	Andrea Arcangeli <andrea@suse.de>
-	Jens Axboe <jens.axboe@oracle.com>
-	David Mosberger-Tang <davidm@hpl.hp.com>

+ 2 - 2
Documentation/PCI/pci-error-recovery.txt

@@ -216,7 +216,7 @@ The driver should return one of the following result codes:
 
 		- PCI_ERS_RESULT_NEED_RESET
 		  Driver returns this if it thinks the device is not
-		  recoverable in it's current state and it needs a slot
+		  recoverable in its current state and it needs a slot
 		  reset to proceed.
 
 		- PCI_ERS_RESULT_DISCONNECT
@@ -241,7 +241,7 @@ in working condition.
 
 The driver is not supposed to restart normal driver I/O operations
 at this point.  It should limit itself to "probing" the device to
-check it's recoverability status. If all is right, then the platform
+check its recoverability status. If all is right, then the platform
 will call resume() once all drivers have ack'd link_reset().
 
 	Result codes:

+ 22 - 17
Documentation/RCU/NMI-RCU.txt

@@ -34,7 +34,7 @@ NMI handler.
 		cpu = smp_processor_id();
 		++nmi_count(cpu);
 
-		if (!rcu_dereference(nmi_callback)(regs, cpu))
+		if (!rcu_dereference_sched(nmi_callback)(regs, cpu))
 			default_do_nmi(regs);
 
 		nmi_exit();
@@ -47,12 +47,13 @@ function pointer.  If this handler returns zero, do_nmi() invokes the
 default_do_nmi() function to handle a machine-specific NMI.  Finally,
 preemption is restored.
 
-Strictly speaking, rcu_dereference() is not needed, since this code runs
-only on i386, which does not need rcu_dereference() anyway.  However,
-it is a good documentation aid, particularly for anyone attempting to
-do something similar on Alpha.
+In theory, rcu_dereference_sched() is not needed, since this code runs
+only on i386, which in theory does not need rcu_dereference_sched()
+anyway.  However, in practice it is a good documentation aid, particularly
+for anyone attempting to do something similar on Alpha or on systems
+with aggressive optimizing compilers.
 
-Quick Quiz:  Why might the rcu_dereference() be necessary on Alpha,
+Quick Quiz:  Why might the rcu_dereference_sched() be necessary on Alpha,
 	     given that the code referenced by the pointer is read-only?
 
 
@@ -99,17 +100,21 @@ invoke irq_enter() and irq_exit() on NMI entry and exit, respectively.
 
 Answer to Quick Quiz
 
-	Why might the rcu_dereference() be necessary on Alpha, given
+	Why might the rcu_dereference_sched() be necessary on Alpha, given
 	that the code referenced by the pointer is read-only?
 
 	Answer: The caller to set_nmi_callback() might well have
-		initialized some data that is to be used by the
-		new NMI handler.  In this case, the rcu_dereference()
-		would be needed, because otherwise a CPU that received
-		an NMI just after the new handler was set might see
-		the pointer to the new NMI handler, but the old
-		pre-initialized version of the handler's data.
-
-		More important, the rcu_dereference() makes it clear
-		to someone reading the code that the pointer is being
-		protected by RCU.
+		initialized some data that is to be used by the new NMI
+		handler.  In this case, the rcu_dereference_sched() would
+		be needed, because otherwise a CPU that received an NMI
+		just after the new handler was set might see the pointer
+		to the new NMI handler, but the old pre-initialized
+		version of the handler's data.
+
+		This same sad story can happen on other CPUs when using
+		a compiler with aggressive pointer-value speculation
+		optimizations.
+
+		More important, the rcu_dereference_sched() makes it
+		clear to someone reading the code that the pointer is
+		being protected by RCU-sched.

+ 4 - 3
Documentation/RCU/checklist.txt

@@ -260,7 +260,8 @@ over a rather long period of time, but improvements are always welcome!
 	The reason that it is permissible to use RCU list-traversal
 	primitives when the update-side lock is held is that doing so
 	can be quite helpful in reducing code bloat when common code is
-	shared between readers and updaters.
+	shared between readers and updaters.  Additional primitives
+	are provided for this case, as discussed in lockdep.txt.
 
 10.	Conversely, if you are in an RCU read-side critical section,
 	and you don't hold the appropriate update-side lock, you -must-
@@ -344,8 +345,8 @@ over a rather long period of time, but improvements are always welcome!
 	requiring SRCU's read-side deadlock immunity or low read-side
 	realtime latency.
 
-	Note that, rcu_assign_pointer() and rcu_dereference() relate to
-	SRCU just as they do to other forms of RCU.
+	Note that, rcu_assign_pointer() relates to SRCU just as they do
+	to other forms of RCU.
 
 15.	The whole point of call_rcu(), synchronize_rcu(), and friends
 	is to wait until all pre-existing readers have finished before

+ 26 - 2
Documentation/RCU/lockdep.txt

@@ -32,9 +32,20 @@ checking of rcu_dereference() primitives:
 	srcu_dereference(p, sp):
 		Check for SRCU read-side critical section.
 	rcu_dereference_check(p, c):
-		Use explicit check expression "c".
+		Use explicit check expression "c".  This is useful in
+		code that is invoked by both readers and updaters.
 	rcu_dereference_raw(p)
 		Don't check.  (Use sparingly, if at all.)
+	rcu_dereference_protected(p, c):
+		Use explicit check expression "c", and omit all barriers
+		and compiler constraints.  This is useful when the data
+		structure cannot change, for example, in code that is
+		invoked only by updaters.
+	rcu_access_pointer(p):
+		Return the value of the pointer and omit all barriers,
+		but retain the compiler constraints that prevent duplicating
+		or coalescsing.  This is useful when when testing the
+		value of the pointer itself, for example, against NULL.
 
 The rcu_dereference_check() check expression can be any boolean
 expression, but would normally include one of the rcu_read_lock_held()
@@ -59,7 +70,20 @@ In case (1), the pointer is picked up in an RCU-safe manner for vanilla
 RCU read-side critical sections, in case (2) the ->file_lock prevents
 any change from taking place, and finally, in case (3) the current task
 is the only task accessing the file_struct, again preventing any change
-from taking place.
+from taking place.  If the above statement was invoked only from updater
+code, it could instead be written as follows:
+
+	file = rcu_dereference_protected(fdt->fd[fd],
+					 lockdep_is_held(&files->file_lock) ||
+					 atomic_read(&files->count) == 1);
+
+This would verify cases #2 and #3 above, and furthermore lockdep would
+complain if this was used in an RCU read-side critical section unless one
+of these two cases held.  Because rcu_dereference_protected() omits all
+barriers and compiler constraints, it generates better code than do the
+other flavors of rcu_dereference().  On the other hand, it is illegal
+to use rcu_dereference_protected() if either the RCU-protected pointer
+or the RCU-protected data that it points to can change concurrently.
 
 There are currently only "universal" versions of the rcu_assign_pointer()
 and RCU list-/tree-traversal primitives, which do not (yet) check for

+ 71 - 23
Documentation/RCU/stallwarn.txt

@@ -3,35 +3,79 @@ Using RCU's CPU Stall Detector
 The CONFIG_RCU_CPU_STALL_DETECTOR kernel config parameter enables
 RCU's CPU stall detector, which detects conditions that unduly delay
 RCU grace periods.  The stall detector's idea of what constitutes
-"unduly delayed" is controlled by a pair of C preprocessor macros:
+"unduly delayed" is controlled by a set of C preprocessor macros:
 
 RCU_SECONDS_TILL_STALL_CHECK
 
 	This macro defines the period of time that RCU will wait from
 	the beginning of a grace period until it issues an RCU CPU
-	stall warning.	It is normally ten seconds.
+	stall warning.	This time period is normally ten seconds.
 
 RCU_SECONDS_TILL_STALL_RECHECK
 
 	This macro defines the period of time that RCU will wait after
-	issuing a stall warning until it issues another stall warning.
-	It is normally set to thirty seconds.
+	issuing a stall warning until it issues another stall warning
+	for the same stall.  This time period is normally set to thirty
+	seconds.
 
 RCU_STALL_RAT_DELAY
 
-	The CPU stall detector tries to make the offending CPU rat on itself,
-	as this often gives better-quality stack traces.  However, if
-	the offending CPU does not detect its own stall in the number
-	of jiffies specified by RCU_STALL_RAT_DELAY, then other CPUs will
-	complain.  This is normally set to two jiffies.
+	The CPU stall detector tries to make the offending CPU print its
+	own warnings, as this often gives better-quality stack traces.
+	However, if the offending CPU does not detect its own stall in
+	the number of jiffies specified by RCU_STALL_RAT_DELAY, then
+	some other CPU will complain.  This delay is normally set to
+	two jiffies.
 
-The following problems can result in an RCU CPU stall warning:
+When a CPU detects that it is stalling, it will print a message similar
+to the following:
+
+INFO: rcu_sched_state detected stall on CPU 5 (t=2500 jiffies)
+
+This message indicates that CPU 5 detected that it was causing a stall,
+and that the stall was affecting RCU-sched.  This message will normally be
+followed by a stack dump of the offending CPU.  On TREE_RCU kernel builds,
+RCU and RCU-sched are implemented by the same underlying mechanism,
+while on TREE_PREEMPT_RCU kernel builds, RCU is instead implemented
+by rcu_preempt_state.
+
+On the other hand, if the offending CPU fails to print out a stall-warning
+message quickly enough, some other CPU will print a message similar to
+the following:
+
+INFO: rcu_bh_state detected stalls on CPUs/tasks: { 3 5 } (detected by 2, 2502 jiffies)
+
+This message indicates that CPU 2 detected that CPUs 3 and 5 were both
+causing stalls, and that the stall was affecting RCU-bh.  This message
+will normally be followed by stack dumps for each CPU.  Please note that
+TREE_PREEMPT_RCU builds can be stalled by tasks as well as by CPUs,
+and that the tasks will be indicated by PID, for example, "P3421".
+It is even possible for a rcu_preempt_state stall to be caused by both
+CPUs -and- tasks, in which case the offending CPUs and tasks will all
+be called out in the list.
+
+Finally, if the grace period ends just as the stall warning starts
+printing, there will be a spurious stall-warning message:
+
+INFO: rcu_bh_state detected stalls on CPUs/tasks: { } (detected by 4, 2502 jiffies)
+
+This is rare, but does happen from time to time in real life.
+
+So your kernel printed an RCU CPU stall warning.  The next question is
+"What caused it?"  The following problems can result in RCU CPU stall
+warnings:
 
 o	A CPU looping in an RCU read-side critical section.
 	
-o	A CPU looping with interrupts disabled.
+o	A CPU looping with interrupts disabled.  This condition can
+	result in RCU-sched and RCU-bh stalls.
 
-o	A CPU looping with preemption disabled.
+o	A CPU looping with preemption disabled.  This condition can
+	result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh
+	stalls.
+
+o	A CPU looping with bottom halves disabled.  This condition can
+	result in RCU-sched and RCU-bh stalls.
 
 o	For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
 	without invoking schedule().
@@ -39,20 +83,24 @@ o	For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
 o	A bug in the RCU implementation.
 
 o	A hardware failure.  This is quite unlikely, but has occurred
-	at least once in a former life.  A CPU failed in a running system,
+	at least once in real life.  A CPU failed in a running system,
 	becoming unresponsive, but not causing an immediate crash.
 	This resulted in a series of RCU CPU stall warnings, eventually
 	leading the realization that the CPU had failed.
 
-The RCU, RCU-sched, and RCU-bh implementations have CPU stall warning.
-SRCU does not do so directly, but its calls to synchronize_sched() will
-result in RCU-sched detecting any CPU stalls that might be occurring.
-
-To diagnose the cause of the stall, inspect the stack traces.  The offending
-function will usually be near the top of the stack.  If you have a series
-of stall warnings from a single extended stall, comparing the stack traces
-can often help determine where the stall is occurring, which will usually
-be in the function nearest the top of the stack that stays the same from
-trace to trace.
+The RCU, RCU-sched, and RCU-bh implementations have CPU stall
+warning.  SRCU does not have its own CPU stall warnings, but its
+calls to synchronize_sched() will result in RCU-sched detecting
+RCU-sched-related CPU stalls.  Please note that RCU only detects
+CPU stalls when there is a grace period in progress.  No grace period,
+no CPU stall warnings.
+
+To diagnose the cause of the stall, inspect the stack traces.
+The offending function will usually be near the top of the stack.
+If you have a series of stall warnings from a single extended stall,
+comparing the stack traces can often help determine where the stall
+is occurring, which will usually be in the function nearest the top of
+that portion of the stack which remains the same from trace to trace.
+If you can reliably trigger the stall, ftrace can be quite helpful.
 
 RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE.

+ 0 - 10
Documentation/RCU/torture.txt

@@ -182,16 +182,6 @@ Similarly, sched_expedited RCU provides the following:
 	sched_expedited-torture: Reader Pipe:  12660320201 95875 0 0 0 0 0 0 0 0 0
 	sched_expedited-torture: Reader Batch:  12660424885 0 0 0 0 0 0 0 0 0 0
 	sched_expedited-torture: Free-Block Circulation:  1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0
-	state: -1 / 0:0 3:0 4:0
-
-As before, the first four lines are similar to those for RCU.
-The last line shows the task-migration state.  The first number is
--1 if synchronize_sched_expedited() is idle, -2 if in the process of
-posting wakeups to the migration kthreads, and N when waiting on CPU N.
-Each of the colon-separated fields following the "/" is a CPU:state pair.
-Valid states are "0" for idle, "1" for waiting for quiescent state,
-"2" for passed through quiescent state, and "3" when a race with a
-CPU-hotplug event forces use of the synchronize_sched() primitive.
 
 
 USAGE

+ 19 - 16
Documentation/RCU/trace.txt

@@ -256,23 +256,23 @@ o	Each element of the form "1/1 0:127 ^0" represents one struct
 The output of "cat rcu/rcu_pending" looks as follows:
 
 rcu_sched:
-  0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741
-  1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792
-  2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629
-  3 np=236249 qsp=48766 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723
-  4 np=221310 qsp=46850 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110
-  5 np=237332 qsp=48449 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456
-  6 np=219995 qsp=46718 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834
-  7 np=249893 qsp=49390 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888
+  0 np=255892 qsp=53936 rpq=85 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741
+  1 np=261224 qsp=54638 rpq=33 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792
+  2 np=237496 qsp=49664 rpq=23 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629
+  3 np=236249 qsp=48766 rpq=98 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723
+  4 np=221310 qsp=46850 rpq=7 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110
+  5 np=237332 qsp=48449 rpq=9 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456
+  6 np=219995 qsp=46718 rpq=12 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834
+  7 np=249893 qsp=49390 rpq=42 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888
 rcu_bh:
-  0 np=146741 qsp=1419 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314
-  1 np=155792 qsp=12597 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180
-  2 np=136629 qsp=18680 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936
-  3 np=137723 qsp=2843 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863
-  4 np=123110 qsp=12433 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671
-  5 np=137456 qsp=4210 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235
-  6 np=120834 qsp=9902 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921
-  7 np=144888 qsp=26336 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542
+  0 np=146741 qsp=1419 rpq=6 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314
+  1 np=155792 qsp=12597 rpq=3 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180
+  2 np=136629 qsp=18680 rpq=1 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936
+  3 np=137723 qsp=2843 rpq=0 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863
+  4 np=123110 qsp=12433 rpq=0 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671
+  5 np=137456 qsp=4210 rpq=1 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235
+  6 np=120834 qsp=9902 rpq=2 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921
+  7 np=144888 qsp=26336 rpq=0 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542
 
 As always, this is once again split into "rcu_sched" and "rcu_bh"
 portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional
@@ -284,6 +284,9 @@ o	"np" is the number of times that __rcu_pending() has been invoked
 o	"qsp" is the number of times that the RCU was waiting for a
 	quiescent state from this CPU.
 
+o	"rpq" is the number of times that the CPU had passed through
+	a quiescent state, but not yet reported it to RCU.
+
 o	"cbr" is the number of times that this CPU had RCU callbacks
 	that had passed through a grace period, and were thus ready
 	to be invoked.

+ 6 - 0
Documentation/RCU/whatisRCU.txt

@@ -840,6 +840,12 @@ SRCU:	Initialization/cleanup
 	init_srcu_struct
 	cleanup_srcu_struct
 
+All:  lockdep-checked RCU-protected pointer access
+
+	rcu_dereference_check
+	rcu_dereference_protected
+	rcu_access_pointer
+
 See the comment headers in the source code (or the docbook generated
 from them) for more information.
 

+ 1 - 1
Documentation/Smack.txt

@@ -73,7 +73,7 @@ NOTE: Smack labels are limited to 23 characters. The attr command
 If you don't do anything special all users will get the floor ("_")
 label when they log in. If you do want to log in via the hacked ssh
 at other labels use the attr command to set the smack value on the
-home directory and it's contents.
+home directory and its contents.
 
 You can add access rules in /etc/smack/accesses. They take the form:
 

+ 6 - 2
Documentation/SubmitChecklist

@@ -9,10 +9,14 @@ Documentation/SubmittingPatches and elsewhere regarding submitting Linux
 kernel patches.
 
 
-1: Builds cleanly with applicable or modified CONFIG options =y, =m, and
+1: If you use a facility then #include the file that defines/declares
+   that facility.  Don't depend on other header files pulling in ones
+   that you use.
+
+2: Builds cleanly with applicable or modified CONFIG options =y, =m, and
    =n.  No gcc warnings/errors, no linker warnings/errors.
 
-2: Passes allnoconfig, allmodconfig
+2b: Passes allnoconfig, allmodconfig
 
 3: Builds on multiple CPU architectures by using local cross-compile tools
    or some other build farm.

+ 2 - 0
Documentation/arm/00-INDEX

@@ -20,6 +20,8 @@ Samsung-S3C24XX
 	- S3C24XX ARM Linux Overview
 Sharp-LH
 	- Linux on Sharp LH79524 and LH7A40X System On a Chip (SOC)
+SPEAr
+	- ST SPEAr platform Linux Overview
 VFP/
 	- Release notes for Linux Kernel Vector Floating Point support code
 empeg/

+ 1 - 1
Documentation/arm/SA1100/ADSBitsy

@@ -32,7 +32,7 @@ Notes:
 
 - The flash on board is divided into 3 partitions.
   You should be careful to use flash on board.
-  It's partition is different from GraphicsClient Plus and GraphicsMaster
+  Its partition is different from GraphicsClient Plus and GraphicsMaster
 
 - 16bpp mode requires a different cable than what ships with the board.
   Contact ADS or look through the manual to wire your own. Currently,

+ 60 - 0
Documentation/arm/SPEAr/overview.txt

@@ -0,0 +1,60 @@
+			SPEAr ARM Linux Overview
+			==========================
+
+Introduction
+------------
+
+  SPEAr (Structured Processor Enhanced Architecture).
+  weblink : http://www.st.com/spear
+
+  The ST Microelectronics SPEAr range of ARM9/CortexA9 System-on-Chip CPUs are
+  supported by the 'spear' platform of ARM Linux. Currently SPEAr300,
+  SPEAr310, SPEAr320 and SPEAr600 SOCs are supported. Support for the SPEAr13XX
+  series is in progress.
+
+  Hierarchy in SPEAr is as follows:
+
+  SPEAr (Platform)
+	- SPEAr3XX (3XX SOC series, based on ARM9)
+		- SPEAr300 (SOC)
+			- SPEAr300_EVB (Evaluation Board)
+		- SPEAr310 (SOC)
+			- SPEAr310_EVB (Evaluation Board)
+		- SPEAr320 (SOC)
+			- SPEAr320_EVB (Evaluation Board)
+	- SPEAr6XX (6XX SOC series, based on ARM9)
+		- SPEAr600 (SOC)
+			- SPEAr600_EVB (Evaluation Board)
+	- SPEAr13XX (13XX SOC series, based on ARM CORTEXA9)
+		- SPEAr1300 (SOC)
+
+  Configuration
+  -------------
+
+  A generic configuration is provided for each machine, and can be used as the
+  default by
+	make spear600_defconfig
+	make spear300_defconfig
+	make spear310_defconfig
+	make spear320_defconfig
+
+  Layout
+  ------
+
+  The common files for multiple machine families (SPEAr3XX, SPEAr6XX and
+  SPEAr13XX) are located in the platform code contained in arch/arm/plat-spear
+  with headers in plat/.
+
+  Each machine series have a directory with name arch/arm/mach-spear followed by
+  series name. Like mach-spear3xx, mach-spear6xx and mach-spear13xx.
+
+  Common file for machines of spear3xx family is mach-spear3xx/spear3xx.c and for
+  spear6xx is mach-spear6xx/spear6xx.c. mach-spear* also contain soc/machine
+  specific files, like spear300.c, spear310.c, spear320.c and spear600.c.
+  mach-spear* also contains board specific files for each machine type.
+
+
+  Document Author
+  ---------------
+
+  Viresh Kumar, (c) 2010 ST Microelectronics

+ 2 - 2
Documentation/arm/Samsung-S3C24XX/CPUfreq.txt

@@ -14,8 +14,8 @@ Introduction
  how the clocks are arranged. The first implementation used as single
  PLL to feed the ARM, memory and peripherals via a series of dividers
  and muxes and this is the implementation that is documented here. A
- newer version where there is a seperate PLL and clock divider for the
- ARM core is available as a seperate driver.
+ newer version where there is a separate PLL and clock divider for the
+ ARM core is available as a separate driver.
 
 
 Layout

+ 86 - 0
Documentation/arm/Samsung/Overview.txt

@@ -0,0 +1,86 @@
+		Samsung ARM Linux Overview
+		==========================
+
+Introduction
+------------
+
+  The Samsung range of ARM SoCs spans many similar devices, from the initial
+  ARM9 through to the newest ARM cores. This document shows an overview of
+  the current kernel support, how to use it and where to find the code
+  that supports this.
+
+  The currently supported SoCs are:
+
+  - S3C24XX: See Documentation/arm/Samsung-S3C24XX/Overview.txt for full list
+  - S3C64XX: S3C6400 and S3C6410
+  - S5PC6440
+
+  S5PC100 and S5PC110 support is currently being merged
+
+
+S3C24XX Systems
+---------------
+
+  There is still documentation in Documnetation/arm/Samsung-S3C24XX/ which
+  deals with the architecture and drivers specific to these devices.
+
+  See Documentation/arm/Samsung-S3C24XX/Overview.txt for more information
+  on the implementation details and specific support.
+
+
+Configuration
+-------------
+
+  A number of configurations are supplied, as there is no current way of
+  unifying all the SoCs into one kernel.
+
+  s5p6440_defconfig - S5P6440 specific default configuration
+  s5pc100_defconfig - S5PC100 specific default configuration
+
+
+Layout
+------
+
+  The directory layout is currently being restructured, and consists of
+  several platform directories and then the machine specific directories
+  of the CPUs being built for.
+
+  plat-samsung provides the base for all the implementations, and is the
+  last in the line of include directories that are processed for the build
+  specific information. It contains the base clock, GPIO and device definitions
+  to get the system running.
+
+  plat-s3c is the s3c24xx/s3c64xx platform directory, although it is currently
+  involved in other builds this will be phased out once the relevant code is
+  moved elsewhere.
+
+  plat-s3c24xx is for s3c24xx specific builds, see the S3C24XX docs.
+
+  plat-s3c64xx is for the s3c64xx specific bits, see the S3C24XX docs.
+
+  plat-s5p is for s5p specific builds, more to be added.
+
+
+  [ to finish ]
+
+
+Port Contributors
+-----------------
+
+  Ben Dooks (BJD)
+  Vincent Sanders
+  Herbert Potzl
+  Arnaud Patard (RTP)
+  Roc Wu
+  Klaus Fetscher
+  Dimitry Andric
+  Shannon Holland
+  Guillaume Gourat (NexVision)
+  Christer Weinigel (wingel) (Acer N30)
+  Lucas Correia Villa Real (S3C2400 port)
+
+
+Document Author
+---------------
+
+Copyright 2009-2010 Ben Dooks <ben-linux@fluff.org>

+ 167 - 0
Documentation/arm/Samsung/clksrc-change-registers.awk

@@ -0,0 +1,167 @@
+#!/usr/bin/awk -f
+#
+# Copyright 2010 Ben Dooks <ben-linux@fluff.org>
+#
+# Released under GPLv2
+
+# example usage
+# ./clksrc-change-registers.awk arch/arm/plat-s5pc1xx/include/plat/regs-clock.h < src > dst
+
+function extract_value(s)
+{
+    eqat = index(s, "=")
+    comat = index(s, ",")
+    return substr(s, eqat+2, (comat-eqat)-2)
+}
+
+function remove_brackets(b)
+{
+    return substr(b, 2, length(b)-2)
+}
+
+function splitdefine(l, p)
+{
+    r = split(l, tp)
+
+    p[0] = tp[2]
+    p[1] = remove_brackets(tp[3])
+}
+
+function find_length(f)
+{
+    if (0)
+	printf "find_length " f "\n" > "/dev/stderr"
+
+    if (f ~ /0x1/)
+	return 1
+    else if (f ~ /0x3/)
+	return 2
+    else if (f ~ /0x7/)
+	return 3
+    else if (f ~ /0xf/)
+	return 4
+
+    printf "unknown legnth " f "\n" > "/dev/stderr"
+    exit
+}
+
+function find_shift(s)
+{
+    id = index(s, "<")
+    if (id <= 0) {
+	printf "cannot find shift " s "\n" > "/dev/stderr"
+	exit
+    }
+
+    return substr(s, id+2)
+}
+
+
+BEGIN {
+    if (ARGC < 2) {
+	print "too few arguments" > "/dev/stderr"
+	exit
+    }
+
+# read the header file and find the mask values that we will need
+# to replace and create an associative array of values
+
+    while (getline line < ARGV[1] > 0) {
+	if (line ~ /\#define.*_MASK/ &&
+	    !(line ~ /S5PC100_EPLL_MASK/) &&
+	    !(line ~ /USB_SIG_MASK/)) {
+	    splitdefine(line, fields)
+	    name = fields[0]
+	    if (0)
+		printf "MASK " line "\n" > "/dev/stderr"
+	    dmask[name,0] = find_length(fields[1])
+	    dmask[name,1] = find_shift(fields[1])
+	    if (0)
+		printf "=> '" name "' LENGTH=" dmask[name,0] " SHIFT=" dmask[name,1] "\n" > "/dev/stderr"
+	} else {
+	}
+    }
+
+    delete ARGV[1]
+}
+
+/clksrc_clk.*=.*{/ {
+    shift=""
+    mask=""
+    divshift=""
+    reg_div=""
+    reg_src=""
+    indent=1
+
+    print $0
+
+    for(; indent >= 1;) {
+	if ((getline line) <= 0) {
+	    printf "unexpected end of file" > "/dev/stderr"
+	    exit 1;
+	}
+
+	if (line ~ /\.shift/) {
+	    shift = extract_value(line)
+	} else if (line ~ /\.mask/) {
+	    mask = extract_value(line)
+	} else if (line ~ /\.reg_divider/) {
+	    reg_div = extract_value(line)
+	} else if (line ~ /\.reg_source/) {
+	    reg_src = extract_value(line)
+	} else if (line ~ /\.divider_shift/) {
+	    divshift = extract_value(line)
+	} else if (line ~ /{/) {
+		indent++
+		print line
+	    } else if (line ~ /}/) {
+	    indent--
+
+	    if (indent == 0) {
+		if (0) {
+		    printf "shift '" shift   "' ='" dmask[shift,0] "'\n" > "/dev/stderr"
+		    printf "mask  '" mask    "'\n" > "/dev/stderr"
+		    printf "dshft '" divshift "'\n" > "/dev/stderr"
+		    printf "rdiv  '" reg_div "'\n" > "/dev/stderr"
+		    printf "rsrc  '" reg_src "'\n" > "/dev/stderr"
+		}
+
+		generated = mask
+		sub(reg_src, reg_div, generated)
+
+		if (0) {
+		    printf "/* rsrc " reg_src " */\n"
+		    printf "/* rdiv " reg_div " */\n"
+		    printf "/* shift " shift " */\n"
+		    printf "/* mask " mask " */\n"
+		    printf "/* generated " generated " */\n"
+		}
+
+		if (reg_div != "") {
+		    printf "\t.reg_div = { "
+		    printf ".reg = " reg_div ", "
+		    printf ".shift = " dmask[generated,1] ", "
+		    printf ".size = " dmask[generated,0] ", "
+		    printf "},\n"
+		}
+
+		printf "\t.reg_src = { "
+		printf ".reg = " reg_src ", "
+		printf ".shift = " dmask[mask,1] ", "
+		printf ".size = " dmask[mask,0] ", "
+
+		printf "},\n"
+
+	    }
+
+	    print line
+	} else {
+	    print line
+	}
+
+	if (0)
+	    printf indent ":" line "\n" > "/dev/stderr"
+    }
+}
+
+// && ! /clksrc_clk.*=.*{/ { print $0 }

+ 1 - 1
Documentation/arm/Sharp-LH/ADC-LH7-Touchscreen

@@ -7,7 +7,7 @@ The driver only implements a four-wire touch panel protocol.
 
 The touchscreen driver is maintenance free except for the pen-down or
 touch threshold.  Some resistive displays and board combinations may
-require tuning of this threshold.  The driver exposes some of it's
+require tuning of this threshold.  The driver exposes some of its
 internal state in the sys filesystem.  If the kernel is configured
 with it, CONFIG_SYSFS, and sysfs is mounted at /sys, there will be a
 directory

+ 1 - 1
Documentation/atomic_ops.txt

@@ -320,7 +320,7 @@ counter decrement would not become globally visible until the
 obj->active update does.
 
 As a historical note, 32-bit Sparc used to only allow usage of
-24-bits of it's atomic_t type.  This was because it used 8 bits
+24-bits of its atomic_t type.  This was because it used 8 bits
 as a spinlock for SMP safety.  Sparc32 lacked a "compare and swap"
 type instruction.  However, 32-bit Sparc has since been moved over
 to a "hash table of spinlocks" scheme, that allows the full 32-bit

+ 1 - 1
Documentation/blackfin/bfin-gpio-notes.txt

@@ -43,7 +43,7 @@
 	void bfin_gpio_irq_free(unsigned gpio);
 
     The request functions will record the function state for a certain pin,
-    the free functions will clear it's function state.
+    the free functions will clear its function state.
     Once a pin is requested, it can't be requested again before it is freed by
     previous caller, otherwise kernel will dump stacks, and the request
     function fail.

+ 2 - 2
Documentation/block/biodoc.txt

@@ -1162,8 +1162,8 @@ where a driver received a request ala this before:
 
 As mentioned, there is no virtual mapping of a bio. For DMA, this is
 not a problem as the driver probably never will need a virtual mapping.
-Instead it needs a bus mapping (pci_map_page for a single segment or
-use blk_rq_map_sg for scatter gather) to be able to ship it to the driver. For
+Instead it needs a bus mapping (dma_map_page for a single segment or
+use dma_map_sg for scatter gather) to be able to ship it to the driver. For
 PIO drivers (or drivers that need to revert to PIO transfer once in a
 while (IDE for example)), where the CPU is doing the actual data
 transfer a virtual mapping is needed. If the driver supports highmem I/O,

+ 3 - 3
Documentation/cachetlb.txt

@@ -5,7 +5,7 @@
 
 This document describes the cache/tlb flushing interfaces called
 by the Linux VM subsystem.  It enumerates over each interface,
-describes it's intended purpose, and what side effect is expected
+describes its intended purpose, and what side effect is expected
 after the interface is invoked.
 
 The side effects described below are stated for a uniprocessor
@@ -231,7 +231,7 @@ require a whole different set of interfaces to handle properly.
 The biggest problem is that of virtual aliasing in the data cache
 of a processor.
 
-Is your port susceptible to virtual aliasing in it's D-cache?
+Is your port susceptible to virtual aliasing in its D-cache?
 Well, if your D-cache is virtually indexed, is larger in size than
 PAGE_SIZE, and does not prevent multiple cache lines for the same
 physical address from existing at once, you have this problem.
@@ -249,7 +249,7 @@ one way to solve this (in particular SPARC_FLAG_MMAPSHARED).
 Next, you have to solve the D-cache aliasing issue for all
 other cases.  Please keep in mind that fact that, for a given page
 mapped into some user address space, there is always at least one more
-mapping, that of the kernel in it's linear mapping starting at
+mapping, that of the kernel in its linear mapping starting at
 PAGE_OFFSET.  So immediately, once the first user maps a given
 physical page into its address space, by implication the D-cache
 aliasing problem has the potential to exist since the kernel already

+ 110 - 0
Documentation/cgroups/cgroup_event_listener.c

@@ -0,0 +1,110 @@
+/*
+ * cgroup_event_listener.c - Simple listener of cgroup events
+ *
+ * Copyright (C) Kirill A. Shutemov <kirill@shutemov.name>
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/eventfd.h>
+
+#define USAGE_STR "Usage: cgroup_event_listener <path-to-control-file> <args>\n"
+
+int main(int argc, char **argv)
+{
+	int efd = -1;
+	int cfd = -1;
+	int event_control = -1;
+	char event_control_path[PATH_MAX];
+	char line[LINE_MAX];
+	int ret;
+
+	if (argc != 3) {
+		fputs(USAGE_STR, stderr);
+		return 1;
+	}
+
+	cfd = open(argv[1], O_RDONLY);
+	if (cfd == -1) {
+		fprintf(stderr, "Cannot open %s: %s\n", argv[1],
+				strerror(errno));
+		goto out;
+	}
+
+	ret = snprintf(event_control_path, PATH_MAX, "%s/cgroup.event_control",
+			dirname(argv[1]));
+	if (ret >= PATH_MAX) {
+		fputs("Path to cgroup.event_control is too long\n", stderr);
+		goto out;
+	}
+
+	event_control = open(event_control_path, O_WRONLY);
+	if (event_control == -1) {
+		fprintf(stderr, "Cannot open %s: %s\n", event_control_path,
+				strerror(errno));
+		goto out;
+	}
+
+	efd = eventfd(0, 0);
+	if (efd == -1) {
+		perror("eventfd() failed");
+		goto out;
+	}
+
+	ret = snprintf(line, LINE_MAX, "%d %d %s", efd, cfd, argv[2]);
+	if (ret >= LINE_MAX) {
+		fputs("Arguments string is too long\n", stderr);
+		goto out;
+	}
+
+	ret = write(event_control, line, strlen(line) + 1);
+	if (ret == -1) {
+		perror("Cannot write to cgroup.event_control");
+		goto out;
+	}
+
+	while (1) {
+		uint64_t result;
+
+		ret = read(efd, &result, sizeof(result));
+		if (ret == -1) {
+			if (errno == EINTR)
+				continue;
+			perror("Cannot read from eventfd");
+			break;
+		}
+		assert(ret == sizeof(result));
+
+		ret = access(event_control_path, W_OK);
+		if ((ret == -1) && (errno == ENOENT)) {
+				puts("The cgroup seems to have removed.");
+				ret = 0;
+				break;
+		}
+
+		if (ret == -1) {
+			perror("cgroup.event_control "
+					"is not accessable any more");
+			break;
+		}
+
+		printf("%s %s: crossed\n", argv[1], argv[2]);
+	}
+
+out:
+	if (efd >= 0)
+		close(efd);
+	if (event_control >= 0)
+		close(event_control);
+	if (cfd >= 0)
+		close(cfd);
+
+	return (ret != 0);
+}

+ 39 - 3
Documentation/cgroups/cgroups.txt

@@ -22,6 +22,8 @@ CONTENTS:
 2. Usage Examples and Syntax
   2.1 Basic Usage
   2.2 Attaching processes
+  2.3 Mounting hierarchies by name
+  2.4 Notification API
 3. Kernel API
   3.1 Overview
   3.2 Synchronization
@@ -233,8 +235,7 @@ containing the following files describing that cgroup:
  - cgroup.procs: list of tgids in the cgroup.  This list is not
    guaranteed to be sorted or free of duplicate tgids, and userspace
    should sort/uniquify the list if this property is required.
-   Writing a tgid into this file moves all threads with that tgid into
-   this cgroup.
+   This is a read-only file, for now.
  - notify_on_release flag: run the release agent on exit?
  - release_agent: the path to use for release notifications (this file
    exists in the top cgroup only)
@@ -434,6 +435,25 @@ you give a subsystem a name.
 The name of the subsystem appears as part of the hierarchy description
 in /proc/mounts and /proc/<pid>/cgroups.
 
+2.4 Notification API
+--------------------
+
+There is mechanism which allows to get notifications about changing
+status of a cgroup.
+
+To register new notification handler you need:
+ - create a file descriptor for event notification using eventfd(2);
+ - open a control file to be monitored (e.g. memory.usage_in_bytes);
+ - write "<event_fd> <control_fd> <args>" to cgroup.event_control.
+   Interpretation of args is defined by control file implementation;
+
+eventfd will be woken up by control file implementation or when the
+cgroup is removed.
+
+To unregister notification handler just close eventfd.
+
+NOTE: Support of notifications should be implemented for the control
+file. See documentation for the subsystem.
 
 3. Kernel API
 =============
@@ -488,6 +508,11 @@ Each subsystem should:
 - add an entry in linux/cgroup_subsys.h
 - define a cgroup_subsys object called <name>_subsys
 
+If a subsystem can be compiled as a module, it should also have in its
+module initcall a call to cgroup_load_subsys(), and in its exitcall a
+call to cgroup_unload_subsys(). It should also set its_subsys.module =
+THIS_MODULE in its .c file.
+
 Each subsystem may export the following methods. The only mandatory
 methods are create/destroy. Any others that are null are presumed to
 be successful no-ops.
@@ -536,10 +561,21 @@ returns an error, this will abort the attach operation.  If a NULL
 task is passed, then a successful result indicates that *any*
 unspecified task can be moved into the cgroup. Note that this isn't
 called on a fork. If this method returns 0 (success) then this should
-remain valid while the caller holds cgroup_mutex. If threadgroup is
+remain valid while the caller holds cgroup_mutex and it is ensured that either
+attach() or cancel_attach() will be called in future. If threadgroup is
 true, then a successful result indicates that all threads in the given
 thread's threadgroup can be moved together.
 
+void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+	       struct task_struct *task, bool threadgroup)
+(cgroup_mutex held by caller)
+
+Called when a task attach operation has failed after can_attach() has succeeded.
+A subsystem whose can_attach() has some side-effects should provide this
+function, so that the subsystem can implement a rollback. If not, not necessary.
+This will be called only about subsystems whose can_attach() operation have
+succeeded.
+
 void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 	    struct cgroup *old_cgrp, struct task_struct *task,
 	    bool threadgroup)

+ 82 - 79
Documentation/cgroups/cpusets.txt

@@ -42,7 +42,7 @@ Nodes to a set of tasks.   In this document "Memory Node" refers to
 an on-line node that contains memory.
 
 Cpusets constrain the CPU and Memory placement of tasks to only
-the resources within a tasks current cpuset.  They form a nested
+the resources within a task's current cpuset.  They form a nested
 hierarchy visible in a virtual file system.  These are the essential
 hooks, beyond what is already present, required to manage dynamic
 job placement on large systems.
@@ -53,11 +53,11 @@ Documentation/cgroups/cgroups.txt.
 Requests by a task, using the sched_setaffinity(2) system call to
 include CPUs in its CPU affinity mask, and using the mbind(2) and
 set_mempolicy(2) system calls to include Memory Nodes in its memory
-policy, are both filtered through that tasks cpuset, filtering out any
+policy, are both filtered through that task's cpuset, filtering out any
 CPUs or Memory Nodes not in that cpuset.  The scheduler will not
 schedule a task on a CPU that is not allowed in its cpus_allowed
 vector, and the kernel page allocator will not allocate a page on a
-node that is not allowed in the requesting tasks mems_allowed vector.
+node that is not allowed in the requesting task's mems_allowed vector.
 
 User level code may create and destroy cpusets by name in the cgroup
 virtual file system, manage the attributes and permissions of these
@@ -121,9 +121,9 @@ Cpusets extends these two mechanisms as follows:
  - Each task in the system is attached to a cpuset, via a pointer
    in the task structure to a reference counted cgroup structure.
  - Calls to sched_setaffinity are filtered to just those CPUs
-   allowed in that tasks cpuset.
+   allowed in that task's cpuset.
  - Calls to mbind and set_mempolicy are filtered to just
-   those Memory Nodes allowed in that tasks cpuset.
+   those Memory Nodes allowed in that task's cpuset.
  - The root cpuset contains all the systems CPUs and Memory
    Nodes.
  - For any cpuset, one can define child cpusets containing a subset
@@ -141,11 +141,11 @@ into the rest of the kernel, none in performance critical paths:
  - in init/main.c, to initialize the root cpuset at system boot.
  - in fork and exit, to attach and detach a task from its cpuset.
  - in sched_setaffinity, to mask the requested CPUs by what's
-   allowed in that tasks cpuset.
+   allowed in that task's cpuset.
  - in sched.c migrate_live_tasks(), to keep migrating tasks within
    the CPUs allowed by their cpuset, if possible.
  - in the mbind and set_mempolicy system calls, to mask the requested
-   Memory Nodes by what's allowed in that tasks cpuset.
+   Memory Nodes by what's allowed in that task's cpuset.
  - in page_alloc.c, to restrict memory to allowed nodes.
  - in vmscan.c, to restrict page recovery to the current cpuset.
 
@@ -155,7 +155,7 @@ new system calls are added for cpusets - all support for querying and
 modifying cpusets is via this cpuset file system.
 
 The /proc/<pid>/status file for each task has four added lines,
-displaying the tasks cpus_allowed (on which CPUs it may be scheduled)
+displaying the task's cpus_allowed (on which CPUs it may be scheduled)
 and mems_allowed (on which Memory Nodes it may obtain memory),
 in the two formats seen in the following example:
 
@@ -168,20 +168,20 @@ Each cpuset is represented by a directory in the cgroup file system
 containing (on top of the standard cgroup files) the following
 files describing that cpuset:
 
- - cpus: list of CPUs in that cpuset
- - mems: list of Memory Nodes in that cpuset
- - memory_migrate flag: if set, move pages to cpusets nodes
- - cpu_exclusive flag: is cpu placement exclusive?
- - mem_exclusive flag: is memory placement exclusive?
- - mem_hardwall flag:  is memory allocation hardwalled
- - memory_pressure: measure of how much paging pressure in cpuset
- - memory_spread_page flag: if set, spread page cache evenly on allowed nodes
- - memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
- - sched_load_balance flag: if set, load balance within CPUs on that cpuset
- - sched_relax_domain_level: the searching range when migrating tasks
+ - cpuset.cpus: list of CPUs in that cpuset
+ - cpuset.mems: list of Memory Nodes in that cpuset
+ - cpuset.memory_migrate flag: if set, move pages to cpusets nodes
+ - cpuset.cpu_exclusive flag: is cpu placement exclusive?
+ - cpuset.mem_exclusive flag: is memory placement exclusive?
+ - cpuset.mem_hardwall flag:  is memory allocation hardwalled
+ - cpuset.memory_pressure: measure of how much paging pressure in cpuset
+ - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes
+ - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
+ - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset
+ - cpuset.sched_relax_domain_level: the searching range when migrating tasks
 
 In addition, the root cpuset only has the following file:
- - memory_pressure_enabled flag: compute memory_pressure?
+ - cpuset.memory_pressure_enabled flag: compute memory_pressure?
 
 New cpusets are created using the mkdir system call or shell
 command.  The properties of a cpuset, such as its flags, allowed
@@ -229,7 +229,7 @@ If a cpuset is cpu or mem exclusive, no other cpuset, other than
 a direct ancestor or descendant, may share any of the same CPUs or
 Memory Nodes.
 
-A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled",
+A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled",
 i.e. it restricts kernel allocations for page, buffer and other data
 commonly shared by the kernel across multiple users.  All cpusets,
 whether hardwalled or not, restrict allocations of memory for user
@@ -304,15 +304,15 @@ times 1000.
 ---------------------------
 There are two boolean flag files per cpuset that control where the
 kernel allocates pages for the file system buffers and related in
-kernel data structures.  They are called 'memory_spread_page' and
-'memory_spread_slab'.
+kernel data structures.  They are called 'cpuset.memory_spread_page' and
+'cpuset.memory_spread_slab'.
 
-If the per-cpuset boolean flag file 'memory_spread_page' is set, then
+If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then
 the kernel will spread the file system buffers (page cache) evenly
 over all the nodes that the faulting task is allowed to use, instead
 of preferring to put those pages on the node where the task is running.
 
-If the per-cpuset boolean flag file 'memory_spread_slab' is set,
+If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set,
 then the kernel will spread some file system related slab caches,
 such as for inodes and dentries evenly over all the nodes that the
 faulting task is allowed to use, instead of preferring to put those
@@ -323,41 +323,41 @@ stack segment pages of a task.
 
 By default, both kinds of memory spreading are off, and memory
 pages are allocated on the node local to where the task is running,
-except perhaps as modified by the tasks NUMA mempolicy or cpuset
+except perhaps as modified by the task's NUMA mempolicy or cpuset
 configuration, so long as sufficient free memory pages are available.
 
 When new cpusets are created, they inherit the memory spread settings
 of their parent.
 
 Setting memory spreading causes allocations for the affected page
-or slab caches to ignore the tasks NUMA mempolicy and be spread
+or slab caches to ignore the task's NUMA mempolicy and be spread
 instead.    Tasks using mbind() or set_mempolicy() calls to set NUMA
 mempolicies will not notice any change in these calls as a result of
-their containing tasks memory spread settings.  If memory spreading
+their containing task's memory spread settings.  If memory spreading
 is turned off, then the currently specified NUMA mempolicy once again
 applies to memory page allocations.
 
-Both 'memory_spread_page' and 'memory_spread_slab' are boolean flag
+Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag
 files.  By default they contain "0", meaning that the feature is off
 for that cpuset.  If a "1" is written to that file, then that turns
 the named feature on.
 
 The implementation is simple.
 
-Setting the flag 'memory_spread_page' turns on a per-process flag
+Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag
 PF_SPREAD_PAGE for each task that is in that cpuset or subsequently
 joins that cpuset.  The page allocation calls for the page cache
 is modified to perform an inline check for this PF_SPREAD_PAGE task
 flag, and if set, a call to a new routine cpuset_mem_spread_node()
 returns the node to prefer for the allocation.
 
-Similarly, setting 'memory_spread_slab' turns on the flag
+Similarly, setting 'cpuset.memory_spread_slab' turns on the flag
 PF_SPREAD_SLAB, and appropriately marked slab caches will allocate
 pages from the node returned by cpuset_mem_spread_node().
 
 The cpuset_mem_spread_node() routine is also simple.  It uses the
 value of a per-task rotor cpuset_mem_spread_rotor to select the next
-node in the current tasks mems_allowed to prefer for the allocation.
+node in the current task's mems_allowed to prefer for the allocation.
 
 This memory placement policy is also known (in other contexts) as
 round-robin or interleave.
@@ -404,24 +404,24 @@ the following two situations:
     system overhead on those CPUs, including avoiding task load
     balancing if that is not needed.
 
-When the per-cpuset flag "sched_load_balance" is enabled (the default
-setting), it requests that all the CPUs in that cpusets allowed 'cpus'
+When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default
+setting), it requests that all the CPUs in that cpusets allowed 'cpuset.cpus'
 be contained in a single sched domain, ensuring that load balancing
 can move a task (not otherwised pinned, as by sched_setaffinity)
 from any CPU in that cpuset to any other.
 
-When the per-cpuset flag "sched_load_balance" is disabled, then the
+When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the
 scheduler will avoid load balancing across the CPUs in that cpuset,
 --except-- in so far as is necessary because some overlapping cpuset
 has "sched_load_balance" enabled.
 
-So, for example, if the top cpuset has the flag "sched_load_balance"
+So, for example, if the top cpuset has the flag "cpuset.sched_load_balance"
 enabled, then the scheduler will have one sched domain covering all
-CPUs, and the setting of the "sched_load_balance" flag in any other
+CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other
 cpusets won't matter, as we're already fully load balancing.
 
 Therefore in the above two situations, the top cpuset flag
-"sched_load_balance" should be disabled, and only some of the smaller,
+"cpuset.sched_load_balance" should be disabled, and only some of the smaller,
 child cpusets have this flag enabled.
 
 When doing this, you don't usually want to leave any unpinned tasks in
@@ -433,7 +433,7 @@ scheduler might not consider the possibility of load balancing that
 task to that underused CPU.
 
 Of course, tasks pinned to a particular CPU can be left in a cpuset
-that disables "sched_load_balance" as those tasks aren't going anywhere
+that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere
 else anyway.
 
 There is an impedance mismatch here, between cpusets and sched domains.
@@ -443,19 +443,19 @@ overlap and each CPU is in at most one sched domain.
 It is necessary for sched domains to be flat because load balancing
 across partially overlapping sets of CPUs would risk unstable dynamics
 that would be beyond our understanding.  So if each of two partially
-overlapping cpusets enables the flag 'sched_load_balance', then we
+overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we
 form a single sched domain that is a superset of both.  We won't move
 a task to a CPU outside it cpuset, but the scheduler load balancing
 code might waste some compute cycles considering that possibility.
 
 This mismatch is why there is not a simple one-to-one relation
-between which cpusets have the flag "sched_load_balance" enabled,
+between which cpusets have the flag "cpuset.sched_load_balance" enabled,
 and the sched domain configuration.  If a cpuset enables the flag, it
 will get balancing across all its CPUs, but if it disables the flag,
 it will only be assured of no load balancing if no other overlapping
 cpuset enables the flag.
 
-If two cpusets have partially overlapping 'cpus' allowed, and only
+If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only
 one of them has this flag enabled, then the other may find its
 tasks only partially load balanced, just on the overlapping CPUs.
 This is just the general case of the top_cpuset example given a few
@@ -468,23 +468,23 @@ load balancing to the other CPUs.
 1.7.1 sched_load_balance implementation details.
 ------------------------------------------------
 
-The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary
+The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary
 to most cpuset flags.)  When enabled for a cpuset, the kernel will
 ensure that it can load balance across all the CPUs in that cpuset
 (makes sure that all the CPUs in the cpus_allowed of that cpuset are
 in the same sched domain.)
 
-If two overlapping cpusets both have 'sched_load_balance' enabled,
+If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled,
 then they will be (must be) both in the same sched domain.
 
-If, as is the default, the top cpuset has 'sched_load_balance' enabled,
+If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled,
 then by the above that means there is a single sched domain covering
 the whole system, regardless of any other cpuset settings.
 
 The kernel commits to user space that it will avoid load balancing
 where it can.  It will pick as fine a granularity partition of sched
 domains as it can while still providing load balancing for any set
-of CPUs allowed to a cpuset having 'sched_load_balance' enabled.
+of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled.
 
 The internal kernel cpuset to scheduler interface passes from the
 cpuset code to the scheduler code a partition of the load balanced
@@ -495,9 +495,9 @@ all the CPUs that must be load balanced.
 The cpuset code builds a new such partition and passes it to the
 scheduler sched domain setup code, to have the sched domains rebuilt
 as necessary, whenever:
- - the 'sched_load_balance' flag of a cpuset with non-empty CPUs changes,
+ - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes,
  - or CPUs come or go from a cpuset with this flag enabled,
- - or 'sched_relax_domain_level' value of a cpuset with non-empty CPUs
+ - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs
    and with this flag enabled changes,
  - or a cpuset with non-empty CPUs and with this flag enabled is removed,
  - or a cpu is offlined/onlined.
@@ -542,7 +542,7 @@ As the result, task B on CPU X need to wait task A or wait load balance
 on the next tick.  For some applications in special situation, waiting
 1 tick may be too long.
 
-The 'sched_relax_domain_level' file allows you to request changing
+The 'cpuset.sched_relax_domain_level' file allows you to request changing
 this searching range as you like.  This file takes int value which
 indicates size of searching range in levels ideally as follows,
 otherwise initial value -1 that indicates the cpuset has no request.
@@ -559,8 +559,8 @@ The system default is architecture dependent.  The system default
 can be changed using the relax_domain_level= boot parameter.
 
 This file is per-cpuset and affect the sched domain where the cpuset
-belongs to.  Therefore if the flag 'sched_load_balance' of a cpuset
-is disabled, then 'sched_relax_domain_level' have no effect since
+belongs to.  Therefore if the flag 'cpuset.sched_load_balance' of a cpuset
+is disabled, then 'cpuset.sched_relax_domain_level' have no effect since
 there is no sched domain belonging the cpuset.
 
 If multiple cpusets are overlapping and hence they form a single sched
@@ -594,7 +594,7 @@ is attached, is subtle.
 If a cpuset has its Memory Nodes modified, then for each task attached
 to that cpuset, the next time that the kernel attempts to allocate
 a page of memory for that task, the kernel will notice the change
-in the tasks cpuset, and update its per-task memory placement to
+in the task's cpuset, and update its per-task memory placement to
 remain within the new cpusets memory placement.  If the task was using
 mempolicy MPOL_BIND, and the nodes to which it was bound overlap with
 its new cpuset, then the task will continue to use whatever subset
@@ -603,13 +603,13 @@ was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed
 in the new cpuset, then the task will be essentially treated as if it
 was MPOL_BIND bound to the new cpuset (even though its NUMA placement,
 as queried by get_mempolicy(), doesn't change).  If a task is moved
-from one cpuset to another, then the kernel will adjust the tasks
+from one cpuset to another, then the kernel will adjust the task's
 memory placement, as above, the next time that the kernel attempts
 to allocate a page of memory for that task.
 
-If a cpuset has its 'cpus' modified, then each task in that cpuset
+If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset
 will have its allowed CPU placement changed immediately.  Similarly,
-if a tasks pid is written to another cpusets 'tasks' file, then its
+if a task's pid is written to another cpusets 'cpuset.tasks' file, then its
 allowed CPU placement is changed immediately.  If such a task had been
 bound to some subset of its cpuset using the sched_setaffinity() call,
 the task will be allowed to run on any CPU allowed in its new cpuset,
@@ -622,21 +622,21 @@ and the processor placement is updated immediately.
 Normally, once a page is allocated (given a physical page
 of main memory) then that page stays on whatever node it
 was allocated, so long as it remains allocated, even if the
-cpusets memory placement policy 'mems' subsequently changes.
-If the cpuset flag file 'memory_migrate' is set true, then when
+cpusets memory placement policy 'cpuset.mems' subsequently changes.
+If the cpuset flag file 'cpuset.memory_migrate' is set true, then when
 tasks are attached to that cpuset, any pages that task had
 allocated to it on nodes in its previous cpuset are migrated
-to the tasks new cpuset. The relative placement of the page within
+to the task's new cpuset. The relative placement of the page within
 the cpuset is preserved during these migration operations if possible.
 For example if the page was on the second valid node of the prior cpuset
 then the page will be placed on the second valid node of the new cpuset.
 
-Also if 'memory_migrate' is set true, then if that cpusets
-'mems' file is modified, pages allocated to tasks in that
-cpuset, that were on nodes in the previous setting of 'mems',
+Also if 'cpuset.memory_migrate' is set true, then if that cpuset's
+'cpuset.mems' file is modified, pages allocated to tasks in that
+cpuset, that were on nodes in the previous setting of 'cpuset.mems',
 will be moved to nodes in the new setting of 'mems.'
-Pages that were not in the tasks prior cpuset, or in the cpusets
-prior 'mems' setting, will not be moved.
+Pages that were not in the task's prior cpuset, or in the cpuset's
+prior 'cpuset.mems' setting, will not be moved.
 
 There is an exception to the above.  If hotplug functionality is used
 to remove all the CPUs that are currently assigned to a cpuset,
@@ -655,7 +655,7 @@ There is a second exception to the above.  GFP_ATOMIC requests are
 kernel internal allocations that must be satisfied, immediately.
 The kernel may drop some request, in rare cases even panic, if a
 GFP_ATOMIC alloc fails.  If the request cannot be satisfied within
-the current tasks cpuset, then we relax the cpuset, and look for
+the current task's cpuset, then we relax the cpuset, and look for
 memory anywhere we can find it.  It's better to violate the cpuset
 than stress the kernel.
 
@@ -678,8 +678,8 @@ and then start a subshell 'sh' in that cpuset:
   cd /dev/cpuset
   mkdir Charlie
   cd Charlie
-  /bin/echo 2-3 > cpus
-  /bin/echo 1 > mems
+  /bin/echo 2-3 > cpuset.cpus
+  /bin/echo 1 > cpuset.mems
   /bin/echo $$ > tasks
   sh
   # The subshell 'sh' is now running in cpuset Charlie
@@ -725,10 +725,13 @@ Now you want to do something with this cpuset.
 
 In this directory you can find several files:
 # ls
-cpu_exclusive  memory_migrate      mems                      tasks
-cpus           memory_pressure     notify_on_release
-mem_exclusive  memory_spread_page  sched_load_balance
-mem_hardwall   memory_spread_slab  sched_relax_domain_level
+cpuset.cpu_exclusive       cpuset.memory_spread_slab
+cpuset.cpus                cpuset.mems
+cpuset.mem_exclusive       cpuset.sched_load_balance
+cpuset.mem_hardwall        cpuset.sched_relax_domain_level
+cpuset.memory_migrate      notify_on_release
+cpuset.memory_pressure     tasks
+cpuset.memory_spread_page
 
 Reading them will give you information about the state of this cpuset:
 the CPUs and Memory Nodes it can use, the processes that are using
@@ -736,13 +739,13 @@ it, its properties.  By writing to these files you can manipulate
 the cpuset.
 
 Set some flags:
-# /bin/echo 1 > cpu_exclusive
+# /bin/echo 1 > cpuset.cpu_exclusive
 
 Add some cpus:
-# /bin/echo 0-7 > cpus
+# /bin/echo 0-7 > cpuset.cpus
 
 Add some mems:
-# /bin/echo 0-7 > mems
+# /bin/echo 0-7 > cpuset.mems
 
 Now attach your shell to this cpuset:
 # /bin/echo $$ > tasks
@@ -774,28 +777,28 @@ echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent
 This is the syntax to use when writing in the cpus or mems files
 in cpuset directories:
 
-# /bin/echo 1-4 > cpus		-> set cpus list to cpus 1,2,3,4
-# /bin/echo 1,2,3,4 > cpus	-> set cpus list to cpus 1,2,3,4
+# /bin/echo 1-4 > cpuset.cpus		-> set cpus list to cpus 1,2,3,4
+# /bin/echo 1,2,3,4 > cpuset.cpus	-> set cpus list to cpus 1,2,3,4
 
 To add a CPU to a cpuset, write the new list of CPUs including the
 CPU to be added. To add 6 to the above cpuset:
 
-# /bin/echo 1-4,6 > cpus	-> set cpus list to cpus 1,2,3,4,6
+# /bin/echo 1-4,6 > cpuset.cpus	-> set cpus list to cpus 1,2,3,4,6
 
 Similarly to remove a CPU from a cpuset, write the new list of CPUs
 without the CPU to be removed.
 
 To remove all the CPUs:
 
-# /bin/echo "" > cpus		-> clear cpus list
+# /bin/echo "" > cpuset.cpus		-> clear cpus list
 
 2.3 Setting flags
 -----------------
 
 The syntax is very simple:
 
-# /bin/echo 1 > cpu_exclusive 	-> set flag 'cpu_exclusive'
-# /bin/echo 0 > cpu_exclusive 	-> unset flag 'cpu_exclusive'
+# /bin/echo 1 > cpuset.cpu_exclusive 	-> set flag 'cpuset.cpu_exclusive'
+# /bin/echo 0 > cpuset.cpu_exclusive 	-> unset flag 'cpuset.cpu_exclusive'
 
 2.4 Attaching processes
 -----------------------

+ 44 - 5
Documentation/cgroups/memcg_test.txt

@@ -1,6 +1,6 @@
 Memory Resource Controller(Memcg)  Implementation Memo.
-Last Updated: 2009/1/20
-Base Kernel Version: based on 2.6.29-rc2.
+Last Updated: 2010/2
+Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34).
 
 Because VM is getting complex (one of reasons is memcg...), memcg's behavior
 is complex. This is a document for memcg's internal behavior.
@@ -244,7 +244,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
 	  we have to check if OLDPAGE/NEWPAGE is a valid page after commit().
 
 8. LRU
-        Each memcg has its own private LRU. Now, it's handling is under global
+        Each memcg has its own private LRU. Now, its handling is under global
 	VM's control (means that it's handled under global zone->lru_lock).
 	Almost all routines around memcg's LRU is called by global LRU's
 	list management functions under zone->lru_lock().
@@ -337,7 +337,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
 	race and lock dependency with other cgroup subsystems.
 
 	example)
-	# mount -t cgroup none /cgroup -t cpuset,memory,cpu,devices
+	# mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices
 
 	and do task move, mkdir, rmdir etc...under this.
 
@@ -348,7 +348,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
 
 	For example, test like following is good.
 	(Shell-A)
-	# mount -t cgroup none /cgroup -t memory
+	# mount -t cgroup none /cgroup -o memory
 	# mkdir /cgroup/test
 	# echo 40M > /cgroup/test/memory.limit_in_bytes
 	# echo 0 > /cgroup/test/tasks
@@ -378,3 +378,42 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
 	#echo 50M > memory.limit_in_bytes
 	#echo 50M > memory.memsw.limit_in_bytes
 	run 51M of malloc
+
+ 9.9 Move charges at task migration
+	Charges associated with a task can be moved along with task migration.
+
+	(Shell-A)
+	#mkdir /cgroup/A
+	#echo $$ >/cgroup/A/tasks
+	run some programs which uses some amount of memory in /cgroup/A.
+
+	(Shell-B)
+	#mkdir /cgroup/B
+	#echo 1 >/cgroup/B/memory.move_charge_at_immigrate
+	#echo "pid of the program running in group A" >/cgroup/B/tasks
+
+	You can see charges have been moved by reading *.usage_in_bytes or
+	memory.stat of both A and B.
+	See 8.2 of Documentation/cgroups/memory.txt to see what value should be
+	written to move_charge_at_immigrate.
+
+ 9.10 Memory thresholds
+	Memory controler implements memory thresholds using cgroups notification
+	API. You can use Documentation/cgroups/cgroup_event_listener.c to test
+	it.
+
+	(Shell-A) Create cgroup and run event listener
+	# mkdir /cgroup/A
+	# ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M
+
+	(Shell-B) Add task to cgroup and try to allocate and free memory
+	# echo $$ >/cgroup/A/tasks
+	# a="$(dd if=/dev/zero bs=1M count=10)"
+	# a=
+
+	You will see message from cgroup_event_listener every time you cross
+	the thresholds.
+
+	Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds.
+
+	It's good idea to test root cgroup as well.

+ 79 - 5
Documentation/cgroups/memory.txt

@@ -182,6 +182,8 @@ list.
 NOTE: Reclaim does not work for the root cgroup, since we cannot set any
 limits on the root cgroup.
 
+Note2: When panic_on_oom is set to "2", the whole system will panic.
+
 2. Locking
 
 The memory controller uses the following hierarchy
@@ -261,11 +263,13 @@ some of the pages cached in the cgroup (page cache pages).
 
 4.2 Task migration
 
-When a task migrates from one cgroup to another, it's charge is not
-carried forward. The pages allocated from the original cgroup still
+When a task migrates from one cgroup to another, its charge is not
+carried forward by default. The pages allocated from the original cgroup still
 remain charged to it, the charge is dropped when the page is freed or
 reclaimed.
 
+Note: You can move charges of a task along with task migration. See 8.
+
 4.3 Removing a cgroup
 
 A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
@@ -336,7 +340,7 @@ Note:
 5.3 swappiness
   Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only.
 
-  Following cgroups' swapiness can't be changed.
+  Following cgroups' swappiness can't be changed.
   - root cgroup (uses /proc/sys/vm/swappiness).
   - a cgroup which uses hierarchy and it has child cgroup.
   - a cgroup which uses hierarchy and not the root of hierarchy.
@@ -377,7 +381,8 @@ The feature can be disabled by
 NOTE1: Enabling/disabling will fail if the cgroup already has other
 cgroups created below it.
 
-NOTE2: This feature can be enabled/disabled per subtree.
+NOTE2: When panic_on_oom is set to "2", the whole system will panic in
+case of an oom event in any cgroup.
 
 7. Soft limits
 
@@ -414,7 +419,76 @@ NOTE1: Soft limits take effect over a long period of time, since they involve
 NOTE2: It is recommended to set the soft limit always below the hard limit,
        otherwise the hard limit will take precedence.
 
-8. TODO
+8. Move charges at task migration
+
+Users can move charges associated with a task along with task migration, that
+is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
+This feature is not supported in !CONFIG_MMU environments because of lack of
+page tables.
+
+8.1 Interface
+
+This feature is disabled by default. It can be enabled(and disabled again) by
+writing to memory.move_charge_at_immigrate of the destination cgroup.
+
+If you want to enable it:
+
+# echo (some positive value) > memory.move_charge_at_immigrate
+
+Note: Each bits of move_charge_at_immigrate has its own meaning about what type
+      of charges should be moved. See 8.2 for details.
+Note: Charges are moved only when you move mm->owner, IOW, a leader of a thread
+      group.
+Note: If we cannot find enough space for the task in the destination cgroup, we
+      try to make space by reclaiming memory. Task migration may fail if we
+      cannot make enough space.
+Note: It can take several seconds if you move charges in giga bytes order.
+
+And if you want disable it again:
+
+# echo 0 > memory.move_charge_at_immigrate
+
+8.2 Type of charges which can be move
+
+Each bits of move_charge_at_immigrate has its own meaning about what type of
+charges should be moved.
+
+  bit | what type of charges would be moved ?
+ -----+------------------------------------------------------------------------
+   0  | A charge of an anonymous page(or swap of it) used by the target task.
+      | Those pages and swaps must be used only by the target task. You must
+      | enable Swap Extension(see 2.4) to enable move of swap charges.
+
+Note: Those pages and swaps must be charged to the old cgroup.
+Note: More type of pages(e.g. file cache, shmem,) will be supported by other
+      bits in future.
+
+8.3 TODO
+
+- Add support for other types of pages(e.g. file cache, shmem, etc.).
+- Implement madvise(2) to let users decide the vma to be moved or not to be
+  moved.
+- All of moving charge operations are done under cgroup_mutex. It's not good
+  behavior to hold the mutex too long, so we may need some trick.
+
+9. Memory thresholds
+
+Memory controler implements memory thresholds using cgroups notification
+API (see cgroups.txt). It allows to register multiple memory and memsw
+thresholds and gets notifications when it crosses.
+
+To register a threshold application need:
+ - create an eventfd using eventfd(2);
+ - open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
+ - write string like "<event_fd> <memory.usage_in_bytes> <threshold>" to
+   cgroup.event_control.
+
+Application will be notified through eventfd when memory usage crosses
+threshold in any direction.
+
+It's applicable for root and non-root cgroup.
+
+10. TODO
 
 1. Add support for accounting huge pages (as a separate controller)
 2. Make per-cgroup scanner reclaim not-shared pages first

+ 234 - 0
Documentation/circular-buffers.txt

@@ -0,0 +1,234 @@
+			       ================
+			       CIRCULAR BUFFERS
+			       ================
+
+By: David Howells <dhowells@redhat.com>
+    Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+
+
+Linux provides a number of features that can be used to implement circular
+buffering.  There are two sets of such features:
+
+ (1) Convenience functions for determining information about power-of-2 sized
+     buffers.
+
+ (2) Memory barriers for when the producer and the consumer of objects in the
+     buffer don't want to share a lock.
+
+To use these facilities, as discussed below, there needs to be just one
+producer and just one consumer.  It is possible to handle multiple producers by
+serialising them, and to handle multiple consumers by serialising them.
+
+
+Contents:
+
+ (*) What is a circular buffer?
+
+ (*) Measuring power-of-2 buffers.
+
+ (*) Using memory barriers with circular buffers.
+     - The producer.
+     - The consumer.
+
+
+==========================
+WHAT IS A CIRCULAR BUFFER?
+==========================
+
+First of all, what is a circular buffer?  A circular buffer is a buffer of
+fixed, finite size into which there are two indices:
+
+ (1) A 'head' index - the point at which the producer inserts items into the
+     buffer.
+
+ (2) A 'tail' index - the point at which the consumer finds the next item in
+     the buffer.
+
+Typically when the tail pointer is equal to the head pointer, the buffer is
+empty; and the buffer is full when the head pointer is one less than the tail
+pointer.
+
+The head index is incremented when items are added, and the tail index when
+items are removed.  The tail index should never jump the head index, and both
+indices should be wrapped to 0 when they reach the end of the buffer, thus
+allowing an infinite amount of data to flow through the buffer.
+
+Typically, items will all be of the same unit size, but this isn't strictly
+required to use the techniques below.  The indices can be increased by more
+than 1 if multiple items or variable-sized items are to be included in the
+buffer, provided that neither index overtakes the other.  The implementer must
+be careful, however, as a region more than one unit in size may wrap the end of
+the buffer and be broken into two segments.
+
+
+============================
+MEASURING POWER-OF-2 BUFFERS
+============================
+
+Calculation of the occupancy or the remaining capacity of an arbitrarily sized
+circular buffer would normally be a slow operation, requiring the use of a
+modulus (divide) instruction.  However, if the buffer is of a power-of-2 size,
+then a much quicker bitwise-AND instruction can be used instead.
+
+Linux provides a set of macros for handling power-of-2 circular buffers.  These
+can be made use of by:
+
+	#include <linux/circ_buf.h>
+
+The macros are:
+
+ (*) Measure the remaining capacity of a buffer:
+
+	CIRC_SPACE(head_index, tail_index, buffer_size);
+
+     This returns the amount of space left in the buffer[1] into which items
+     can be inserted.
+
+
+ (*) Measure the maximum consecutive immediate space in a buffer:
+
+	CIRC_SPACE_TO_END(head_index, tail_index, buffer_size);
+
+     This returns the amount of consecutive space left in the buffer[1] into
+     which items can be immediately inserted without having to wrap back to the
+     beginning of the buffer.
+
+
+ (*) Measure the occupancy of a buffer:
+
+	CIRC_CNT(head_index, tail_index, buffer_size);
+
+     This returns the number of items currently occupying a buffer[2].
+
+
+ (*) Measure the non-wrapping occupancy of a buffer:
+
+	CIRC_CNT_TO_END(head_index, tail_index, buffer_size);
+
+     This returns the number of consecutive items[2] that can be extracted from
+     the buffer without having to wrap back to the beginning of the buffer.
+
+
+Each of these macros will nominally return a value between 0 and buffer_size-1,
+however:
+
+ [1] CIRC_SPACE*() are intended to be used in the producer.  To the producer
+     they will return a lower bound as the producer controls the head index,
+     but the consumer may still be depleting the buffer on another CPU and
+     moving the tail index.
+
+     To the consumer it will show an upper bound as the producer may be busy
+     depleting the space.
+
+ [2] CIRC_CNT*() are intended to be used in the consumer.  To the consumer they
+     will return a lower bound as the consumer controls the tail index, but the
+     producer may still be filling the buffer on another CPU and moving the
+     head index.
+
+     To the producer it will show an upper bound as the consumer may be busy
+     emptying the buffer.
+
+ [3] To a third party, the order in which the writes to the indices by the
+     producer and consumer become visible cannot be guaranteed as they are
+     independent and may be made on different CPUs - so the result in such a
+     situation will merely be a guess, and may even be negative.
+
+
+===========================================
+USING MEMORY BARRIERS WITH CIRCULAR BUFFERS
+===========================================
+
+By using memory barriers in conjunction with circular buffers, you can avoid
+the need to:
+
+ (1) use a single lock to govern access to both ends of the buffer, thus
+     allowing the buffer to be filled and emptied at the same time; and
+
+ (2) use atomic counter operations.
+
+There are two sides to this: the producer that fills the buffer, and the
+consumer that empties it.  Only one thing should be filling a buffer at any one
+time, and only one thing should be emptying a buffer at any one time, but the
+two sides can operate simultaneously.
+
+
+THE PRODUCER
+------------
+
+The producer will look something like this:
+
+	spin_lock(&producer_lock);
+
+	unsigned long head = buffer->head;
+	unsigned long tail = ACCESS_ONCE(buffer->tail);
+
+	if (CIRC_SPACE(head, tail, buffer->size) >= 1) {
+		/* insert one item into the buffer */
+		struct item *item = buffer[head];
+
+		produce_item(item);
+
+		smp_wmb(); /* commit the item before incrementing the head */
+
+		buffer->head = (head + 1) & (buffer->size - 1);
+
+		/* wake_up() will make sure that the head is committed before
+		 * waking anyone up */
+		wake_up(consumer);
+	}
+
+	spin_unlock(&producer_lock);
+
+This will instruct the CPU that the contents of the new item must be written
+before the head index makes it available to the consumer and then instructs the
+CPU that the revised head index must be written before the consumer is woken.
+
+Note that wake_up() doesn't have to be the exact mechanism used, but whatever
+is used must guarantee a (write) memory barrier between the update of the head
+index and the change of state of the consumer, if a change of state occurs.
+
+
+THE CONSUMER
+------------
+
+The consumer will look something like this:
+
+	spin_lock(&consumer_lock);
+
+	unsigned long head = ACCESS_ONCE(buffer->head);
+	unsigned long tail = buffer->tail;
+
+	if (CIRC_CNT(head, tail, buffer->size) >= 1) {
+		/* read index before reading contents at that index */
+		smp_read_barrier_depends();
+
+		/* extract one item from the buffer */
+		struct item *item = buffer[tail];
+
+		consume_item(item);
+
+		smp_mb(); /* finish reading descriptor before incrementing tail */
+
+		buffer->tail = (tail + 1) & (buffer->size - 1);
+	}
+
+	spin_unlock(&consumer_lock);
+
+This will instruct the CPU to make sure the index is up to date before reading
+the new item, and then it shall make sure the CPU has finished reading the item
+before it writes the new tail pointer, which will erase the item.
+
+
+Note the use of ACCESS_ONCE() in both algorithms to read the opposition index.
+This prevents the compiler from discarding and reloading its cached value -
+which some compilers will do across smp_read_barrier_depends().  This isn't
+strictly needed if you can be sure that the opposition index will _only_ be
+used the once.
+
+
+===============
+FURTHER READING
+===============
+
+See also Documentation/memory-barriers.txt for a description of Linux's memory
+barrier facilities.

+ 1 - 0
Documentation/connector/cn_test.c

@@ -25,6 +25,7 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/skbuff.h>
+#include <linux/slab.h>
 #include <linux/timer.h>
 
 #include <linux/connector.h>

+ 1 - 1
Documentation/connector/connector.txt

@@ -88,7 +88,7 @@ int cn_netlink_send(struct cn_msg *msg, u32 __groups, int gfp_mask);
  int gfp_mask			- GFP mask.
 
  Note: When registering new callback user, connector core assigns
- netlink group to the user which is equal to it's id.idx.
+ netlink group to the user which is equal to its id.idx.
 
 /*****************************************/
 Protocol description.

+ 1 - 1
Documentation/console/console.txt

@@ -74,7 +74,7 @@ driver takes over the consoles vacated by the driver. Binding, on the other
 hand, will bind the driver to the consoles that are currently occupied by a
 system driver.
 
-NOTE1: Binding and binding must be selected in Kconfig. It's under:
+NOTE1: Binding and unbinding must be selected in Kconfig. It's under:
 
 Device Drivers -> Character devices -> Support for binding and unbinding
 console drivers

+ 5 - 9
Documentation/credentials.txt

@@ -408,9 +408,6 @@ This should be used inside the RCU read lock, as in the following example:
 		...
 	}
 
-A function need not get RCU read lock to use __task_cred() if it is holding a
-spinlock at the time as this implicitly holds the RCU read lock.
-
 Should it be necessary to hold another task's credentials for a long period of
 time, and possibly to sleep whilst doing so, then the caller should get a
 reference on them using:
@@ -426,17 +423,16 @@ credentials, hiding the RCU magic from the caller:
 	uid_t task_uid(task)		Task's real UID
 	uid_t task_euid(task)		Task's effective UID
 
-If the caller is holding a spinlock or the RCU read lock at the time anyway,
-then:
+If the caller is holding the RCU read lock at the time anyway, then:
 
 	__task_cred(task)->uid
 	__task_cred(task)->euid
 
 should be used instead.  Similarly, if multiple aspects of a task's credentials
-need to be accessed, RCU read lock or a spinlock should be used, __task_cred()
-called, the result stored in a temporary pointer and then the credential
-aspects called from that before dropping the lock.  This prevents the
-potentially expensive RCU magic from being invoked multiple times.
+need to be accessed, RCU read lock should be used, __task_cred() called, the
+result stored in a temporary pointer and then the credential aspects called
+from that before dropping the lock.  This prevents the potentially expensive
+RCU magic from being invoked multiple times.
 
 Should some other single aspect of another task's credentials need to be
 accessed, then this can be used:

+ 1 - 1
Documentation/driver-model/platform.txt

@@ -192,7 +192,7 @@ command line. This will execute all matching early_param() callbacks.
 User specified early platform devices will be registered at this point.
 For the early serial console case the user can specify port on the
 kernel command line as "earlyprintk=serial.0" where "earlyprintk" is
-the class string, "serial" is the name of the platfrom driver and
+the class string, "serial" is the name of the platform driver and
 0 is the platform device id. If the id is -1 then the dot and the
 id can be omitted.
 

+ 1 - 1
Documentation/dvb/ci.txt

@@ -41,7 +41,7 @@ This application requires the following to function properly as of now.
 
 * Cards that fall in this category
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-At present the cards that fall in this category are the Twinhan and it's
+At present the cards that fall in this category are the Twinhan and its
 clones, these cards are available as VVMER, Tomato, Hercules, Orange and
 so on.
 

+ 1 - 1
Documentation/dvb/contributors.txt

@@ -1,7 +1,7 @@
 Thanks go to the following people for patches and contributions:
 
 Michael Hunold <m.hunold@gmx.de>
-  for the initial saa7146 driver and it's recent overhaul
+  for the initial saa7146 driver and its recent overhaul
 
 Christian Theiss
   for his work on the initial Linux DVB driver

+ 1 - 1
Documentation/eisa.txt

@@ -171,7 +171,7 @@ device.
 virtual_root.force_probe :
 
 Force the probing code to probe EISA slots even when it cannot find an
-EISA compliant mainboard (nothing appears on slot 0). Defaultd to 0
+EISA compliant mainboard (nothing appears on slot 0). Defaults to 0
 (don't force), and set to 1 (force probing) when either
 CONFIG_ALPHA_JENSEN or CONFIG_EISA_VLB_PRIMING are set.
 

+ 9 - 21
Documentation/email-clients.txt

@@ -216,26 +216,14 @@ Works.  Use "Insert file..." or external editor.
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Gmail (Web GUI)
 
-If you just have to use Gmail to send patches, it CAN be made to work.  It
-requires a bit of external help, though.
-
-The first problem is that Gmail converts tabs to spaces.  This will
-totally break your patches.  To prevent this, you have to use a different
-editor.  There is a firefox extension called "ViewSourceWith"
-(https://addons.mozilla.org/en-US/firefox/addon/394) which allows you to
-edit any text box in the editor of your choice.  Configure it to launch
-your favorite editor.  When you want to send a patch, use this technique.
-Once you have crafted your messsage + patch, save and exit the editor,
-which should reload the Gmail edit box.  GMAIL WILL PRESERVE THE TABS.
-Hoorah.  Apparently you can cut-n-paste literal tabs, but Gmail will
-convert those to spaces upon sending!
-
-The second problem is that Gmail converts tabs to spaces on replies.  If
-you reply to a patch, don't expect to be able to apply it as a patch.
-
-The last problem is that Gmail will base64-encode any message that has a
-non-ASCII character.  That includes things like European names.  Be aware.
-
-Gmail is not convenient for lkml patches, but CAN be made to work.
+Does not work for sending patches.
+
+Gmail web client converts tabs to spaces automatically.
+
+At the same time it wraps lines every 78 chars with CRLF style line breaks
+although tab2space problem can be solved with external editor.
+
+Another problem is that Gmail will base64-encode any message that has a
+non-ASCII character. That includes things like European names.
 
                                 ###

+ 31 - 0
Documentation/fb/efifb.txt

@@ -0,0 +1,31 @@
+
+What is efifb?
+===============
+
+This is a generic EFI platform driver for Intel based Apple computers.
+efifb is only for EFI booted Intel Macs.
+
+Supported Hardware
+==================
+
+iMac 17"/20"
+Macbook
+Macbook Pro 15"/17"
+MacMini
+
+How to use it?
+==============
+
+efifb does not have any kind of autodetection of your machine.
+You have to add the following kernel parameters in your elilo.conf:
+	Macbook :
+		video=efifb:macbook
+	MacMini :
+		video=efifb:mini
+	Macbook Pro 15", iMac 17" :
+		video=efifb:i17
+	Macbook Pro 17", iMac 20" :
+		video=efifb:i20
+
+--
+Edgar Hucek <gimli@dark-green.com>

+ 0 - 31
Documentation/fb/imacfb.txt

@@ -1,31 +0,0 @@
-
-What is imacfb?
-===============
-
-This is a generic EFI platform driver for Intel based Apple computers.
-Imacfb is only for EFI booted Intel Macs.
-
-Supported Hardware
-==================
-
-iMac 17"/20"
-Macbook
-Macbook Pro 15"/17"
-MacMini
-
-How to use it?
-==============
-
-Imacfb does not have any kind of autodetection of your machine.
-You have to add the following kernel parameters in your elilo.conf:
-	Macbook :
-		video=imacfb:macbook
-	MacMini :
-		video=imacfb:mini
-	Macbook Pro 15", iMac 17" :
-		video=imacfb:i17
-	Macbook Pro 17", iMac 20" :
-		video=imacfb:i20
-
---
-Edgar Hucek <gimli@dark-green.com>

+ 91 - 27
Documentation/feature-removal-schedule.txt

@@ -241,16 +241,6 @@ Who:	Thomas Gleixner <tglx@linutronix.de>
 
 ---------------------------
 
-What (Why):
-	- xt_recent: the old ipt_recent proc dir
-	  (superseded by /proc/net/xt_recent)
-
-When:	January 2009 or Linux 2.7.0, whichever comes first
-Why:	Superseded by newer revisions or modules
-Who:	Jan Engelhardt <jengelh@computergmbh.de>
-
----------------------------
-
 What:	GPIO autorequest on gpio_direction_{input,output}() in gpiolib
 When:	February 2010
 Why:	All callers should use explicit gpio_request()/gpio_free().
@@ -520,26 +510,21 @@ Who:	Hans de Goede <hdegoede@redhat.com>
 
 ----------------------------
 
-What:	corgikbd, spitzkbd, tosakbd driver
-When:	2.6.35
-Files:	drivers/input/keyboard/{corgi,spitz,tosa}kbd.c
-Why:	We now have a generic GPIO based matrix keyboard driver that
-	are fully capable of handling all the keys on these devices.
-	The original drivers manipulate the GPIO registers directly
-	and so are difficult to maintain.
-Who:	Eric Miao <eric.y.miao@gmail.com>
+What:	sysfs-class-rfkill state file
+When:	Feb 2014
+Files:	net/rfkill/core.c
+Why: 	Documented as obsolete since Feb 2010. This file is limited to 3
+	states while the rfkill drivers can have 4 states.
+Who: 	anybody or Florian Mickler <florian@mickler.org>
 
 ----------------------------
 
-What:	corgi_ssp and corgi_ts driver
-When:	2.6.35
-Files:	arch/arm/mach-pxa/corgi_ssp.c, drivers/input/touchscreen/corgi_ts.c
-Why:	The corgi touchscreen is now deprecated in favour of the generic
-	ads7846.c driver. The noise reduction technique used in corgi_ts.c,
-	that's to wait till vsync before ADC sampling, is also integrated into
-	ads7846 driver now. Provided that the original driver is not generic
-	and is difficult to maintain, it will be removed later.
-Who:	Eric Miao <eric.y.miao@gmail.com>
+What: 	sysfs-class-rfkill claim file
+When:	Feb 2012
+Files:	net/rfkill/core.c
+Why:	It is not possible to claim an rfkill driver since 2007. This is
+	Documented as obsolete since Feb 2010.
+Who: 	anybody or Florian Mickler <florian@mickler.org>
 
 ----------------------------
 
@@ -564,6 +549,16 @@ Who:	Avi Kivity <avi@redhat.com>
 
 ----------------------------
 
+What:	xtime, wall_to_monotonic
+When:	2.6.36+
+Files:	kernel/time/timekeeping.c include/linux/time.h
+Why:	Cleaning up timekeeping internal values. Please use
+	existing timekeeping accessor functions to access
+	the equivalent functionality.
+Who:	John Stultz <johnstul@us.ibm.com>
+
+----------------------------
+
 What:	KVM kernel-allocated memory slots
 When:	July 2010
 Why:	Since 2.6.25, kvm supports user-allocated memory slots, which are
@@ -582,3 +577,72 @@ Why:	The paravirt mmu host support is slower than non-paravirt mmu, both
 Who:	Avi Kivity <avi@redhat.com>
 
 ----------------------------
+
+What: 	"acpi=ht" boot option
+When:	2.6.35
+Why:	Useful in 2003, implementation is a hack.
+	Generally invoked by accident today.
+	Seen as doing more harm than good.
+Who:	Len Brown <len.brown@intel.com>
+
+----------------------------
+
+What:	iwlwifi 50XX module parameters
+When:	2.6.40
+Why:	The "..50" modules parameters were used to configure 5000 series and
+	up devices; different set of module parameters also available for 4965
+	with same functionalities. Consolidate both set into single place
+	in drivers/net/wireless/iwlwifi/iwl-agn.c
+
+Who:	Wey-Yi Guy <wey-yi.w.guy@intel.com>
+
+----------------------------
+
+What:	iwl4965 alias support
+When:	2.6.40
+Why:	Internal alias support has been present in module-init-tools for some
+	time, the MODULE_ALIAS("iwl4965") boilerplate aliases can be removed
+	with no impact.
+
+Who:	Wey-Yi Guy <wey-yi.w.guy@intel.com>
+
+---------------------------
+
+What:	xt_NOTRACK
+Files:	net/netfilter/xt_NOTRACK.c
+When:	April 2011
+Why:	Superseded by xt_CT
+Who:	Netfilter developer team <netfilter-devel@vger.kernel.org>
+
+---------------------------
+
+What:	video4linux /dev/vtx teletext API support
+When:	2.6.35
+Files:	drivers/media/video/saa5246a.c drivers/media/video/saa5249.c
+	include/linux/videotext.h
+Why:	The vtx device nodes have been superseded by vbi device nodes
+	for many years. No applications exist that use the vtx support.
+	Of the two i2c drivers that actually support this API the saa5249
+	has been impossible to use for a year now and no known hardware
+	that supports this device exists. The saa5246a is theoretically
+	supported by the old mxb boards, but it never actually worked.
+
+	In summary: there is no hardware that can use this API and there
+	are no applications actually implementing this API.
+
+	The vtx support still reserves minors 192-223 and we would really
+	like to reuse those for upcoming new functionality. In the unlikely
+	event that new hardware appears that wants to use the functionality
+	provided by the vtx API, then that functionality should be build
+	around the sliced VBI API instead.
+Who:	Hans Verkuil <hverkuil@xs4all.nl>
+
+----------------------------
+
+What:	IRQF_DISABLED
+When:	2.6.36
+Why:	The flag is a NOOP as we run interrupt handlers with interrupts disabled
+Who:	Thomas Gleixner <tglx@linutronix.de>
+
+----------------------------
+

+ 4 - 0
Documentation/filesystems/00-INDEX

@@ -16,6 +16,8 @@ befs.txt
 	- information about the BeOS filesystem for Linux.
 bfs.txt
 	- info for the SCO UnixWare Boot Filesystem (BFS).
+ceph.txt
+	- info for the Ceph Distributed File System
 cifs.txt
 	- description of the CIFS filesystem.
 coda.txt
@@ -32,6 +34,8 @@ dlmfs.txt
 	- info on the userspace interface to the OCFS2 DLM.
 dnotify.txt
 	- info about directory notification in Linux.
+dnotify_test.c
+	- example program for dnotify
 ecryptfs.txt
 	- docs on eCryptfs: stacked cryptographic filesystem for Linux.
 exofs.txt

+ 16 - 2
Documentation/filesystems/9p.txt

@@ -37,6 +37,15 @@ For Plan 9 From User Space applications (http://swtch.com/plan9)
 
 	mount -t 9p `namespace`/acme /mnt/9 -o trans=unix,uname=$USER
 
+For server running on QEMU host with virtio transport:
+
+	mount -t 9p -o trans=virtio <mount_tag> /mnt/9
+
+where mount_tag is the tag associated by the server to each of the exported
+mount points. Each 9P export is seen by the client as a virtio device with an
+associated "mount_tag" property. Available mount tags can be
+seen by reading /sys/bus/virtio/drivers/9pnet_virtio/virtio<n>/mount_tag files.
+
 OPTIONS
 =======
 
@@ -47,7 +56,7 @@ OPTIONS
 			fd   	- used passed file descriptors for connection
                                 (see rfdno and wfdno)
 			virtio	- connect to the next virtio channel available
-				(from lguest or KVM with trans_virtio module)
+				(from QEMU with trans_virtio module)
 			rdma	- connect to a specified RDMA channel
 
   uname=name	user name to attempt mount as on the remote server.  The
@@ -85,7 +94,12 @@ OPTIONS
 
   port=n	port to connect to on the remote server
 
-  noextend	force legacy mode (no 9p2000.u semantics)
+  noextend	force legacy mode (no 9p2000.u or 9p2000.L semantics)
+
+  version=name	Select 9P protocol version. Valid options are:
+			9p2000          - Legacy mode (same as noextend)
+			9p2000.u        - Use 9P2000.u protocol
+			9p2000.L        - Use 9P2000.L protocol
 
   dfltuid	attempt to mount as a particular uid
 

+ 2 - 2
Documentation/filesystems/Locking

@@ -178,7 +178,7 @@ prototypes:
 locking rules:
 	All except set_page_dirty may block
 
-			BKL	PageLocked(page)	i_sem
+			BKL	PageLocked(page)	i_mutex
 writepage:		no	yes, unlocks (see below)
 readpage:		no	yes, unlocks
 sync_page:		no	maybe
@@ -429,7 +429,7 @@ check_flags:		no
 implementations.  If your fs is not using generic_file_llseek, you
 need to acquire and release the appropriate locks in your ->llseek().
 For many filesystems, it is probably safe to acquire the inode
-semaphore.  Note some filesystems (i.e. remote ones) provide no
+mutex.  Note some filesystems (i.e. remote ones) provide no
 protection for i_size so you will need to use the BKL.
 
 Note: ext2_release() was *the* source of contention on fs-intensive

+ 8 - 0
Documentation/filesystems/Makefile

@@ -0,0 +1,8 @@
+# kbuild trick to avoid linker error. Can be omitted if a module is built.
+obj- := dummy.o
+
+# List of programs to build
+hostprogs-y := dnotify_test
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)

+ 1 - 1
Documentation/filesystems/autofs4-mount-control.txt

@@ -146,7 +146,7 @@ found to be inadequate, in this case. The Generic Netlink system was
 used for this as raw Netlink would lead to a significant increase in
 complexity. There's no question that the Generic Netlink system is an
 elegant solution for common case ioctl functions but it's not a complete
-replacement probably because it's primary purpose in life is to be a
+replacement probably because its primary purpose in life is to be a
 message bus implementation rather than specifically an ioctl replacement.
 While it would be possible to work around this there is one concern
 that lead to the decision to not use it. This is that the autofs

+ 140 - 0
Documentation/filesystems/ceph.txt

@@ -0,0 +1,140 @@
+Ceph Distributed File System
+============================
+
+Ceph is a distributed network file system designed to provide good
+performance, reliability, and scalability.
+
+Basic features include:
+
+ * POSIX semantics
+ * Seamless scaling from 1 to many thousands of nodes
+ * High availability and reliability.  No single point of failure.
+ * N-way replication of data across storage nodes
+ * Fast recovery from node failures
+ * Automatic rebalancing of data on node addition/removal
+ * Easy deployment: most FS components are userspace daemons
+
+Also,
+ * Flexible snapshots (on any directory)
+ * Recursive accounting (nested files, directories, bytes)
+
+In contrast to cluster filesystems like GFS, OCFS2, and GPFS that rely
+on symmetric access by all clients to shared block devices, Ceph
+separates data and metadata management into independent server
+clusters, similar to Lustre.  Unlike Lustre, however, metadata and
+storage nodes run entirely as user space daemons.  Storage nodes
+utilize btrfs to store data objects, leveraging its advanced features
+(checksumming, metadata replication, etc.).  File data is striped
+across storage nodes in large chunks to distribute workload and
+facilitate high throughputs.  When storage nodes fail, data is
+re-replicated in a distributed fashion by the storage nodes themselves
+(with some minimal coordination from a cluster monitor), making the
+system extremely efficient and scalable.
+
+Metadata servers effectively form a large, consistent, distributed
+in-memory cache above the file namespace that is extremely scalable,
+dynamically redistributes metadata in response to workload changes,
+and can tolerate arbitrary (well, non-Byzantine) node failures.  The
+metadata server takes a somewhat unconventional approach to metadata
+storage to significantly improve performance for common workloads.  In
+particular, inodes with only a single link are embedded in
+directories, allowing entire directories of dentries and inodes to be
+loaded into its cache with a single I/O operation.  The contents of
+extremely large directories can be fragmented and managed by
+independent metadata servers, allowing scalable concurrent access.
+
+The system offers automatic data rebalancing/migration when scaling
+from a small cluster of just a few nodes to many hundreds, without
+requiring an administrator carve the data set into static volumes or
+go through the tedious process of migrating data between servers.
+When the file system approaches full, new nodes can be easily added
+and things will "just work."
+
+Ceph includes flexible snapshot mechanism that allows a user to create
+a snapshot on any subdirectory (and its nested contents) in the
+system.  Snapshot creation and deletion are as simple as 'mkdir
+.snap/foo' and 'rmdir .snap/foo'.
+
+Ceph also provides some recursive accounting on directories for nested
+files and bytes.  That is, a 'getfattr -d foo' on any directory in the
+system will reveal the total number of nested regular files and
+subdirectories, and a summation of all nested file sizes.  This makes
+the identification of large disk space consumers relatively quick, as
+no 'du' or similar recursive scan of the file system is required.
+
+
+Mount Syntax
+============
+
+The basic mount syntax is:
+
+ # mount -t ceph monip[:port][,monip2[:port]...]:/[subdir] mnt
+
+You only need to specify a single monitor, as the client will get the
+full list when it connects.  (However, if the monitor you specify
+happens to be down, the mount won't succeed.)  The port can be left
+off if the monitor is using the default.  So if the monitor is at
+1.2.3.4,
+
+ # mount -t ceph 1.2.3.4:/ /mnt/ceph
+
+is sufficient.  If /sbin/mount.ceph is installed, a hostname can be
+used instead of an IP address.
+
+
+
+Mount Options
+=============
+
+  ip=A.B.C.D[:N]
+	Specify the IP and/or port the client should bind to locally.
+	There is normally not much reason to do this.  If the IP is not
+	specified, the client's IP address is determined by looking at the
+	address its connection to the monitor originates from.
+
+  wsize=X
+	Specify the maximum write size in bytes.  By default there is no
+	maximum.  Ceph will normally size writes based on the file stripe
+	size.
+
+  rsize=X
+	Specify the maximum readahead.
+
+  mount_timeout=X
+	Specify the timeout value for mount (in seconds), in the case
+	of a non-responsive Ceph file system.  The default is 30
+	seconds.
+
+  rbytes
+	When stat() is called on a directory, set st_size to 'rbytes',
+	the summation of file sizes over all files nested beneath that
+	directory.  This is the default.
+
+  norbytes
+	When stat() is called on a directory, set st_size to the
+	number of entries in that directory.
+
+  nocrc
+	Disable CRC32C calculation for data writes.  If set, the storage node
+	must rely on TCP's error correction to detect data corruption
+	in the data payload.
+
+  noasyncreaddir
+	Disable client's use its local cache to satisfy	readdir
+	requests.  (This does not change correctness; the client uses
+	cached metadata only when a lease or capability ensures it is
+	valid.)
+
+
+More Information
+================
+
+For more information on Ceph, see the home page at
+	http://ceph.newdream.net/
+
+The Linux kernel client source tree is available at
+	git://ceph.newdream.net/git/ceph-client.git
+	git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
+
+and the source for the full system is at
+	git://ceph.newdream.net/git/ceph.git

+ 1 - 1
Documentation/filesystems/dlmfs.txt

@@ -47,7 +47,7 @@ You'll want to start heartbeating on a volume which all the nodes in
 your lockspace can access. The easiest way to do this is via
 ocfs2_hb_ctl (distributed with ocfs2-tools). Right now it requires
 that an OCFS2 file system be in place so that it can automatically
-find it's heartbeat area, though it will eventually support heartbeat
+find its heartbeat area, though it will eventually support heartbeat
 against raw disks.
 
 Please see the ocfs2_hb_ctl and mkfs.ocfs2 manual pages distributed

+ 5 - 34
Documentation/filesystems/dnotify.txt

@@ -62,38 +62,9 @@ disabled, fcntl(fd, F_NOTIFY, ...) will return -EINVAL.
 
 Example
 -------
+See Documentation/filesystems/dnotify_test.c for an example.
 
-	#define _GNU_SOURCE	/* needed to get the defines */
-	#include <fcntl.h>	/* in glibc 2.2 this has the needed
-					   values defined */
-	#include <signal.h>
-	#include <stdio.h>
-	#include <unistd.h>
-
-	static volatile int event_fd;
-
-	static void handler(int sig, siginfo_t *si, void *data)
-	{
-		event_fd = si->si_fd;
-	}
-
-	int main(void)
-	{
-		struct sigaction act;
-		int fd;
-
-		act.sa_sigaction = handler;
-		sigemptyset(&act.sa_mask);
-		act.sa_flags = SA_SIGINFO;
-		sigaction(SIGRTMIN + 1, &act, NULL);
-
-		fd = open(".", O_RDONLY);
-		fcntl(fd, F_SETSIG, SIGRTMIN + 1);
-		fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT);
-		/* we will now be notified if any of the files
-		   in "." is modified or new files are created */
-		while (1) {
-			pause();
-			printf("Got event on fd=%d\n", event_fd);
-		}
-	}
+NOTE
+----
+Beginning with Linux 2.6.13, dnotify has been replaced by inotify.
+See Documentation/filesystems/inotify.txt for more information on it.

+ 34 - 0
Documentation/filesystems/dnotify_test.c

@@ -0,0 +1,34 @@
+#define _GNU_SOURCE	/* needed to get the defines */
+#include <fcntl.h>	/* in glibc 2.2 this has the needed
+				   values defined */
+#include <signal.h>
+#include <stdio.h>
+#include <unistd.h>
+
+static volatile int event_fd;
+
+static void handler(int sig, siginfo_t *si, void *data)
+{
+	event_fd = si->si_fd;
+}
+
+int main(void)
+{
+	struct sigaction act;
+	int fd;
+
+	act.sa_sigaction = handler;
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = SA_SIGINFO;
+	sigaction(SIGRTMIN + 1, &act, NULL);
+
+	fd = open(".", O_RDONLY);
+	fcntl(fd, F_SETSIG, SIGRTMIN + 1);
+	fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT);
+	/* we will now be notified if any of the files
+	   in "." is modified or new files are created */
+	while (1) {
+		pause();
+		printf("Got event on fd=%d\n", event_fd);
+	}
+}

+ 6 - 6
Documentation/filesystems/fiemap.txt

@@ -38,7 +38,7 @@ flags, it will return EBADR and the contents of fm_flags will contain
 the set of flags which caused the error. If the kernel is compatible
 with all flags passed, the contents of fm_flags will be unmodified.
 It is up to userspace to determine whether rejection of a particular
-flag is fatal to it's operation. This scheme is intended to allow the
+flag is fatal to its operation. This scheme is intended to allow the
 fiemap interface to grow in the future but without losing
 compatibility with old software.
 
@@ -56,7 +56,7 @@ If this flag is set, the kernel will sync the file before mapping extents.
 
 * FIEMAP_FLAG_XATTR
 If this flag is set, the extents returned will describe the inodes
-extended attribute lookup tree, instead of it's data tree.
+extended attribute lookup tree, instead of its data tree.
 
 
 Extent Mapping
@@ -89,7 +89,7 @@ struct fiemap_extent {
 };
 
 All offsets and lengths are in bytes and mirror those on disk.  It is valid
-for an extents logical offset to start before the request or it's logical
+for an extents logical offset to start before the request or its logical
 length to extend past the request.  Unless FIEMAP_EXTENT_NOT_ALIGNED is
 returned, fe_logical, fe_physical, and fe_length will be aligned to the
 block size of the file system.  With the exception of extents flagged as
@@ -125,7 +125,7 @@ been allocated for the file yet.
 
 * FIEMAP_EXTENT_DELALLOC
   - This will also set FIEMAP_EXTENT_UNKNOWN.
-Delayed allocation - while there is data for this extent, it's
+Delayed allocation - while there is data for this extent, its
 physical location has not been allocated yet.
 
 * FIEMAP_EXTENT_ENCODED
@@ -159,7 +159,7 @@ Data is located within a meta data block.
 Data is packed into a block with data from other files.
 
 * FIEMAP_EXTENT_UNWRITTEN
-Unwritten extent - the extent is allocated but it's data has not been
+Unwritten extent - the extent is allocated but its data has not been
 initialized.  This indicates the extent's data will be all zero if read
 through the filesystem but the contents are undefined if read directly from
 the device.
@@ -176,7 +176,7 @@ VFS -> File System Implementation
 
 File systems wishing to support fiemap must implement a ->fiemap callback on
 their inode_operations structure. The fs ->fiemap call is responsible for
-defining it's set of supported fiemap flags, and calling a helper function on
+defining its set of supported fiemap flags, and calling a helper function on
 each discovered extent:
 
 struct inode_operations {

+ 2 - 2
Documentation/filesystems/fuse.txt

@@ -91,7 +91,7 @@ Mount options
 'default_permissions'
 
   By default FUSE doesn't check file access permissions, the
-  filesystem is free to implement it's access policy or leave it to
+  filesystem is free to implement its access policy or leave it to
   the underlying file access mechanism (e.g. in case of network
   filesystems).  This option enables permission checking, restricting
   access based on file mode.  It is usually useful together with the
@@ -171,7 +171,7 @@ or may honor them by sending a reply to the _original_ request, with
 the error set to EINTR.
 
 It is also possible that there's a race between processing the
-original request and it's INTERRUPT request.  There are two possibilities:
+original request and its INTERRUPT request.  There are two possibilities:
 
   1) The INTERRUPT request is processed before the original request is
      processed

+ 6 - 6
Documentation/filesystems/gfs2.txt

@@ -1,7 +1,7 @@
 Global File System
 ------------------
 
-http://sources.redhat.com/cluster/
+http://sources.redhat.com/cluster/wiki/
 
 GFS is a cluster file system. It allows a cluster of computers to
 simultaneously use a block device that is shared between them (with FC,
@@ -36,11 +36,11 @@ GFS2 is not on-disk compatible with previous versions of GFS, but it
 is pretty close.
 
 The following man pages can be found at the URL above:
-  fsck.gfs2	to repair a filesystem
-  gfs2_grow	to expand a filesystem online
-  gfs2_jadd	to add journals to a filesystem online
-  gfs2_tool	to manipulate, examine and tune a filesystem
+  fsck.gfs2		to repair a filesystem
+  gfs2_grow		to expand a filesystem online
+  gfs2_jadd		to add journals to a filesystem online
+  gfs2_tool		to manipulate, examine and tune a filesystem
   gfs2_quota	to examine and change quota values in a filesystem
   gfs2_convert	to convert a gfs filesystem to gfs2 in-place
   mount.gfs2	to help mount(8) mount a filesystem
-  mkfs.gfs2	to make a filesystem
+  mkfs.gfs2		to make a filesystem

+ 1 - 1
Documentation/filesystems/hpfs.txt

@@ -103,7 +103,7 @@ to analyze or change OS2SYS.INI.
 Codepages
 
 HPFS can contain several uppercasing tables for several codepages and each
-file has a pointer to codepage it's name is in. However OS/2 was created in
+file has a pointer to codepage its name is in. However OS/2 was created in
 America where people don't care much about codepages and so multiple codepages
 support is quite buggy. I have Czech OS/2 working in codepage 852 on my disk.
 Once I booted English OS/2 working in cp 850 and I created a file on my 852

+ 4 - 4
Documentation/filesystems/logfs.txt

@@ -59,7 +59,7 @@ Levels
 ------
 
 Garbage collection (GC) may fail if all data is written
-indiscriminately.  One requirement of GC is that data is seperated
+indiscriminately.  One requirement of GC is that data is separated
 roughly according to the distance between the tree root and the data.
 Effectively that means all file data is on level 0, indirect blocks
 are on levels 1, 2, 3 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks,
@@ -67,7 +67,7 @@ respectively.  Inode file data is on level 6 for the inodes and 7-11
 for indirect blocks.
 
 Each segment contains objects of a single level only.  As a result,
-each level requires its own seperate segment to be open for writing.
+each level requires its own separate segment to be open for writing.
 
 Inode File
 ----------
@@ -106,9 +106,9 @@ Vim
 ---
 
 By cleverly predicting the life time of data, it is possible to
-seperate long-living data from short-living data and thereby reduce
+separate long-living data from short-living data and thereby reduce
 the GC overhead later.  Each type of distinc life expectency (vim) can
-have a seperate segment open for writing.  Each (level, vim) tupel can
+have a separate segment open for writing.  Each (level, vim) tupel can
 be open just once.  If an open segment with unknown vim is encountered
 at mount time, it is closed and ignored henceforth.
 

+ 1 - 1
Documentation/filesystems/nfs/nfs41-server.txt

@@ -137,7 +137,7 @@ NS*| OPENATTR             | OPT        |              | Section 18.17  |
    | READ                 | REQ        |              | Section 18.22  |
    | READDIR              | REQ        |              | Section 18.23  |
    | READLINK             | OPT        |              | Section 18.24  |
-NS | RECLAIM_COMPLETE     | REQ        |              | Section 18.51  |
+   | RECLAIM_COMPLETE     | REQ        |              | Section 18.51  |
    | RELEASE_LOCKOWNER    | MNI        |              | N/A            |
    | REMOVE               | REQ        |              | Section 18.25  |
    | RENAME               | REQ        |              | Section 18.26  |

+ 1 - 1
Documentation/filesystems/nfs/rpc-cache.txt

@@ -185,7 +185,7 @@ failed lookup meant a definite 'no'.
 request/response format
 -----------------------
 
-While each cache is free to use it's own format for requests
+While each cache is free to use its own format for requests
 and responses over channel, the following is recommended as
 appropriate and support routines are available to help:
 Each request or response record should be printable ASCII

+ 2 - 2
Documentation/filesystems/nilfs2.txt

@@ -50,8 +50,8 @@ NILFS2 supports the following mount options:
 (*) == default
 
 nobarrier		Disables barriers.
-errors=continue(*)	Keep going on a filesystem error.
-errors=remount-ro	Remount the filesystem read-only on an error.
+errors=continue		Keep going on a filesystem error.
+errors=remount-ro(*)	Remount the filesystem read-only on an error.
 errors=panic		Panic and halt the machine if an error occurs.
 cp=n			Specify the checkpoint-number of the snapshot to be
 			mounted.  Checkpoints and snapshots are listed by lscp

+ 7 - 0
Documentation/filesystems/ocfs2.txt

@@ -80,3 +80,10 @@ user_xattr	(*)	Enables Extended User Attributes.
 nouser_xattr		Disables Extended User Attributes.
 acl			Enables POSIX Access Control Lists support.
 noacl		(*)	Disables POSIX Access Control Lists support.
+resv_level=2	(*)	Set how agressive allocation reservations will be.
+			Valid values are between 0 (reservations off) to 8
+			(maximum space for reservations).
+dir_resv_level=	(*)	By default, directory reservations will scale with file
+			reservations - users should rarely need to change this
+			value. If allocation reservations are turned off, this
+			option will have no effect.

+ 9 - 6
Documentation/filesystems/proc.txt

@@ -195,7 +195,7 @@ asynchronous manner and the vaule may not be very precise. To see a precise
 snapshot of a moment, you can see /proc/<pid>/smaps file and scan page table.
 It's slow but very precise.
 
-Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
+Table 1-2: Contents of the status files (as of 2.6.30-rc7)
 ..............................................................................
  Field                       Content
  Name                        filename of the executable
@@ -305,7 +305,7 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7)
   cgtime        guest time of the task children in jiffies
 ..............................................................................
 
-The /proc/PID/map file containing the currently mapped memory regions and
+The /proc/PID/maps file containing the currently mapped memory regions and
 their access permissions.
 
 The format is:
@@ -316,7 +316,7 @@ address           perms offset  dev   inode      pathname
 08049000-0804a000 rw-p 00001000 03:00 8312       /opt/test
 0804a000-0806b000 rw-p 00000000 00:00 0          [heap]
 a7cb1000-a7cb2000 ---p 00000000 00:00 0
-a7cb2000-a7eb2000 rw-p 00000000 00:00 0          [threadstack:001ff4b4]
+a7cb2000-a7eb2000 rw-p 00000000 00:00 0
 a7eb2000-a7eb3000 ---p 00000000 00:00 0
 a7eb3000-a7ed5000 rw-p 00000000 00:00 0
 a7ed5000-a8008000 r-xp 00000000 03:00 4222       /lib/libc.so.6
@@ -352,7 +352,6 @@ is not associated with a file:
  [stack]                  = the stack of the main process
  [vdso]                   = the "virtual dynamic shared object",
                             the kernel system call handler
- [threadstack:xxxxxxxx]   = the stack of the thread, xxxxxxxx is the stack size
 
  or if empty, the mapping is anonymous.
 
@@ -566,6 +565,10 @@ The default_smp_affinity mask applies to all non-active IRQs, which are the
 IRQs which have not yet been allocated/activated, and hence which lack a
 /proc/irq/[0-9]* directory.
 
+The node file on an SMP system shows the node to which the device using the IRQ
+reports itself as being attached. This hardware locality information does not
+include information about any possible driver locality preference.
+
 prof_cpu_mask specifies which CPUs are to be profiled by the system wide
 profiler. Default value is ffffffff (all cpus).
 
@@ -965,7 +968,7 @@ your system and how much traffic was routed over those devices:
   ...] 1375103    17405    0    0    0     0       0          0 
   ...] 1703981     5535    0    0    0     3       0          0 
 
-In addition, each Channel Bond interface has it's own directory.  For
+In addition, each Channel Bond interface has its own directory.  For
 example, the bond0 device will have a directory called /proc/net/bond0/.
 It will contain information that is specific to that bond, such as the
 current slaves of the bond, the link status of the slaves, and how
@@ -1362,7 +1365,7 @@ been accounted as having caused 1MB of write.
 In other words: The number of bytes which this process caused to not happen,
 by truncating pagecache. A task can cause "negative" IO too. If this task
 truncates some dirty pagecache, some IO which another task has been accounted
-for (in it's write_bytes) will not be happening. We _could_ just subtract that
+for (in its write_bytes) will not be happening. We _could_ just subtract that
 from the truncating task's write_bytes, but there is information loss in doing
 that.
 

+ 1 - 1
Documentation/filesystems/smbfs.txt

@@ -3,6 +3,6 @@ protocol used by Windows for Workgroups, Windows 95 and Windows NT.
 Smbfs was inspired by Samba, the program written by Andrew Tridgell
 that turns any Unix host into a file server for DOS or Windows clients.
 
-Smbfs is a SMB client, but uses parts of samba for it's operation. For
+Smbfs is a SMB client, but uses parts of samba for its operation. For
 more info on samba, including documentation, please go to
 http://www.samba.org/ and then on to your nearest mirror.

+ 5 - 1
Documentation/filesystems/tmpfs.txt

@@ -82,11 +82,13 @@ tmpfs has a mount option to set the NUMA memory allocation policy for
 all files in that instance (if CONFIG_NUMA is enabled) - which can be
 adjusted on the fly via 'mount -o remount ...'
 
-mpol=default             prefers to allocate memory from the local node
+mpol=default             use the process allocation policy
+                         (see set_mempolicy(2))
 mpol=prefer:Node         prefers to allocate memory from the given Node
 mpol=bind:NodeList       allocates memory only from nodes in NodeList
 mpol=interleave          prefers to allocate from each node in turn
 mpol=interleave:NodeList allocates from each node of NodeList in turn
+mpol=local		 prefers to allocate memory from the local node
 
 NodeList format is a comma-separated list of decimal numbers and ranges,
 a range being two hyphen-separated decimal numbers, the smallest and
@@ -134,3 +136,5 @@ Author:
    Christoph Rohland <cr@sap.com>, 1.12.01
 Updated:
    Hugh Dickins, 4 June 2007
+Updated:
+   KOSAKI Motohiro, 16 Mar 2010

+ 1 - 1
Documentation/filesystems/vfs.txt

@@ -72,7 +72,7 @@ structure (this is the kernel-side implementation of file
 descriptors). The freshly allocated file structure is initialized with
 a pointer to the dentry and a set of file operation member functions.
 These are taken from the inode data. The open() file method is then
-called so the specific filesystem implementation can do it's work. You
+called so the specific filesystem implementation can do its work. You
 can see that this is another switch performed by the VFS. The file
 structure is placed into the file descriptor table for the process.
 

+ 1 - 1
Documentation/hwmon/abituguru

@@ -30,7 +30,7 @@ Supported chips:
 	   bank1_types=1,1,0,0,0,0,0,2,0,0,0,0,2,0,0,1
 	   You may also need to specify the fan_sensors option for these boards
 	   fan_sensors=5
-	2) There is a seperate abituguru3 driver for these motherboards,
+	2) There is a separate abituguru3 driver for these motherboards,
 	   the abituguru (without the 3 !) driver will not work on these
 	   motherboards (and visa versa)!
 

+ 1 - 1
Documentation/hwmon/lm85

@@ -157,7 +157,7 @@ temperature configuration points:
 
 There are three PWM outputs. The LM85 datasheet suggests that the
 pwm3 output control both fan3 and fan4. Each PWM can be individually
-configured and assigned to a zone for it's control value. Each PWM can be
+configured and assigned to a zone for its control value. Each PWM can be
 configured individually according to the following options.
 
 * pwm#_auto_pwm_min - this specifies the PWM value for temp#_auto_temp_off

Some files were not shown because too many files changed in this diff