|
@@ -0,0 +1,1828 @@
|
|
|
+ ftrace - Function Tracer
|
|
|
+ ========================
|
|
|
+
|
|
|
+Copyright 2008 Red Hat Inc.
|
|
|
+ Author: Steven Rostedt <srostedt@redhat.com>
|
|
|
+ License: The GNU Free Documentation License, Version 1.2
|
|
|
+ (dual licensed under the GPL v2)
|
|
|
+Reviewers: Elias Oltmanns, Randy Dunlap, Andrew Morton,
|
|
|
+ John Kacur, and David Teigland.
|
|
|
+
|
|
|
+Written for: 2.6.28-rc2
|
|
|
+
|
|
|
+Introduction
|
|
|
+------------
|
|
|
+
|
|
|
+Ftrace is an internal tracer designed to help developers and
|
|
|
+system designers find what is going on inside the kernel.
|
|
|
+It can be used for debugging or analyzing latencies and
|
|
|
+performance issues that take place outside of user-space.
|
|
|
+
|
|
|
+Although ftrace is the function tracer, it also includes an
|
|
|
+infrastructure that allows for other types of tracing. Some of
|
|
|
+the tracers that are currently in ftrace include a tracer to
|
|
|
+trace context switches, the time it takes for a high priority
|
|
|
+task to run after it was woken up, the time interrupts are
|
|
|
+disabled, and more (ftrace allows for tracer plugins, which
|
|
|
+means that the list of tracers can always grow).
|
|
|
+
|
|
|
+
|
|
|
+The File System
|
|
|
+---------------
|
|
|
+
|
|
|
+Ftrace uses the debugfs file system to hold the control files as
|
|
|
+well as the files to display output.
|
|
|
+
|
|
|
+To mount the debugfs system:
|
|
|
+
|
|
|
+ # mkdir /debug
|
|
|
+ # mount -t debugfs nodev /debug
|
|
|
+
|
|
|
+( Note: it is more common to mount at /sys/kernel/debug, but for
|
|
|
+ simplicity this document will use /debug)
|
|
|
+
|
|
|
+That's it! (assuming that you have ftrace configured into your kernel)
|
|
|
+
|
|
|
+After mounting the debugfs, you can see a directory called
|
|
|
+"tracing". This directory contains the control and output files
|
|
|
+of ftrace. Here is a list of some of the key files:
|
|
|
+
|
|
|
+
|
|
|
+ Note: all time values are in microseconds.
|
|
|
+
|
|
|
+ current_tracer:
|
|
|
+
|
|
|
+ This is used to set or display the current tracer
|
|
|
+ that is configured.
|
|
|
+
|
|
|
+ available_tracers:
|
|
|
+
|
|
|
+ This holds the different types of tracers that
|
|
|
+ have been compiled into the kernel. The
|
|
|
+ tracers listed here can be configured by
|
|
|
+ echoing their name into current_tracer.
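+
+ For example, to select the function tracer (assuming it was
+ compiled in) and confirm the change:
+
+ # cat /debug/tracing/available_tracers
+ # echo function > /debug/tracing/current_tracer
+ # cat /debug/tracing/current_tracer
+ function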
|
|
|
+
|
|
|
+ tracing_enabled:
|
|
|
+
|
|
|
+ This sets or displays whether the current_tracer
|
|
|
+ is activated and tracing or not. Echo 0 into this
|
|
|
+ file to disable the tracer or 1 to enable it.
|
|
|
+
|
|
|
+ trace:
|
|
|
+
|
|
|
+ This file holds the output of the trace in a human
|
|
|
+ readable format (described below).
|
|
|
+
|
|
|
+ latency_trace:
|
|
|
+
|
|
|
+ This file shows the same trace but the information
|
|
|
+ is organized more to display possible latencies
|
|
|
+ in the system (described below).
|
|
|
+
|
|
|
+ trace_pipe:
|
|
|
+
|
|
|
+ The output is the same as the "trace" file but this
|
|
|
+ file is meant to be streamed with live tracing.
|
|
|
+ Reads from this file will block until new data
|
|
|
+ is retrieved. Unlike the "trace" and "latency_trace"
|
|
|
+ files, this file is a consumer. This means reading
|
|
|
+ from this file causes sequential reads to display
|
|
|
+ more current data. Once data is read from this
|
|
|
+ file, it is consumed, and will not be read
|
|
|
+ again with a sequential read. The "trace" and
|
|
|
+ "latency_trace" files are static, and if the
|
|
|
+ tracer is not adding more data, they will display
|
|
|
+ the same information every time they are read.
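+
+ A minimal way to peek at the live stream (the head is only
+ there to stop the otherwise blocking read):
+
+ # echo function > /debug/tracing/current_tracer
+ # echo 1 > /debug/tracing/tracing_enabled
+ # cat /debug/tracing/trace_pipe | head -20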
|
|
|
+
|
|
|
+ trace_options:
|
|
|
+
|
|
|
+ This file lets the user control the amount of data
|
|
|
+ that is displayed in one of the above output
|
|
|
+ files.
|
|
|
+
|
|
|
+ tracing_max_latency:
|
|
|
+
|
|
|
+ Some of the tracers record the max latency.
|
|
|
+ For example, the time interrupts are disabled.
|
|
|
+ This time is saved in this file. The max trace
|
|
|
+ will also be stored, and displayed by either
|
|
|
+ "trace" or "latency_trace". A new max trace will
|
|
|
+ only be recorded if the latency is greater than
|
|
|
+ the value in this file. (in microseconds)
|
|
|
+
|
|
|
+ buffer_size_kb:
|
|
|
+
|
|
|
+ This sets or displays the number of kilobytes each CPU
|
|
|
+ buffer can hold. The tracer buffers are the same size
|
|
|
+ for each CPU. The displayed number is the size of the
|
|
|
+ CPU buffer and not total size of all buffers. The
|
|
|
+ trace buffers are allocated in pages (blocks of memory
|
|
|
+ that the kernel uses for allocation, usually 4 KB in size).
|
|
|
+ If the last page allocated has room for more bytes
|
|
|
+ than requested, the rest of the page will be used,
|
|
|
+ making the actual allocation bigger than requested.
|
|
|
+ ( Note, the size may not be a multiple of the page size
|
|
|
+ due to buffer management overhead. )
|
|
|
+
|
|
|
+ This can only be updated when the current_tracer
|
|
|
+ is set to "nop".
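+
+ For example, to grow the buffers to 4 MB per CPU (the value
+ read back may differ slightly because of the page rounding
+ described above):
+
+ # echo nop > /debug/tracing/current_tracer
+ # echo 4096 > /debug/tracing/buffer_size_kb
+ # cat /debug/tracing/buffer_size_kb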
|
|
|
+
|
|
|
+ tracing_cpumask:
|
|
|
+
|
|
|
+ This is a mask that lets the user only trace
|
|
|
+ on specified CPUs. The format is a hex string
|
|
|
+ representing the CPUs.
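+
+ For example, on a two CPU system, to restrict tracing to CPU 0
+ and later restore both CPUs:
+
+ # echo 1 > /debug/tracing/tracing_cpumask
+ [...]
+ # echo 3 > /debug/tracing/tracing_cpumask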
|
|
|
+
|
|
|
+ set_ftrace_filter:
|
|
|
+
|
|
|
+ When dynamic ftrace is configured in (see the
|
|
|
+ section below "dynamic ftrace"), the code is dynamically
|
|
|
+ modified (code text rewrite) to disable calling of the
|
|
|
+ function profiler (mcount). This lets tracing be configured
|
|
|
+ in with practically no overhead in performance. This also
|
|
|
+ has a side effect of enabling or disabling specific functions
|
|
|
+ to be traced. Echoing names of functions into this file
|
|
|
+ will limit the trace to only those functions.
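+
+ For example (the function names here are only illustrative;
+ pick real ones from available_filter_functions):
+
+ # echo sys_nanosleep hrtimer_interrupt > /debug/tracing/set_ftrace_filter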
|
|
|
+
|
|
|
+ set_ftrace_notrace:
|
|
|
+
|
|
|
+ This has an effect opposite to that of
|
|
|
+ set_ftrace_filter. Any function that is added here will not
|
|
|
+ be traced. If a function exists in both set_ftrace_filter
|
|
|
+ and set_ftrace_notrace, the function will _not_ be traced.
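+
+ For example, to keep the very frequent lock and preempt
+ accounting functions out of the trace (glob patterns are
+ accepted when dynamic ftrace is configured in):
+
+ # echo '*preempt*' '*lock*' > /debug/tracing/set_ftrace_notrace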
|
|
|
+
|
|
|
+ set_ftrace_pid:
|
|
|
+
|
|
|
+ Have the function tracer only trace a single thread.
|
|
|
+
|
|
|
+ set_graph_function:
|
|
|
+
|
|
|
+ Set a "trigger" function where tracing should start
|
|
|
+ with the function graph tracer (See the section
|
|
|
+ "dynamic ftrace" for more details).
|
|
|
+
|
|
|
+ available_filter_functions:
|
|
|
+
|
|
|
+ This lists the functions that ftrace
|
|
|
+ has processed and can trace. These are the function
|
|
|
+ names that you can pass to "set_ftrace_filter" or
|
|
|
+ "set_ftrace_notrace". (See the section "dynamic ftrace"
|
|
|
+ below for more details.)
|
|
|
+
|
|
|
+
|
|
|
+The Tracers
|
|
|
+-----------
|
|
|
+
|
|
|
+Here is the list of current tracers that may be configured.
|
|
|
+
|
|
|
+ "function"
|
|
|
+
|
|
|
+ Function call tracer to trace all kernel functions.
|
|
|
+
|
|
|
+ "function_graph_tracer"
|
|
|
+
|
|
|
+ Similar to the function tracer except that the
|
|
|
+ function tracer probes the functions on their entry
|
|
|
+ whereas the function graph tracer traces on both entry
|
|
|
+ and exit of the functions. It then provides the ability
|
|
|
+ to draw a graph of function calls similar to C code
|
|
|
+ source.
|
|
|
+
|
|
|
+ "sched_switch"
|
|
|
+
|
|
|
+ Traces the context switches and wakeups between tasks.
|
|
|
+
|
|
|
+ "irqsoff"
|
|
|
+
|
|
|
+ Traces the areas that disable interrupts and saves
|
|
|
+ the trace with the longest max latency.
|
|
|
+ See tracing_max_latency. When a new max is recorded,
|
|
|
+ it replaces the old trace. It is best to view this
|
|
|
+ trace via the latency_trace file.
|
|
|
+
|
|
|
+ "preemptoff"
|
|
|
+
|
|
|
+ Similar to irqsoff but traces and records the amount of
|
|
|
+ time for which preemption is disabled.
|
|
|
+
|
|
|
+ "preemptirqsoff"
|
|
|
+
|
|
|
+ Similar to irqsoff and preemptoff, but traces and
|
|
|
+ records the largest time for which irqs and/or preemption
|
|
|
+ is disabled.
|
|
|
+
|
|
|
+ "wakeup"
|
|
|
+
|
|
|
+ Traces and records the max latency that it takes for
|
|
|
+ the highest priority task to get scheduled after
|
|
|
+ it has been woken up.
|
|
|
+
|
|
|
+ "hw-branch-tracer"
|
|
|
+
|
|
|
+ Uses the BTS CPU feature on x86 CPUs to trace all
|
|
|
+ branches executed.
|
|
|
+
|
|
|
+ "nop"
|
|
|
+
|
|
|
+ This is the "trace nothing" tracer. To remove all
|
|
|
+ tracers from tracing simply echo "nop" into
|
|
|
+ current_tracer.
|
|
|
+
|
|
|
+
|
|
|
+Examples of using the tracer
|
|
|
+----------------------------
|
|
|
+
|
|
|
+Here are typical examples of using the tracers when controlling
|
|
|
+them only with the debugfs interface (without using any
|
|
|
+user-land utilities).
|
|
|
+
|
|
|
+Output format:
|
|
|
+--------------
|
|
|
+
|
|
|
+Here is an example of the output format of the file "trace"
|
|
|
+
|
|
|
+ --------
|
|
|
+# tracer: function
|
|
|
+#
|
|
|
+# TASK-PID CPU# TIMESTAMP FUNCTION
|
|
|
+# | | | | |
|
|
|
+ bash-4251 [01] 10152.583854: path_put <-path_walk
|
|
|
+ bash-4251 [01] 10152.583855: dput <-path_put
|
|
|
+ bash-4251 [01] 10152.583855: _atomic_dec_and_lock <-dput
|
|
|
+ --------
|
|
|
+
|
|
|
+A header is printed with the tracer name that is represented by
|
|
|
+the trace. In this case the tracer is "function". Then a header
|
|
|
+showing the format follows: task name "bash", the task PID "4251", the
|
|
|
+CPU that it was running on "01", the timestamp in <secs>.<usecs>
|
|
|
+format, the function name that was traced "path_put" and the
|
|
|
+parent function that called this function "path_walk". The
|
|
|
+timestamp is the time at which the function was entered.
|
|
|
+
|
|
|
+The sched_switch tracer also includes tracing of task wakeups
|
|
|
+and context switches.
|
|
|
+
|
|
|
+ ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 2916:115:S
|
|
|
+ ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 10:115:S
|
|
|
+ ksoftirqd/1-7 [01] 1453.070013: 7:115:R ==> 10:115:R
|
|
|
+ events/1-10 [01] 1453.070013: 10:115:S ==> 2916:115:R
|
|
|
+ kondemand/1-2916 [01] 1453.070013: 2916:115:S ==> 7:115:R
|
|
|
+ ksoftirqd/1-7 [01] 1453.070013: 7:115:S ==> 0:140:R
|
|
|
+
|
|
|
+Wake ups are represented by a "+" and the context switches are
|
|
|
+shown as "==>". The format is:
|
|
|
+
|
|
|
+ Context switches:
|
|
|
+
|
|
|
+ Previous task Next Task
|
|
|
+
|
|
|
+ <pid>:<prio>:<state> ==> <pid>:<prio>:<state>
|
|
|
+
|
|
|
+ Wake ups:
|
|
|
+
|
|
|
+ Current task Task waking up
|
|
|
+
|
|
|
+ <pid>:<prio>:<state> + <pid>:<prio>:<state>
|
|
|
+
|
|
|
+The prio is the internal kernel priority, which is the inverse
|
|
|
+of the priority that is usually displayed by user-space tools.
|
|
|
+Zero represents the highest priority (99). Prio 100 starts the
|
|
|
+"nice" priorities with 100 being equal to nice -20 and 139 being
|
|
|
+nice 19. The prio "140" is reserved for the idle task which is
|
|
|
+the lowest priority thread (pid 0).
|
|
|
+
|
|
|
+
|
|
|
+Latency trace format
|
|
|
+--------------------
|
|
|
+
|
|
|
+For traces that display latency times, the latency_trace file
|
|
|
+gives somewhat more information to see why a latency happened.
|
|
|
+Here is a typical trace.
|
|
|
+
|
|
|
+# tracer: irqsoff
|
|
|
+#
|
|
|
+irqsoff latency trace v1.1.5 on 2.6.26-rc8
|
|
|
+--------------------------------------------------------------------
|
|
|
+ latency: 97 us, #3/3, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
|
|
|
+ -----------------
|
|
|
+ | task: swapper-0 (uid:0 nice:0 policy:0 rt_prio:0)
|
|
|
+ -----------------
|
|
|
+ => started at: apic_timer_interrupt
|
|
|
+ => ended at: do_softirq
|
|
|
+
|
|
|
+# _------=> CPU#
|
|
|
+# / _-----=> irqs-off
|
|
|
+# | / _----=> need-resched
|
|
|
+# || / _---=> hardirq/softirq
|
|
|
+# ||| / _--=> preempt-depth
|
|
|
+# |||| /
|
|
|
+# ||||| delay
|
|
|
+# cmd pid ||||| time | caller
|
|
|
+# \ / ||||| \ | /
|
|
|
+ <idle>-0 0d..1 0us+: trace_hardirqs_off_thunk (apic_timer_interrupt)
|
|
|
+ <idle>-0 0d.s. 97us : __do_softirq (do_softirq)
|
|
|
+ <idle>-0 0d.s1 98us : trace_hardirqs_on (do_softirq)
|
|
|
+
|
|
|
+
|
|
|
+This shows that the current tracer is "irqsoff" tracing the time
|
|
|
+for which interrupts were disabled. It gives the trace version
|
|
|
+and the version of the kernel on which this was executed
|
|
|
+(2.6.26-rc8). Then it displays the max latency in microsecs (97
|
|
|
+us). The number of trace entries displayed and the total number
|
|
|
+recorded (both are three: #3/3). The type of preemption that was
|
|
|
+used (PREEMPT). VP, KP, SP, and HP are always zero and are
|
|
|
+reserved for later use. #P is the number of online CPUS (#P:2).
|
|
|
+
|
|
|
+The task is the process that was running when the latency
|
|
|
+occurred. (swapper pid: 0).
|
|
|
+
|
|
|
+The start and stop (the functions in which the interrupts were
|
|
|
+disabled and enabled respectively) that caused the latencies:
|
|
|
+
|
|
|
+ apic_timer_interrupt is where the interrupts were disabled.
|
|
|
+ do_softirq is where they were enabled again.
|
|
|
+
|
|
|
+The next lines after the header are the trace itself. The header
|
|
|
+explains which is which.
|
|
|
+
|
|
|
+ cmd: The name of the process in the trace.
|
|
|
+
|
|
|
+ pid: The PID of that process.
|
|
|
+
|
|
|
+ CPU#: The CPU which the process was running on.
|
|
|
+
|
|
|
+ irqs-off: 'd' interrupts are disabled. '.' otherwise.
|
|
|
+ Note: If the architecture does not support a way to
|
|
|
+ read the irq flags variable, an 'X' will always
|
|
|
+ be printed here.
|
|
|
+
|
|
|
+ need-resched: 'N' task need_resched is set, '.' otherwise.
|
|
|
+
|
|
|
+ hardirq/softirq:
|
|
|
+ 'H' - hard irq occurred inside a softirq.
|
|
|
+ 'h' - hard irq is running
|
|
|
+ 's' - soft irq is running
|
|
|
+ '.' - normal context.
|
|
|
+
|
|
|
+ preempt-depth: The level of preempt_disabled
|
|
|
+
|
|
|
+The above is mostly meaningful for kernel developers.
|
|
|
+
|
|
|
+ time: This differs from the trace file output. The trace file output
|
|
|
+ includes an absolute timestamp. The timestamp used by the
|
|
|
+ latency_trace file is relative to the start of the trace.
|
|
|
+
|
|
|
+ delay: This is just to help catch your eye a bit better. And
|
|
|
+ needs to be fixed to be only relative to the same CPU.
|
|
|
+ The marks are determined by the difference between this
|
|
|
+ current trace and the next trace.
|
|
|
+ '!' - greater than preempt_mark_thresh (default 100)
|
|
|
+ '+' - greater than 1 microsecond
|
|
|
+ ' ' - less than or equal to 1 microsecond.
|
|
|
+
|
|
|
+ The rest is the same as the 'trace' file.
|
|
|
+
|
|
|
+
|
|
|
+trace_options
|
|
|
+-------------
|
|
|
+
|
|
|
+The trace_options file is used to control what gets printed in
|
|
|
+the trace output. To see what is available, simply cat the file:
|
|
|
+
|
|
|
+ cat /debug/tracing/trace_options
|
|
|
+ print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \
|
|
|
+ noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj
|
|
|
+
|
|
|
+To disable one of the options, echo in the option prepended with
|
|
|
+"no".
|
|
|
+
|
|
|
+ echo noprint-parent > /debug/tracing/trace_options
|
|
|
+
|
|
|
+To enable an option, leave off the "no".
|
|
|
+
|
|
|
+ echo sym-offset > /debug/tracing/trace_options
|
|
|
+
|
|
|
+Here are the available options:
|
|
|
+
|
|
|
+ print-parent - On function traces, display the calling (parent)
|
|
|
+ function as well as the function being traced.
|
|
|
+
|
|
|
+ print-parent:
|
|
|
+ bash-4000 [01] 1477.606694: simple_strtoul <-strict_strtoul
|
|
|
+
|
|
|
+ noprint-parent:
|
|
|
+ bash-4000 [01] 1477.606694: simple_strtoul
|
|
|
+
|
|
|
+
|
|
|
+ sym-offset - Display not only the function name, but also the
|
|
|
+ offset in the function. For example, instead of
|
|
|
+ seeing just "ktime_get", you will see
|
|
|
+ "ktime_get+0xb/0x20".
|
|
|
+
|
|
|
+ sym-offset:
|
|
|
+ bash-4000 [01] 1477.606694: simple_strtoul+0x6/0xa0
|
|
|
+
|
|
|
+ sym-addr - this will display the function address as well
|
|
|
+ as the function name.
|
|
|
+
|
|
|
+ sym-addr:
|
|
|
+ bash-4000 [01] 1477.606694: simple_strtoul <c0339346>
|
|
|
+
|
|
|
+ verbose - This deals with the latency_trace file.
|
|
|
+
|
|
|
+ bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \
|
|
|
+ (+0.000ms): simple_strtoul (strict_strtoul)
|
|
|
+
|
|
|
+ raw - This will display raw numbers. This option is best for
|
|
|
+ use with user applications that can translate the raw
|
|
|
+ numbers better than having it done in the kernel.
|
|
|
+
|
|
|
+ hex - Similar to raw, but the numbers will be in a hexadecimal
|
|
|
+ format.
|
|
|
+
|
|
|
+ bin - This will print out the formats in raw binary.
|
|
|
+
|
|
|
+ block - TBD (needs update)
|
|
|
+
|
|
|
+ stacktrace - This is one of the options that changes the trace
|
|
|
+ itself. When a trace is recorded, so is the stack
|
|
|
+ of functions. This allows for back traces of
|
|
|
+ trace sites.
|
|
|
+
|
|
|
+ userstacktrace - This option changes the trace. It records a
|
|
|
+ stacktrace of the current userspace thread.
|
|
|
+
|
|
|
+ sym-userobj - when user stacktraces are enabled, look up which
|
|
|
+ object the address belongs to, and print a
|
|
|
+ relative address. This is especially useful when
|
|
|
+ ASLR is on, otherwise you don't get a chance to
|
|
|
+ resolve the address to object/file/line after
|
|
|
+ the app is no longer running.
|
|
|
+
|
|
|
+ The lookup is performed when you read
|
|
|
+ trace, trace_pipe, or latency_trace. Example:
|
|
|
+
|
|
|
+ a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0
|
|
|
+x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
|
|
|
+
|
|
|
+ sched-tree - trace all tasks that are on the runqueue, at
|
|
|
+ every scheduling event. Will add overhead if
|
|
|
+ there are a lot of tasks running at once.
|
|
|
+
|
|
|
+
|
|
|
+sched_switch
|
|
|
+------------
|
|
|
+
|
|
|
+This tracer simply records schedule switches. Here is an example
|
|
|
+of how to use it.
|
|
|
+
|
|
|
+ # echo sched_switch > /debug/tracing/current_tracer
|
|
|
+ # echo 1 > /debug/tracing/tracing_enabled
|
|
|
+ # sleep 1
|
|
|
+ # echo 0 > /debug/tracing/tracing_enabled
|
|
|
+ # cat /debug/tracing/trace
|
|
|
+
|
|
|
+# tracer: sched_switch
|
|
|
+#
|
|
|
+# TASK-PID CPU# TIMESTAMP FUNCTION
|
|
|
+# | | | | |
|
|
|
+ bash-3997 [01] 240.132281: 3997:120:R + 4055:120:R
|
|
|
+ bash-3997 [01] 240.132284: 3997:120:R ==> 4055:120:R
|
|
|
+ sleep-4055 [01] 240.132371: 4055:120:S ==> 3997:120:R
|
|
|
+ bash-3997 [01] 240.132454: 3997:120:R + 4055:120:S
|
|
|
+ bash-3997 [01] 240.132457: 3997:120:R ==> 4055:120:R
|
|
|
+ sleep-4055 [01] 240.132460: 4055:120:D ==> 3997:120:R
|
|
|
+ bash-3997 [01] 240.132463: 3997:120:R + 4055:120:D
|
|
|
+ bash-3997 [01] 240.132465: 3997:120:R ==> 4055:120:R
|
|
|
+ <idle>-0 [00] 240.132589: 0:140:R + 4:115:S
|
|
|
+ <idle>-0 [00] 240.132591: 0:140:R ==> 4:115:R
|
|
|
+ ksoftirqd/0-4 [00] 240.132595: 4:115:S ==> 0:140:R
|
|
|
+ <idle>-0 [00] 240.132598: 0:140:R + 4:115:S
|
|
|
+ <idle>-0 [00] 240.132599: 0:140:R ==> 4:115:R
|
|
|
+ ksoftirqd/0-4 [00] 240.132603: 4:115:S ==> 0:140:R
|
|
|
+ sleep-4055 [01] 240.133058: 4055:120:S ==> 3997:120:R
|
|
|
+ [...]
|
|
|
+
|
|
|
+
|
|
|
+As discussed previously about this format, the header
|
|
|
+shows the name of the tracer and points to the options. The
|
|
|
+"FUNCTION" is a misnomer since here it represents the wake ups
|
|
|
+and context switches.
|
|
|
+
|
|
|
+The sched_switch file only lists the wake ups (represented with
|
|
|
+'+') and context switches ('==>') with the previous task or
|
|
|
+current task first followed by the next task or task waking up.
|
|
|
+The format for both of these is PID:KERNEL-PRIO:TASK-STATE.
|
|
|
+Remember that the KERNEL-PRIO is the inverse of the actual
|
|
|
+priority with zero (0) being the highest priority and the nice
|
|
|
+values starting at 100 (nice -20). Below is a quick chart to map
|
|
|
+the kernel priority to user land priorities.
|
|
|
+
|
|
|
+ Kernel priority: 0 to 99 ==> user RT priority 99 to 0
|
|
|
+ Kernel priority: 100 to 139 ==> user nice -20 to 19
|
|
|
+ Kernel priority: 140 ==> idle task priority
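+
+As a quick worked example, in the trace above the bash and sleep
+tasks show prio 120, which maps to nice 0 (120 - 120), and the
+ksoftirqd task shows prio 115, which maps to nice -5 (115 - 120).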
|
|
|
+
|
|
|
+The task states are:
|
|
|
+
|
|
|
+ R - running : wants to run, may not actually be running
|
|
|
+ S - sleep : process is waiting to be woken up (handles signals)
|
|
|
+ D - disk sleep (uninterruptible sleep) : process must be woken up
|
|
|
+ (ignores signals)
|
|
|
+ T - stopped : process suspended
|
|
|
+ t - traced : process is being traced (with something like gdb)
|
|
|
+ Z - zombie : process waiting to be cleaned up
|
|
|
+ X - unknown
|
|
|
+
|
|
|
+
|
|
|
+ftrace_enabled
|
|
|
+--------------
|
|
|
+
|
|
|
+The following tracers (listed below) give different output
|
|
|
+depending on whether or not the sysctl ftrace_enabled is set. To
|
|
|
+set ftrace_enabled, one can either use the sysctl function or
|
|
|
+set it via the proc file system interface.
|
|
|
+
|
|
|
+ sysctl kernel.ftrace_enabled=1
|
|
|
+
|
|
|
+ or
|
|
|
+
|
|
|
+ echo 1 > /proc/sys/kernel/ftrace_enabled
|
|
|
+
|
|
|
+To disable ftrace_enabled simply replace the '1' with '0' in the
|
|
|
+above commands.
|
|
|
+
|
|
|
+When ftrace_enabled is set, the tracers will also record the
|
|
|
+functions that are within the trace. The descriptions of the
|
|
|
+tracers will also show an example with ftrace enabled.
|
|
|
+
|
|
|
+
|
|
|
+irqsoff
|
|
|
+-------
|
|
|
+
|
|
|
+When interrupts are disabled, the CPU can not react to any other
|
|
|
+external event (besides NMIs and SMIs). This prevents the timer
|
|
|
+interrupt from triggering or the mouse interrupt from letting
|
|
|
+the kernel know of a new mouse event. The result is added latency
|
|
|
+in reaction time.
|
|
|
+
|
|
|
+The irqsoff tracer tracks the time for which interrupts are
|
|
|
+disabled. When a new maximum latency is hit, the tracer saves
|
|
|
+the trace leading up to that latency point so that every time a
|
|
|
+new maximum is reached, the old saved trace is discarded and the
|
|
|
+new trace is saved.
|
|
|
+
|
|
|
+To reset the maximum, echo 0 into tracing_max_latency. Here is
|
|
|
+an example:
|
|
|
+
|
|
|
+ # echo irqsoff > /debug/tracing/current_tracer
|
|
|
+ # echo 0 > /debug/tracing/tracing_max_latency
|
|
|
+ # echo 1 > /debug/tracing/tracing_enabled
|
|
|
+ # ls -ltr
|
|
|
+ [...]
|
|
|
+ # echo 0 > /debug/tracing/tracing_enabled
|
|
|
+ # cat /debug/tracing/latency_trace
|
|
|
+# tracer: irqsoff
|
|
|
+#
|
|
|
+irqsoff latency trace v1.1.5 on 2.6.26
|
|
|
+--------------------------------------------------------------------
|
|
|
+ latency: 12 us, #3/3, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
|
|
|
+ -----------------
|
|
|
+ | task: bash-3730 (uid:0 nice:0 policy:0 rt_prio:0)
|
|
|
+ -----------------
|
|
|
+ => started at: sys_setpgid
|
|
|
+ => ended at: sys_setpgid
|
|
|
+
|
|
|
+# _------=> CPU#
|
|
|
+# / _-----=> irqs-off
|
|
|
+# | / _----=> need-resched
|
|
|
+# || / _---=> hardirq/softirq
|
|
|
+# ||| / _--=> preempt-depth
|
|
|
+# |||| /
|
|
|
+# ||||| delay
|
|
|
+# cmd pid ||||| time | caller
|
|
|
+# \ / ||||| \ | /
|
|
|
+ bash-3730 1d... 0us : _write_lock_irq (sys_setpgid)
|
|
|
+ bash-3730 1d..1 1us+: _write_unlock_irq (sys_setpgid)
|
|
|
+ bash-3730 1d..2 14us : trace_hardirqs_on (sys_setpgid)
|
|
|
+
|
|
|
+
|
|
|
+Here we see that we had a latency of 12 microsecs (which is
|
|
|
+very good). The _write_lock_irq in sys_setpgid disabled
|
|
|
+interrupts. The difference between the 12 and the displayed
|
|
|
+timestamp 14us occurred because the clock was incremented
|
|
|
+between the time of recording the max latency and the time of
|
|
|
+recording the function that had that latency.
|
|
|
+
|
|
|
+Note that the above example had ftrace_enabled not set. If we set
|
|
|
+ftrace_enabled, we get a much larger output:
|
|
|
+
|
|
|
+# tracer: irqsoff
|
|
|
+#
|
|
|
+irqsoff latency trace v1.1.5 on 2.6.26-rc8
|
|
|
+--------------------------------------------------------------------
|
|
|
+ latency: 50 us, #101/101, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
|
|
|
+ -----------------
|
|
|
+ | task: ls-4339 (uid:0 nice:0 policy:0 rt_prio:0)
|
|
|
+ -----------------
|
|
|
+ => started at: __alloc_pages_internal
|
|
|
+ => ended at: __alloc_pages_internal
|
|
|
+
|
|
|
+# _------=> CPU#
|
|
|
+# / _-----=> irqs-off
|
|
|
+# | / _----=> need-resched
|
|
|
+# || / _---=> hardirq/softirq
|
|
|
+# ||| / _--=> preempt-depth
|
|
|
+# |||| /
|
|
|
+# ||||| delay
|
|
|
+# cmd pid ||||| time | caller
|
|
|
+# \ / ||||| \ | /
|
|
|
+ ls-4339 0...1 0us+: get_page_from_freelist (__alloc_pages_internal)
|
|
|
+ ls-4339 0d..1 3us : rmqueue_bulk (get_page_from_freelist)
|
|
|
+ ls-4339 0d..1 3us : _spin_lock (rmqueue_bulk)
|
|
|
+ ls-4339 0d..1 4us : add_preempt_count (_spin_lock)
|
|
|
+ ls-4339 0d..2 4us : __rmqueue (rmqueue_bulk)
|
|
|
+ ls-4339 0d..2 5us : __rmqueue_smallest (__rmqueue)
|
|
|
+ ls-4339 0d..2 5us : __mod_zone_page_state (__rmqueue_smallest)
|
|
|
+ ls-4339 0d..2 6us : __rmqueue (rmqueue_bulk)
|
|
|
+ ls-4339 0d..2 6us : __rmqueue_smallest (__rmqueue)
|
|
|
+ ls-4339 0d..2 7us : __mod_zone_page_state (__rmqueue_smallest)
|
|
|
+ ls-4339 0d..2 7us : __rmqueue (rmqueue_bulk)
|
|
|
+ ls-4339 0d..2 8us : __rmqueue_smallest (__rmqueue)
|
|
|
+[...]
|
|
|
+ ls-4339 0d..2 46us : __rmqueue_smallest (__rmqueue)
|
|
|
+ ls-4339 0d..2 47us : __mod_zone_page_state (__rmqueue_smallest)
|
|
|
+ ls-4339 0d..2 47us : __rmqueue (rmqueue_bulk)
|
|
|
+ ls-4339 0d..2 48us : __rmqueue_smallest (__rmqueue)
|
|
|
+ ls-4339 0d..2 48us : __mod_zone_page_state (__rmqueue_smallest)
|
|
|
+ ls-4339 0d..2 49us : _spin_unlock (rmqueue_bulk)
|
|
|
+ ls-4339 0d..2 49us : sub_preempt_count (_spin_unlock)
|
|
|
+ ls-4339 0d..1 50us : get_page_from_freelist (__alloc_pages_internal)
|
|
|
+ ls-4339 0d..2 51us : trace_hardirqs_on (__alloc_pages_internal)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+Here we traced a 50 microsecond latency. But we also see all the
|
|
|
+functions that were called during that time. Note that by
|
|
|
+enabling function tracing, we incur an added overhead. This
|
|
|
+overhead may extend the latency times. But nevertheless, this
|
|
|
+trace has provided some very helpful debugging information.
|
|
|
+
|
|
|
+
|
|
|
+preemptoff
|
|
|
+----------
|
|
|
+
|
|
|
+When preemption is disabled, we may be able to receive
|
|
|
+interrupts but the task cannot be preempted and a higher
|
|
|
+priority task must wait for preemption to be enabled again
|
|
|
+before it can preempt a lower priority task.
|
|
|
+
|
|
|
+The preemptoff tracer traces the places that disable preemption.
|
|
|
+Like the irqsoff tracer, it records the maximum latency for
|
|
|
+which preemption was disabled. The control of preemptoff tracer
|
|
|
+is much like the irqsoff tracer.
|
|
|
+
|
|
|
+ # echo preemptoff > /debug/tracing/current_tracer
|
|
|
+ # echo 0 > /debug/tracing/tracing_max_latency
|
|
|
+ # echo 1 > /debug/tracing/tracing_enabled
|
|
|
+ # ls -ltr
|
|
|
+ [...]
|
|
|
+ # echo 0 > /debug/tracing/tracing_enabled
|
|
|
+ # cat /debug/tracing/latency_trace
|
|
|
+# tracer: preemptoff
|
|
|
+#
|
|
|
+preemptoff latency trace v1.1.5 on 2.6.26-rc8
|
|
|
+--------------------------------------------------------------------
|
|
|
+ latency: 29 us, #3/3, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
|
|
|
+ -----------------
|
|
|
+ | task: sshd-4261 (uid:0 nice:0 policy:0 rt_prio:0)
|
|
|
+ -----------------
|
|
|
+ => started at: do_IRQ
|
|
|
+ => ended at: __do_softirq
|
|
|
+
|
|
|
+# _------=> CPU#
|
|
|
+# / _-----=> irqs-off
|
|
|
+# | / _----=> need-resched
|
|
|
+# || / _---=> hardirq/softirq
|
|
|
+# ||| / _--=> preempt-depth
|
|
|
+# |||| /
|
|
|
+# ||||| delay
|
|
|
+# cmd pid ||||| time | caller
|
|
|
+# \ / ||||| \ | /
|
|
|
+ sshd-4261 0d.h. 0us+: irq_enter (do_IRQ)
|
|
|
+ sshd-4261 0d.s. 29us : _local_bh_enable (__do_softirq)
|
|
|
+ sshd-4261 0d.s1 30us : trace_preempt_on (__do_softirq)
|
|
|
+
|
|
|
+
|
|
|
+This has some more changes. Preemption was disabled when an
|
|
|
+interrupt came in (notice the 'h'), and was enabled while doing
|
|
|
+a softirq (notice the 's'). But we also see that interrupts
|
|
|
+have been disabled when entering the preempt off section and
|
|
|
+leaving it (the 'd'). We do not know if interrupts were enabled
|
|
|
+in the mean time.
|
|
|
+
|
|
|
+# tracer: preemptoff
|
|
|
+#
|
|
|
+preemptoff latency trace v1.1.5 on 2.6.26-rc8
|
|
|
+--------------------------------------------------------------------
|
|
|
+ latency: 63 us, #87/87, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
|
|
|
+ -----------------
|
|
|
+ | task: sshd-4261 (uid:0 nice:0 policy:0 rt_prio:0)
|
|
|
+ -----------------
|
|
|
+ => started at: remove_wait_queue
|
|
|
+ => ended at: __do_softirq
|
|
|
+
|
|
|
+# _------=> CPU#
|
|
|
+# / _-----=> irqs-off
|
|
|
+# | / _----=> need-resched
|
|
|
+# || / _---=> hardirq/softirq
|
|
|
+# ||| / _--=> preempt-depth
|
|
|
+# |||| /
|
|
|
+# ||||| delay
|
|
|
+# cmd pid ||||| time | caller
|
|
|
+# \ / ||||| \ | /
|
|
|
+ sshd-4261 0d..1 0us : _spin_lock_irqsave (remove_wait_queue)
|
|
|
+ sshd-4261 0d..1 1us : _spin_unlock_irqrestore (remove_wait_queue)
|
|
|
+ sshd-4261 0d..1 2us : do_IRQ (common_interrupt)
|
|
|
+ sshd-4261 0d..1 2us : irq_enter (do_IRQ)
|
|
|
+ sshd-4261 0d..1 2us : idle_cpu (irq_enter)
|
|
|
+ sshd-4261 0d..1 3us : add_preempt_count (irq_enter)
|
|
|
+ sshd-4261 0d.h1 3us : idle_cpu (irq_enter)
|
|
|
+ sshd-4261 0d.h. 4us : handle_fasteoi_irq (do_IRQ)
|
|
|
+[...]
|
|
|
+ sshd-4261 0d.h. 12us : add_preempt_count (_spin_lock)
|
|
|
+ sshd-4261 0d.h1 12us : ack_ioapic_quirk_irq (handle_fasteoi_irq)
|
|
|
+ sshd-4261 0d.h1 13us : move_native_irq (ack_ioapic_quirk_irq)
|
|
|
+ sshd-4261 0d.h1 13us : _spin_unlock (handle_fasteoi_irq)
|
|
|
+ sshd-4261 0d.h1 14us : sub_preempt_count (_spin_unlock)
|
|
|
+ sshd-4261 0d.h1 14us : irq_exit (do_IRQ)
|
|
|
+ sshd-4261 0d.h1 15us : sub_preempt_count (irq_exit)
|
|
|
+ sshd-4261 0d..2 15us : do_softirq (irq_exit)
|
|
|
+ sshd-4261 0d... 15us : __do_softirq (do_softirq)
|
|
|
+ sshd-4261 0d... 16us : __local_bh_disable (__do_softirq)
|
|
|
+ sshd-4261 0d... 16us+: add_preempt_count (__local_bh_disable)
|
|
|
+ sshd-4261 0d.s4 20us : add_preempt_count (__local_bh_disable)
|
|
|
+ sshd-4261 0d.s4 21us : sub_preempt_count (local_bh_enable)
|
|
|
+ sshd-4261 0d.s5 21us : sub_preempt_count (local_bh_enable)
|
|
|
+[...]
|
|
|
+ sshd-4261 0d.s6 41us : add_preempt_count (__local_bh_disable)
|
|
|
+ sshd-4261 0d.s6 42us : sub_preempt_count (local_bh_enable)
|
|
|
+ sshd-4261 0d.s7 42us : sub_preempt_count (local_bh_enable)
|
|
|
+ sshd-4261 0d.s5 43us : add_preempt_count (__local_bh_disable)
|
|
|
+ sshd-4261 0d.s5 43us : sub_preempt_count (local_bh_enable_ip)
|
|
|
+ sshd-4261 0d.s6 44us : sub_preempt_count (local_bh_enable_ip)
|
|
|
+ sshd-4261 0d.s5 44us : add_preempt_count (__local_bh_disable)
|
|
|
+ sshd-4261 0d.s5 45us : sub_preempt_count (local_bh_enable)
|
|
|
+[...]
|
|
|
+ sshd-4261 0d.s. 63us : _local_bh_enable (__do_softirq)
|
|
|
+ sshd-4261 0d.s1 64us : trace_preempt_on (__do_softirq)
|
|
|
+
|
|
|
+
|
|
|
+The above is an example of the preemptoff trace with
|
|
|
+ftrace_enabled set. Here we see that interrupts were disabled
|
|
|
+the entire time. The irq_enter code lets us know that we entered
|
|
|
+an interrupt 'h'. Before that, the functions being traced still
|
|
|
+show that it is not in an interrupt, but we can see from the
|
|
|
+functions themselves that this is not the case.
|
|
|
+
|
|
|
+Notice that __do_softirq when called does not have a
|
|
|
+preempt_count. It may seem that we missed a preempt enabling.
|
|
|
+What really happened is that the preempt count is held on the
|
|
|
+thread's stack and we switched to the softirq stack (4K stacks
|
|
|
+in effect). The code does not copy the preempt count, but
|
|
|
+because interrupts are disabled, we do not need to worry about
|
|
|
+it. Having a tracer like this is good for letting people know
|
|
|
+what really happens inside the kernel.
|
|
|
+
|
|
|
+
|
|
|
+preemptirqsoff
|
|
|
+--------------
|
|
|
+
|
|
|
+Knowing the locations that have interrupts disabled or
|
|
|
+preemption disabled for the longest times is helpful. But
|
|
|
+sometimes we would like to know the total time for which either
|
|
|
+preemption and/or interrupts are disabled.
|
|
|
+
|
|
|
+Consider the following code:
|
|
|
+
|
|
|
+ local_irq_disable();
|
|
|
+ call_function_with_irqs_off();
|
|
|
+ preempt_disable();
|
|
|
+ call_function_with_irqs_and_preemption_off();
|
|
|
+ local_irq_enable();
|
|
|
+ call_function_with_preemption_off();
|
|
|
+ preempt_enable();
|
|
|
+
|
|
|
+The irqsoff tracer will record the total length of
|
|
|
+call_function_with_irqs_off() and
|
|
|
+call_function_with_irqs_and_preemption_off().
|
|
|
+
|
|
|
+The preemptoff tracer will record the total length of
|
|
|
+call_function_with_irqs_and_preemption_off() and
|
|
|
+call_function_with_preemption_off().
|
|
|
+
|
|
|
+But neither will trace the time that interrupts and/or
|
|
|
+preemption is disabled. This total time is the time that we can
|
|
|
+not schedule. To record this time, use the preemptirqsoff
|
|
|
+tracer.
|
|
|
+
|
|
|
+Again, using this trace is much like the irqsoff and preemptoff
|
|
|
+tracers.
|
|
|
+
|
|
|
+ # echo preemptirqsoff > /debug/tracing/current_tracer
|
|
|
+ # echo 0 > /debug/tracing/tracing_max_latency
|
|
|
+ # echo 1 > /debug/tracing/tracing_enabled
|
|
|
+ # ls -ltr
|
|
|
+ [...]
|
|
|
+ # echo 0 > /debug/tracing/tracing_enabled
|
|
|
+ # cat /debug/tracing/latency_trace
|
|
|
+# tracer: preemptirqsoff
|
|
|
+#
|
|
|
+preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8
|
|
|
+--------------------------------------------------------------------
|
|
|
+ latency: 293 us, #3/3, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
|
|
|
+ -----------------
|
|
|
+ | task: ls-4860 (uid:0 nice:0 policy:0 rt_prio:0)
|
|
|
+ -----------------
|
|
|
+ => started at: apic_timer_interrupt
|
|
|
+ => ended at: __do_softirq
|
|
|
+
|
|
|
+# _------=> CPU#
|
|
|
+# / _-----=> irqs-off
|
|
|
+# | / _----=> need-resched
|
|
|
+# || / _---=> hardirq/softirq
|
|
|
+# ||| / _--=> preempt-depth
|
|
|
+# |||| /
|
|
|
+# ||||| delay
|
|
|
+# cmd pid ||||| time | caller
|
|
|
+# \ / ||||| \ | /
|
|
|
+ ls-4860 0d... 0us!: trace_hardirqs_off_thunk (apic_timer_interrupt)
|
|
|
+ ls-4860 0d.s. 294us : _local_bh_enable (__do_softirq)
|
|
|
+ ls-4860 0d.s1 294us : trace_preempt_on (__do_softirq)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+The trace_hardirqs_off_thunk is called from assembly on x86 when
|
|
|
+interrupts are disabled in the assembly code. Without the
|
|
|
+function tracing, we do not know if interrupts were enabled
|
|
|
+within the preemption points. We do see that it started with
|
|
|
+preemption enabled.
|
|
|
+
|
|
|
+Here is a trace with ftrace_enabled set:
|
|
|
+
|
|
|
+
|
|
|
+# tracer: preemptirqsoff
|
|
|
+#
|
|
|
+preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8
|
|
|
+--------------------------------------------------------------------
|
|
|
+ latency: 105 us, #183/183, CPU#0 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
|
|
|
+ -----------------
|
|
|
+ | task: sshd-4261 (uid:0 nice:0 policy:0 rt_prio:0)
|
|
|
+ -----------------
|
|
|
+ => started at: write_chan
|
|
|
+ => ended at: __do_softirq
|
|
|
+
|
|
|
+# _------=> CPU#
|
|
|
+# / _-----=> irqs-off
|
|
|
+# | / _----=> need-resched
|
|
|
+# || / _---=> hardirq/softirq
|
|
|
+# ||| / _--=> preempt-depth
|
|
|
+# |||| /
|
|
|
+# ||||| delay
|
|
|
+# cmd pid ||||| time | caller
|
|
|
+# \ / ||||| \ | /
|
|
|
+ ls-4473 0.N.. 0us : preempt_schedule (write_chan)
|
|
|
+ ls-4473 0dN.1 1us : _spin_lock (schedule)
|
|
|
+ ls-4473 0dN.1 2us : add_preempt_count (_spin_lock)
|
|
|
+ ls-4473 0d..2 2us : put_prev_task_fair (schedule)
|
|
|
+[...]
|
|
|
+ ls-4473 0d..2 13us : set_normalized_timespec (ktime_get_ts)
|
|
|
+ ls-4473 0d..2 13us : __switch_to (schedule)
|
|
|
+ sshd-4261 0d..2 14us : finish_task_switch (schedule)
|
|
|
+ sshd-4261 0d..2 14us : _spin_unlock_irq (finish_task_switch)
|
|
|
+ sshd-4261 0d..1 15us : add_preempt_count (_spin_lock_irqsave)
|
|
|
+ sshd-4261 0d..2 16us : _spin_unlock_irqrestore (hrtick_set)
|
|
|
+ sshd-4261 0d..2 16us : do_IRQ (common_interrupt)
|
|
|
+ sshd-4261 0d..2 17us : irq_enter (do_IRQ)
|
|
|
+ sshd-4261 0d..2 17us : idle_cpu (irq_enter)
|
|
|
+ sshd-4261 0d..2 18us : add_preempt_count (irq_enter)
|
|
|
+ sshd-4261 0d.h2 18us : idle_cpu (irq_enter)
|
|
|
+ sshd-4261 0d.h. 18us : handle_fasteoi_irq (do_IRQ)
|
|
|
+ sshd-4261 0d.h. 19us : _spin_lock (handle_fasteoi_irq)
|
|
|
+ sshd-4261 0d.h. 19us : add_preempt_count (_spin_lock)
|
|
|
+ sshd-4261 0d.h1 20us : _spin_unlock (handle_fasteoi_irq)
|
|
|
+ sshd-4261 0d.h1 20us : sub_preempt_count (_spin_unlock)
|
|
|
+[...]
|
|
|
+ sshd-4261 0d.h1 28us : _spin_unlock (handle_fasteoi_irq)
|
|
|
+ sshd-4261 0d.h1 29us : sub_preempt_count (_spin_unlock)
|
|
|
+ sshd-4261 0d.h2 29us : irq_exit (do_IRQ)
|
|
|
+ sshd-4261 0d.h2 29us : sub_preempt_count (irq_exit)
|
|
|
+ sshd-4261 0d..3 30us : do_softirq (irq_exit)
|
|
|
+ sshd-4261 0d... 30us : __do_softirq (do_softirq)
|
|
|
+ sshd-4261 0d... 31us : __local_bh_disable (__do_softirq)
|
|
|
+ sshd-4261 0d... 31us+: add_preempt_count (__local_bh_disable)
|
|
|
+ sshd-4261 0d.s4 34us : add_preempt_count (__local_bh_disable)
|
|
|
+[...]
|
|
|
+ sshd-4261 0d.s3 43us : sub_preempt_count (local_bh_enable_ip)
|
|
|
+ sshd-4261 0d.s4 44us : sub_preempt_count (local_bh_enable_ip)
|
|
|
+ sshd-4261 0d.s3 44us : smp_apic_timer_interrupt (apic_timer_interrupt)
|
|
|
+ sshd-4261 0d.s3 45us : irq_enter (smp_apic_timer_interrupt)
|
|
|
+ sshd-4261 0d.s3 45us : idle_cpu (irq_enter)
|
|
|
+ sshd-4261 0d.s3 46us : add_preempt_count (irq_enter)
|
|
|
+ sshd-4261 0d.H3 46us : idle_cpu (irq_enter)
|
|
|
+ sshd-4261 0d.H3 47us : hrtimer_interrupt (smp_apic_timer_interrupt)
|
|
|
+ sshd-4261 0d.H3 47us : ktime_get (hrtimer_interrupt)
|
|
|
+[...]
|
|
|
+ sshd-4261 0d.H3 81us : tick_program_event (hrtimer_interrupt)
|
|
|
+ sshd-4261 0d.H3 82us : ktime_get (tick_program_event)
|
|
|
+ sshd-4261 0d.H3 82us : ktime_get_ts (ktime_get)
|
|
|
+ sshd-4261 0d.H3 83us : getnstimeofday (ktime_get_ts)
|
|
|
+ sshd-4261 0d.H3 83us : set_normalized_timespec (ktime_get_ts)
|
|
|
+ sshd-4261 0d.H3 84us : clockevents_program_event (tick_program_event)
|
|
|
+ sshd-4261 0d.H3 84us : lapic_next_event (clockevents_program_event)
|
|
|
+ sshd-4261 0d.H3 85us : irq_exit (smp_apic_timer_interrupt)
|
|
|
+ sshd-4261 0d.H3 85us : sub_preempt_count (irq_exit)
|
|
|
+ sshd-4261 0d.s4 86us : sub_preempt_count (irq_exit)
|
|
|
+ sshd-4261 0d.s3 86us : add_preempt_count (__local_bh_disable)
|
|
|
+[...]
|
|
|
+ sshd-4261 0d.s1 98us : sub_preempt_count (net_rx_action)
|
|
|
+ sshd-4261 0d.s. 99us : add_preempt_count (_spin_lock_irq)
|
|
|
+ sshd-4261 0d.s1 99us+: _spin_unlock_irq (run_timer_softirq)
|
|
|
+ sshd-4261 0d.s. 104us : _local_bh_enable (__do_softirq)
|
|
|
+ sshd-4261 0d.s. 104us : sub_preempt_count (_local_bh_enable)
|
|
|
+ sshd-4261 0d.s. 105us : _local_bh_enable (__do_softirq)
|
|
|
+ sshd-4261 0d.s1 105us : trace_preempt_on (__do_softirq)
|
|
|
+
|
|
|
+
|
|
|
+This is a very interesting trace. It started with the preemption
|
|
|
+of the ls task. We see that the task had the "need_resched" bit
|
|
|
+set via the 'N' in the trace. Interrupts were disabled before
|
|
|
+the spin_lock at the beginning of the trace. We see that a
|
|
|
+schedule took place to run sshd. When the interrupts were
|
|
|
+enabled, we took an interrupt. On return from the interrupt
|
|
|
+handler, the softirq ran. We took another interrupt while
|
|
|
+running the softirq as we see from the capital 'H'.
|
|
|
+
|
|
|
+
|
|
|
+wakeup
|
|
|
+------
|
|
|
+
|
|
|
+In a Real-Time environment it is very important to know the
|
|
|
+time it takes from when the highest priority task is woken
|
|
|
+up to when it actually executes. This is also known as "schedule
|
|
|
+latency". I stress the point that this is about RT tasks. It is
|
|
|
+also important to know the scheduling latency of non-RT tasks,
|
|
|
+but for those the average schedule latency usually matters more.
|
|
|
+Tools like LatencyTop are more appropriate for such
|
|
|
+measurements.
|
|
|
+
|
|
|
+Real-Time environments are interested in the worst case latency.
|
|
|
+That is the longest latency it takes for something to happen,
|
|
|
+and not the average. We can have a very fast scheduler that may
|
|
|
+only have a large latency once in a while, but that would not
|
|
|
+work well with Real-Time tasks. The wakeup tracer was designed
|
|
|
+to record the worst case wakeups of RT tasks. Non-RT tasks are
|
|
|
+not recorded because the tracer only records one worst case and
|
|
|
+tracing non-RT tasks that are unpredictable will overwrite the
|
|
|
+worst case latency of RT tasks.
|
|
|
+
|
|
|
+Since this tracer only deals with RT tasks, we will run this
|
|
|
+slightly differently than we did with the previous tracers.
|
|
|
+Instead of performing an 'ls', we will run 'sleep 1' under
|
|
|
+'chrt' which changes the priority of the task.
|
|
|
+
|
|
|
+ # echo wakeup > /debug/tracing/current_tracer
|
|
|
+ # echo 0 > /debug/tracing/tracing_max_latency
|
|
|
+ # echo 1 > /debug/tracing/tracing_enabled
|
|
|
+ # chrt -f 5 sleep 1
|
|
|
+ # echo 0 > /debug/tracing/tracing_enabled
|
|
|
+ # cat /debug/tracing/latency_trace
|
|
|
+# tracer: wakeup
|
|
|
+#
|
|
|
+wakeup latency trace v1.1.5 on 2.6.26-rc8
|
|
|
+--------------------------------------------------------------------
|
|
|
+ latency: 4 us, #2/2, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
|
|
|
+ -----------------
|
|
|
+ | task: sleep-4901 (uid:0 nice:0 policy:1 rt_prio:5)
|
|
|
+ -----------------
|
|
|
+
|
|
|
+# _------=> CPU#
|
|
|
+# / _-----=> irqs-off
|
|
|
+# | / _----=> need-resched
|
|
|
+# || / _---=> hardirq/softirq
|
|
|
+# ||| / _--=> preempt-depth
|
|
|
+# |||| /
|
|
|
+# ||||| delay
|
|
|
+# cmd pid ||||| time | caller
|
|
|
+# \ / ||||| \ | /
|
|
|
+ <idle>-0 1d.h4 0us+: try_to_wake_up (wake_up_process)
|
|
|
+ <idle>-0 1d..4 4us : schedule (cpu_idle)
|
|
|
+
|
|
|
+
|
|
|
+Running this on an idle system, we see that it only took 4
|
|
|
+microseconds to perform the task switch. Note, since the trace
|
|
|
+marker in the schedule is before the actual "switch", we stop
|
|
|
+the tracing when the recorded task is about to schedule in. This
|
|
|
+may change if we add a new marker at the end of the scheduler.
|
|
|
+
|
|
|
+Notice that the recorded task is 'sleep' with the PID of 4901
|
|
|
+and it has an rt_prio of 5. This priority is user-space priority
|
|
|
+and not the internal kernel priority. The policy is 1 for
|
|
|
+SCHED_FIFO and 2 for SCHED_RR.
|
|
|
+
|
|
|
+Here is the same run with chrt -r 5 and ftrace_enabled set:
|
|
|
+
|
|
|
+# tracer: wakeup
|
|
|
+#
|
|
|
+wakeup latency trace v1.1.5 on 2.6.26-rc8
|
|
|
+--------------------------------------------------------------------
|
|
|
+ latency: 50 us, #60/60, CPU#1 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:2)
|
|
|
+ -----------------
|
|
|
+ | task: sleep-4068 (uid:0 nice:0 policy:2 rt_prio:5)
|
|
|
+ -----------------
|
|
|
+
|
|
|
+# _------=> CPU#
|
|
|
+# / _-----=> irqs-off
|
|
|
+# | / _----=> need-resched
|
|
|
+# || / _---=> hardirq/softirq
|
|
|
+# ||| / _--=> preempt-depth
|
|
|
+# |||| /
|
|
|
+# ||||| delay
|
|
|
+# cmd pid ||||| time | caller
|
|
|
+# \ / ||||| \ | /
|
|
|
+ksoftirq-7 1d.H3 0us : try_to_wake_up (wake_up_process)
|
|
|
+ksoftirq-7 1d.H4 1us : sub_preempt_count (marker_probe_cb)
|
|
|
+ksoftirq-7 1d.H3 2us : check_preempt_wakeup (try_to_wake_up)
|
|
|
+ksoftirq-7 1d.H3 3us : update_curr (check_preempt_wakeup)
|
|
|
+ksoftirq-7 1d.H3 4us : calc_delta_mine (update_curr)
|
|
|
+ksoftirq-7 1d.H3 5us : __resched_task (check_preempt_wakeup)
|
|
|
+ksoftirq-7 1d.H3 6us : task_wake_up_rt (try_to_wake_up)
|
|
|
+ksoftirq-7 1d.H3 7us : _spin_unlock_irqrestore (try_to_wake_up)
|
|
|
+[...]
|
|
|
+ksoftirq-7 1d.H2 17us : irq_exit (smp_apic_timer_interrupt)
|
|
|
+ksoftirq-7 1d.H2 18us : sub_preempt_count (irq_exit)
|
|
|
+ksoftirq-7 1d.s3 19us : sub_preempt_count (irq_exit)
|
|
|
+ksoftirq-7 1..s2 20us : rcu_process_callbacks (__do_softirq)
|
|
|
+[...]
|
|
|
+ksoftirq-7 1..s2 26us : __rcu_process_callbacks (rcu_process_callbacks)
|
|
|
+ksoftirq-7 1d.s2 27us : _local_bh_enable (__do_softirq)
|
|
|
+ksoftirq-7 1d.s2 28us : sub_preempt_count (_local_bh_enable)
|
|
|
+ksoftirq-7 1.N.3 29us : sub_preempt_count (ksoftirqd)
|
|
|
+ksoftirq-7 1.N.2 30us : _cond_resched (ksoftirqd)
|
|
|
+ksoftirq-7 1.N.2 31us : __cond_resched (_cond_resched)
|
|
|
+ksoftirq-7 1.N.2 32us : add_preempt_count (__cond_resched)
|
|
|
+ksoftirq-7 1.N.2 33us : schedule (__cond_resched)
|
|
|
+ksoftirq-7 1.N.2 33us : add_preempt_count (schedule)
|
|
|
+ksoftirq-7 1.N.3 34us : hrtick_clear (schedule)
|
|
|
+ksoftirq-7 1dN.3 35us : _spin_lock (schedule)
|
|
|
+ksoftirq-7 1dN.3 36us : add_preempt_count (_spin_lock)
|
|
|
+ksoftirq-7 1d..4 37us : put_prev_task_fair (schedule)
|
|
|
+ksoftirq-7 1d..4 38us : update_curr (put_prev_task_fair)
|
|
|
+[...]
|
|
|
+ksoftirq-7 1d..5 47us : _spin_trylock (tracing_record_cmdline)
|
|
|
+ksoftirq-7 1d..5 48us : add_preempt_count (_spin_trylock)
|
|
|
+ksoftirq-7 1d..6 49us : _spin_unlock (tracing_record_cmdline)
|
|
|
+ksoftirq-7 1d..6 49us : sub_preempt_count (_spin_unlock)
|
|
|
+ksoftirq-7 1d..4 50us : schedule (__cond_resched)
|
|
|
+
|
|
|
+The interrupt went off while running ksoftirqd. This task runs
|
|
|
+at SCHED_OTHER. Why didn't we see the 'N' set earlier? This may
|
|
|
+be a harmless bug with x86_32 and 4K stacks. On x86_32 with 4K
|
|
|
+stacks configured, the interrupt and softirq run with their own
|
|
|
+stack. Some information is held on the top of the task's stack
|
|
|
+(need_resched and preempt_count are both stored there). The
|
|
|
+setting of the NEED_RESCHED bit is done directly to the task's
|
|
|
+stack, but the reading of the NEED_RESCHED is done by looking at
|
|
|
+the current stack, which in this case is the stack for the hard
|
|
|
+interrupt. This hides the fact that NEED_RESCHED has been set.
|
|
|
+We do not see the 'N' until we switch back to the task's
|
|
|
+assigned stack.
|
|
|
+
|
|
|
+function
|
|
|
+--------
|
|
|
+
|
|
|
+This tracer is the function tracer. Enabling the function tracer
|
|
|
+can be done from the debug file system. Make sure that
|
|
|
+ftrace_enabled is set; otherwise this tracer is a nop.
|
|
|
+
|
|
|
+ # sysctl kernel.ftrace_enabled=1
|
|
|
+ # echo function > /debug/tracing/current_tracer
|
|
|
+ # echo 1 > /debug/tracing/tracing_enabled
|
|
|
+ # usleep 1
|
|
|
+ # echo 0 > /debug/tracing/tracing_enabled
|
|
|
+ # cat /debug/tracing/trace
|
|
|
+# tracer: function
|
|
|
+#
|
|
|
+# TASK-PID CPU# TIMESTAMP FUNCTION
|
|
|
+# | | | | |
|
|
|
+ bash-4003 [00] 123.638713: finish_task_switch <-schedule
|
|
|
+ bash-4003 [00] 123.638714: _spin_unlock_irq <-finish_task_switch
|
|
|
+ bash-4003 [00] 123.638714: sub_preempt_count <-_spin_unlock_irq
|
|
|
+ bash-4003 [00] 123.638715: hrtick_set <-schedule
|
|
|
+ bash-4003 [00] 123.638715: _spin_lock_irqsave <-hrtick_set
|
|
|
+ bash-4003 [00] 123.638716: add_preempt_count <-_spin_lock_irqsave
|
|
|
+ bash-4003 [00] 123.638716: _spin_unlock_irqrestore <-hrtick_set
|
|
|
+ bash-4003 [00] 123.638717: sub_preempt_count <-_spin_unlock_irqrestore
|
|
|
+ bash-4003 [00] 123.638717: hrtick_clear <-hrtick_set
|
|
|
+ bash-4003 [00] 123.638718: sub_preempt_count <-schedule
|
|
|
+ bash-4003 [00] 123.638718: sub_preempt_count <-preempt_schedule
|
|
|
+ bash-4003 [00] 123.638719: wait_for_completion <-__stop_machine_run
|
|
|
+ bash-4003 [00] 123.638719: wait_for_common <-wait_for_completion
|
|
|
+ bash-4003 [00] 123.638720: _spin_lock_irq <-wait_for_common
|
|
|
+ bash-4003 [00] 123.638720: add_preempt_count <-_spin_lock_irq
|
|
|
+[...]
|
|
|
+
|
|
|
+
|
|
|
+Note: function tracer uses ring buffers to store the above
|
|
|
+entries. The newest data may overwrite the oldest data.
|
|
|
+Sometimes using echo to stop the trace is not sufficient because
|
|
|
+the tracing could have overwritten the data that you wanted to
|
|
|
+record. For this reason, it is sometimes better to disable
|
|
|
+tracing directly from a program. This allows you to stop the
|
|
|
+tracing at the point that you hit the part that you are
|
|
|
+interested in. To disable the tracing directly from a C program,
|
|
|
+something like the following code snippet can be used:
|
|
|
+
|
|
|
+#include <fcntl.h>
+#include <unistd.h>
+
+int trace_fd;
|
|
|
+[...]
|
|
|
+int main(int argc, char *argv[]) {
|
|
|
+ [...]
|
|
|
+ trace_fd = open("/debug/tracing/tracing_enabled", O_WRONLY);
|
|
|
+ [...]
|
|
|
+ if (condition_hit()) {
|
|
|
+ write(trace_fd, "0", 1);
|
|
|
+ }
|
|
|
+ [...]
|
|
|
+}
|
|
|
+
|
|
|
+Note: Here we hard coded the path name. The debugfs mount is not
|
|
|
+guaranteed to be at /debug (and is more commonly at
|
|
|
+/sys/kernel/debug). For simple one time traces, the above is
|
|
|
+sufficient. For anything else, a search through /proc/mounts may
|
|
|
+be needed to find where the debugfs file-system is mounted.
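+
+For example, a quick way to locate the debugfs mount point from
+a shell (scripts can parse the second field of the matching
+line):
+
+ # grep debugfs /proc/mounts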
|
|
|
+
|
|
|
+
|
|
|
+Single thread tracing
|
|
|
+---------------------
|
|
|
+
|
|
|
+By writing into /debug/tracing/set_ftrace_pid you can trace a
|
|
|
+single thread. For example:
|
|
|
+
|
|
|
+# cat /debug/tracing/set_ftrace_pid
|
|
|
+no pid
|
|
|
+# echo 3111 > /debug/tracing/set_ftrace_pid
|
|
|
+# cat /debug/tracing/set_ftrace_pid
|
|
|
+3111
|
|
|
+# echo function > /debug/tracing/current_tracer
|
|
|
+# cat /debug/tracing/trace | head
|
|
|
+ # tracer: function
|
|
|
+ #
|
|
|
+ # TASK-PID CPU# TIMESTAMP FUNCTION
|
|
|
+ # | | | | |
|
|
|
+ yum-updatesd-3111 [003] 1637.254676: finish_task_switch <-thread_return
|
|
|
+ yum-updatesd-3111 [003] 1637.254681: hrtimer_cancel <-schedule_hrtimeout_range
|
|
|
+ yum-updatesd-3111 [003] 1637.254682: hrtimer_try_to_cancel <-hrtimer_cancel
|
|
|
+ yum-updatesd-3111 [003] 1637.254683: lock_hrtimer_base <-hrtimer_try_to_cancel
|
|
|
+ yum-updatesd-3111 [003] 1637.254685: fget_light <-do_sys_poll
|
|
|
+ yum-updatesd-3111 [003] 1637.254686: pipe_poll <-do_sys_poll
|
|
|
+# echo -1 > /debug/tracing/set_ftrace_pid
|
|
|
+# cat /debug/tracing/trace |head
|
|
|
+ # tracer: function
|
|
|
+ #
|
|
|
+ # TASK-PID CPU# TIMESTAMP FUNCTION
|
|
|
+ # | | | | |
|
|
|
+ ##### CPU 3 buffer started ####
|
|
|
+ yum-updatesd-3111 [003] 1701.957688: free_poll_entry <-poll_freewait
|
|
|
+ yum-updatesd-3111 [003] 1701.957689: remove_wait_queue <-free_poll_entry
|
|
|
+ yum-updatesd-3111 [003] 1701.957691: fput <-free_poll_entry
|
|
|
+ yum-updatesd-3111 [003] 1701.957692: audit_syscall_exit <-sysret_audit
|
|
|
+ yum-updatesd-3111 [003] 1701.957693: path_put <-audit_syscall_exit
|
|
|
+
|
|
|
+If you want to trace a program from the moment it starts executing,
|
|
|
+you could use something like this simple program:
|
|
|
+
|
|
|
+#include <stdio.h>
|
|
|
+#include <stdlib.h>
|
|
|
+#include <sys/types.h>
|
|
|
+#include <sys/stat.h>
|
|
|
+#include <fcntl.h>
|
|
|
+#include <unistd.h>
|
|
|
+
|
|
|
+int main (int argc, char **argv)
|
|
|
+{
|
|
|
+ if (argc < 2) /* need a program name to exec */
|
|
|
+ exit(-1);
|
|
|
+
|
|
|
+ if (fork() > 0) {
|
|
|
+ int fd, ffd;
|
|
|
+ char line[64];
|
|
|
+ int s;
|
|
|
+
|
|
|
+ ffd = open("/debug/tracing/current_tracer", O_WRONLY);
|
|
|
+ if (ffd < 0)
|
|
|
+ exit(-1);
|
|
|
+ write(ffd, "nop", 3);
|
|
|
+
|
|
|
+ fd = open("/debug/tracing/set_ftrace_pid", O_WRONLY);
|
|
|
+ s = sprintf(line, "%d\n", getpid());
|
|
|
+ write(fd, line, s);
|
|
|
+
|
|
|
+ write(ffd, "function", 8);
|
|
|
+
|
|
|
+ close(fd);
|
|
|
+ close(ffd);
|
|
|
+
|
|
|
+ execvp(argv[1], argv+1);
|
|
|
+ }
|
|
|
+
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
+hw-branch-tracer (x86 only)
|
|
|
+---------------------------
|
|
|
+
|
|
|
+This tracer uses the x86 last branch tracing hardware feature to
|
|
|
+collect a branch trace on all cpus with relatively low overhead.
|
|
|
+
|
|
|
+The tracer uses a fixed-size circular buffer per cpu and only
|
|
|
+traces ring 0 branches. The trace file dumps that buffer in the
|
|
|
+following format:
|
|
|
+
|
|
|
+# tracer: hw-branch-tracer
|
|
|
+#
|
|
|
+# CPU# TO <- FROM
|
|
|
+ 0 scheduler_tick+0xb5/0x1bf <- task_tick_idle+0x5/0x6
|
|
|
+ 2 run_posix_cpu_timers+0x2b/0x72a <- run_posix_cpu_timers+0x25/0x72a
|
|
|
+ 0 scheduler_tick+0x139/0x1bf <- scheduler_tick+0xed/0x1bf
|
|
|
+ 0 scheduler_tick+0x17c/0x1bf <- scheduler_tick+0x148/0x1bf
|
|
|
+ 2 run_posix_cpu_timers+0x9e/0x72a <- run_posix_cpu_timers+0x5e/0x72a
|
|
|
+ 0 scheduler_tick+0x1b6/0x1bf <- scheduler_tick+0x1aa/0x1bf
|
|
|
+
|
|
|
+
|
|
|
+On a kernel oops, the tracer may be used to dump the trace for
|
|
|
+the oops'ing cpu into the system log. To enable this,
|
|
|
+ftrace_dump_on_oops must be set. To set ftrace_dump_on_oops, one
|
|
|
+can either use the sysctl function or set it via the proc system
|
|
|
+interface.
|
|
|
+
|
|
|
+ sysctl kernel.ftrace_dump_on_oops=1
|
|
|
+
|
|
|
+or
|
|
|
+
|
|
|
+ echo 1 > /proc/sys/kernel/ftrace_dump_on_oops
|
|
|
+
|
|
|
+
|
|
|
+Here's an example of such a dump after a null pointer
|
|
|
+dereference in a kernel module:
|
|
|
+
|
|
|
+[57848.105921] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
|
|
|
+[57848.106019] IP: [<ffffffffa0000006>] open+0x6/0x14 [oops]
|
|
|
+[57848.106019] PGD 2354e9067 PUD 2375e7067 PMD 0
|
|
|
+[57848.106019] Oops: 0002 [#1] SMP
|
|
|
+[57848.106019] last sysfs file: /sys/devices/pci0000:00/0000:00:1e.0/0000:20:05.0/local_cpus
|
|
|
+[57848.106019] Dumping ftrace buffer:
|
|
|
+[57848.106019] ---------------------------------
|
|
|
+[...]
|
|
|
+[57848.106019] 0 chrdev_open+0xe6/0x165 <- cdev_put+0x23/0x24
|
|
|
+[57848.106019] 0 chrdev_open+0x117/0x165 <- chrdev_open+0xfa/0x165
|
|
|
+[57848.106019] 0 chrdev_open+0x120/0x165 <- chrdev_open+0x11c/0x165
|
|
|
+[57848.106019] 0 chrdev_open+0x134/0x165 <- chrdev_open+0x12b/0x165
|
|
|
+[57848.106019] 0 open+0x0/0x14 [oops] <- chrdev_open+0x144/0x165
|
|
|
+[57848.106019] 0 page_fault+0x0/0x30 <- open+0x6/0x14 [oops]
|
|
|
+[57848.106019] 0 error_entry+0x0/0x5b <- page_fault+0x4/0x30
|
|
|
+[57848.106019] 0 error_kernelspace+0x0/0x31 <- error_entry+0x59/0x5b
|
|
|
+[57848.106019] 0 error_sti+0x0/0x1 <- error_kernelspace+0x2d/0x31
|
|
|
+[57848.106019] 0 page_fault+0x9/0x30 <- error_sti+0x0/0x1
|
|
|
+[57848.106019] 0 do_page_fault+0x0/0x881 <- page_fault+0x1a/0x30
|
|
|
+[...]
|
|
|
+[57848.106019] 0 do_page_fault+0x66b/0x881 <- is_prefetch+0x1ee/0x1f2
|
|
|
+[57848.106019] 0 do_page_fault+0x6e0/0x881 <- do_page_fault+0x67a/0x881
|
|
|
+[57848.106019] 0 oops_begin+0x0/0x96 <- do_page_fault+0x6e0/0x881
|
|
|
+[57848.106019] 0 trace_hw_branch_oops+0x0/0x2d <- oops_begin+0x9/0x96
|
|
|
+[...]
|
|
|
+[57848.106019] 0 ds_suspend_bts+0x2a/0xe3 <- ds_suspend_bts+0x1a/0xe3
|
|
|
+[57848.106019] ---------------------------------
|
|
|
+[57848.106019] CPU 0
|
|
|
+[57848.106019] Modules linked in: oops
|
|
|
+[57848.106019] Pid: 5542, comm: cat Tainted: G W 2.6.28 #23
|
|
|
+[57848.106019] RIP: 0010:[<ffffffffa0000006>] [<ffffffffa0000006>] open+0x6/0x14 [oops]
|
|
|
+[57848.106019] RSP: 0018:ffff880235457d48 EFLAGS: 00010246
|
|
|
+[...]
|
|
|
+
|
|
|
+
|
|
|
+function graph tracer
|
|
|
+---------------------
|
|
|
+
|
|
|
+This tracer is similar to the function tracer except that it
|
|
|
+probes a function on its entry and its exit. This is done by
|
|
|
+using a dynamically allocated stack of return addresses in each
|
|
|
+task_struct. On function entry the tracer overwrites the return
|
|
|
+address of each function traced to set a custom probe. Thus the
|
|
|
+original return address is stored on the stack of return addresses
|
|
|
+in the task_struct.
|
|
|
+
|
|
|
+Probing on both ends of a function leads to special features
|
|
|
+such as:
|
|
|
+
|
|
|
+- measure of a function's execution time
|
|
|
+- having a reliable call stack to draw a graph of function calls
|
|
|
+
|
|
|
+This tracer is useful in several situations:
|
|
|
+
|
|
|
+- you want to find the reason for strange kernel behavior and
|
|
|
+ need to see in detail what happens in any area (or specific
|
|
|
+ ones).
|
|
|
+
|
|
|
+- you are experiencing weird latencies but it's difficult to
|
|
|
+ find their origin.
|
|
|
+
|
|
|
+- you want to find quickly which path is taken by a specific
|
|
|
+ function.
|
|
|
+
|
|
|
+- you just want to peek inside a working kernel and want to see
|
|
|
+ what happens there.
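+
+A minimal way to produce output like the sample below, following
+the same steps used for the function tracer earlier:
+
+ # echo function_graph > /debug/tracing/current_tracer
+ # echo 1 > /debug/tracing/tracing_enabled
+ # usleep 1
+ # echo 0 > /debug/tracing/tracing_enabled
+ # cat /debug/tracing/trace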
|
|
|
+
|
|
|
+# tracer: function_graph
|
|
|
+#
|
|
|
+# CPU DURATION FUNCTION CALLS
|
|
|
+# | | | | | | |
|
|
|
+
|
|
|
+ 0) | sys_open() {
|
|
|
+ 0) | do_sys_open() {
|
|
|
+ 0) | getname() {
|
|
|
+ 0) | kmem_cache_alloc() {
|
|
|
+ 0) 1.382 us | __might_sleep();
|
|
|
+ 0) 2.478 us | }
|
|
|
+ 0) | strncpy_from_user() {
|
|
|
+ 0) | might_fault() {
|
|
|
+ 0) 1.389 us | __might_sleep();
|
|
|
+ 0) 2.553 us | }
|
|
|
+ 0) 3.807 us | }
|
|
|
+ 0) 7.876 us | }
|
|
|
+ 0) | alloc_fd() {
|
|
|
+ 0) 0.668 us | _spin_lock();
|
|
|
+ 0) 0.570 us | expand_files();
|
|
|
+ 0) 0.586 us | _spin_unlock();
|
|
|
+
|
|
|
+
|
|
|
+There are several columns that can be dynamically
|
|
|
+enabled/disabled. You can use every combination of options you
|
|
|
+want, depending on your needs.
|
|
|
+
|
|
|
+- The cpu number on which the function executed is enabled by
|
|
|
+ default. It is sometimes better to only trace one cpu (see the
|
|
|
+ tracing_cpumask file), otherwise you might sometimes see function
|
|
|
+ calls out of order when tracing switches between cpus.
|
|
|
+
|
|
|
+ hide: echo nofuncgraph-cpu > /debug/tracing/trace_options
|
|
|
+ show: echo funcgraph-cpu > /debug/tracing/trace_options
|
|
|
+
|
|
|
+- The duration (the function's execution time) is displayed on
|
|
|
+ the closing bracket line of a function, or on the same line as
|
|
|
+ the function itself in the case of a leaf function. It is
|
|
|
+ enabled by default.
|
|
|
+
|
|
|
+ hide: echo nofuncgraph-duration > /debug/tracing/trace_options
|
|
|
+ show: echo funcgraph-duration > /debug/tracing/trace_options
|
|
|
+
|
|
|
+- The overhead field precedes the duration field when the
|
|
|
+ duration exceeds certain thresholds.
|
|
|
+
|
|
|
+ hide: echo nofuncgraph-overhead > /debug/tracing/trace_options
|
|
|
+ show: echo funcgraph-overhead > /debug/tracing/trace_options
|
|
|
+ depends on: funcgraph-duration
|
|
|
+
|
|
|
+ ie:
|
|
|
+
|
|
|
+ 0) | up_write() {
|
|
|
+ 0) 0.646 us | _spin_lock_irqsave();
|
|
|
+ 0) 0.684 us | _spin_unlock_irqrestore();
|
|
|
+ 0) 3.123 us | }
|
|
|
+ 0) 0.548 us | fput();
|
|
|
+ 0) + 58.628 us | }
|
|
|
+
|
|
|
+ [...]
|
|
|
+
|
|
|
+ 0) | putname() {
|
|
|
+ 0) | kmem_cache_free() {
|
|
|
+ 0) 0.518 us | __phys_addr();
|
|
|
+ 0) 1.757 us | }
|
|
|
+ 0) 2.861 us | }
|
|
|
+ 0) ! 115.305 us | }
|
|
|
+ 0) ! 116.402 us | }
|
|
|
+
|
|
|
+ + means that the function exceeded 10 usecs.
|
|
|
+ ! means that the function exceeded 100 usecs.
|
|
|
+
|
|
|
+
|
|
|
+- The task/pid field displays the thread cmdline and pid which
|
|
|
+ executed the function. It is disabled by default.
|
|
|
+
|
|
|
+ hide: echo nofuncgraph-proc > /debug/tracing/trace_options
|
|
|
+ show: echo funcgraph-proc > /debug/tracing/trace_options
|
|
|
+
|
|
|
+ ie:
|
|
|
+
|
|
|
+ # tracer: function_graph
|
|
|
+ #
|
|
|
+ # CPU TASK/PID DURATION FUNCTION CALLS
|
|
|
+ # | | | | | | | | |
|
|
|
+ 0) sh-4802 | | d_free() {
|
|
|
+ 0) sh-4802 | | call_rcu() {
|
|
|
+ 0) sh-4802 | | __call_rcu() {
|
|
|
+ 0) sh-4802 | 0.616 us | rcu_process_gp_end();
|
|
|
+ 0) sh-4802 | 0.586 us | check_for_new_grace_period();
|
|
|
+ 0) sh-4802 | 2.899 us | }
|
|
|
+ 0) sh-4802 | 4.040 us | }
|
|
|
+ 0) sh-4802 | 5.151 us | }
|
|
|
+ 0) sh-4802 | + 49.370 us | }
|
|
|
+
|
|
|
+
|
|
|
+- The absolute time field is an absolute timestamp given by the
|
|
|
+ system clock since boot. A snapshot of this time is
|
|
|
+ taken on each function entry and exit.
|
|
|
+
|
|
|
+ hide: echo nofuncgraph-abstime > /debug/tracing/trace_options
|
|
|
+ show: echo funcgraph-abstime > /debug/tracing/trace_options
|
|
|
+
|
|
|
+ ie:
|
|
|
+
|
|
|
+ #
|
|
|
+ # TIME CPU DURATION FUNCTION CALLS
|
|
|
+ # | | | | | | | |
|
|
|
+ 360.774522 | 1) 0.541 us | }
|
|
|
+ 360.774522 | 1) 4.663 us | }
|
|
|
+ 360.774523 | 1) 0.541 us | __wake_up_bit();
|
|
|
+ 360.774524 | 1) 6.796 us | }
|
|
|
+ 360.774524 | 1) 7.952 us | }
|
|
|
+ 360.774525 | 1) 9.063 us | }
|
|
|
+ 360.774525 | 1) 0.615 us | journal_mark_dirty();
|
|
|
+ 360.774527 | 1) 0.578 us | __brelse();
|
|
|
+ 360.774528 | 1) | reiserfs_prepare_for_journal() {
|
|
|
+ 360.774528 | 1) | unlock_buffer() {
|
|
|
+ 360.774529 | 1) | wake_up_bit() {
|
|
|
+ 360.774529 | 1) | bit_waitqueue() {
|
|
|
+ 360.774530 | 1) 0.594 us | __phys_addr();
|
|
|
+
|
|
|
+
|
|
|
+You can put some comments on specific functions by using
|
|
|
+trace_printk(). For example, if you want to put a comment inside
|
|
|
+the __might_sleep() function, you just have to include
|
|
|
+<linux/ftrace.h> and call trace_printk() inside __might_sleep():
|
|
|
+
|
|
|
+trace_printk("I'm a comment!\n")
|
|
|
+
|
|
|
+will produce:
|
|
|
+
|
|
|
+ 1) | __might_sleep() {
|
|
|
+ 1) | /* I'm a comment! */
|
|
|
+ 1) 1.449 us | }
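+
+For reference, a complete annotation might look like the sketch
+below; the helper function here is hypothetical and only stands in
+for whatever function you actually want to comment:
+
+	#include <linux/ftrace.h>
+
+	/* hypothetical function, used only to show the annotation */
+	static void my_traced_helper(void)
+	{
+		trace_printk("I'm a comment!\n");
+	}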
|
|
|
+
|
|
|
+
|
|
|
+You might find other useful features for this tracer in the
|
|
|
+following "dynamic ftrace" section such as tracing only specific
|
|
|
+functions or tasks.
|
|
|
+
|
|
|
+dynamic ftrace
|
|
|
+--------------
|
|
|
+
|
|
|
+If CONFIG_DYNAMIC_FTRACE is set, the system will run with
|
|
|
+virtually no overhead when function tracing is disabled. The way
|
|
|
+this works is that the mcount function call (placed at the start of
|
|
|
+every kernel function, produced by the -pg switch in gcc)
|
|
|
+starts off pointing to a simple return. (Enabling FTRACE will
|
|
|
+include the -pg switch in the compiling of the kernel.)
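+
+Conceptually (and only as a sketch; the real call is emitted by the
+compiler at the instruction level), building with -pg makes every
+function begin with a call to mcount:
+
+	extern void mcount(void);	/* provided by the kernel/arch code */
+
+	/* Sketch of what a function compiled with -pg effectively does. */
+	void some_kernel_function(void)
+	{
+		mcount();	/* dynamic ftrace patches this call site to a
+				 * nop, or to a call into ftrace when tracing */
+		/* ... normal function body ... */
+	}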
|
|
|
+
|
|
|
+At compile time every C file object is run through the
|
|
|
+recordmcount.pl script (located in the scripts directory). This
|
|
|
+script will process the C object using objdump to find all the
|
|
|
+locations in the .text section that call mcount. (Note, only the
|
|
|
+.text section is processed, since processing other sections like
|
|
|
+.init.text may cause races due to those sections being freed).
|
|
|
+
|
|
|
+A new section called "__mcount_loc" is created that holds
|
|
|
+references to all the mcount call sites in the .text section.
|
|
|
+This section is compiled back into the original object. The
|
|
|
+final linker will add all these references into a single table.
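+
+As a rough illustration only (recordmcount.pl generates this at the
+object-file level, it is never written by hand), the table is simply
+an array of call-site addresses:
+
+	/* Illustrative only: an array of "call mcount" site addresses. */
+	static unsigned long example_mcount_loc[]
+		__attribute__((section("__mcount_loc"), used)) = {
+		0 /* addresses of the call sites found in .text go here */
+	};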
|
|
|
+
|
|
|
+On boot up, before SMP is initialized, the dynamic ftrace code
|
|
|
+scans this table and converts all of those call sites into nops. It
|
|
|
+also records the locations, which are added to the
|
|
|
+available_filter_functions list. Modules are processed as they
|
|
|
+are loaded and before they are executed. When a module is
|
|
|
+unloaded, it also removes its functions from the ftrace function
|
|
|
+list. This is automatic in the module unload code, and the
|
|
|
+module author does not need to worry about it.
|
|
|
+
|
|
|
+When tracing is enabled, kstop_machine is called to prevent
|
|
|
+races with the CPUs executing the code being modified (which can
|
|
|
+cause the CPU to do undesirable things), and the nops are
|
|
|
+patched back to calls. But this time, they do not call mcount
|
|
|
+(which is just a function stub). They now call into the ftrace
|
|
|
+infrastructure.
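+
+On x86, for example, the patching amounts to swapping a 5-byte call
+instruction with a 5-byte nop (the byte patterns below are purely
+illustrative and vary by architecture and kernel version):
+
+	/* Illustrative x86 byte patterns only. */
+	static const unsigned char call_mcount[5] = {
+		0xe8, 0x00, 0x00, 0x00, 0x00	/* call rel32, offset filled in */
+	};
+	static const unsigned char nop5[5] = {
+		0x0f, 0x1f, 0x44, 0x00, 0x00	/* 5-byte nop */
+	};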
|
|
|
+
|
|
|
+One special side-effect of recording the functions being
|
|
|
+traced is that we can now selectively choose which functions we
|
|
|
+wish to trace and for which ones we want the mcount calls to remain
|
|
|
+as nops.
|
|
|
+
|
|
|
+Two files are used, one for enabling and one for disabling the
|
|
|
+tracing of specified functions. They are:
|
|
|
+
|
|
|
+ set_ftrace_filter
|
|
|
+
|
|
|
+and
|
|
|
+
|
|
|
+ set_ftrace_notrace
|
|
|
+
|
|
|
+The available functions that you can add to these files are
|
|
|
+listed in:
|
|
|
+
|
|
|
+ available_filter_functions
|
|
|
+
|
|
|
+ # cat /debug/tracing/available_filter_functions
|
|
|
+put_prev_task_idle
|
|
|
+kmem_cache_create
|
|
|
+pick_next_task_rt
|
|
|
+get_online_cpus
|
|
|
+pick_next_task_fair
|
|
|
+mutex_lock
|
|
|
+[...]
|
|
|
+
|
|
|
+If I am only interested in sys_nanosleep and hrtimer_interrupt:
|
|
|
+
|
|
|
+ # echo sys_nanosleep hrtimer_interrupt \
|
|
|
+ > /debug/tracing/set_ftrace_filter
|
|
|
+ # echo ftrace > /debug/tracing/current_tracer
|
|
|
+ # echo 1 > /debug/tracing/tracing_enabled
|
|
|
+ # usleep 1
|
|
|
+ # echo 0 > /debug/tracing/tracing_enabled
|
|
|
+ # cat /debug/tracing/trace
|
|
|
+# tracer: ftrace
|
|
|
+#
|
|
|
+# TASK-PID CPU# TIMESTAMP FUNCTION
|
|
|
+# | | | | |
|
|
|
+ usleep-4134 [00] 1317.070017: hrtimer_interrupt <-smp_apic_timer_interrupt
|
|
|
+ usleep-4134 [00] 1317.070111: sys_nanosleep <-syscall_call
|
|
|
+ <idle>-0 [00] 1317.070115: hrtimer_interrupt <-smp_apic_timer_interrupt
|
|
|
+
|
|
|
+To see which functions are being traced, you can cat the file:
|
|
|
+
|
|
|
+ # cat /debug/tracing/set_ftrace_filter
|
|
|
+hrtimer_interrupt
|
|
|
+sys_nanosleep
|
|
|
+
|
|
|
+
|
|
|
+Perhaps this is not enough. The filters also allow simple wild
|
|
|
+cards. Only the following are currently available:
|
|
|
+
|
|
|
+ <match>* - will match functions that begin with <match>
|
|
|
+ *<match> - will match functions that end with <match>
|
|
|
+ *<match>* - will match functions that have <match> in them
|
|
|
+
|
|
|
+These are the only wild cards which are supported.
|
|
|
+
|
|
|
+ <match>*<match> will not work.
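+
+One way to picture why only these three forms exist: each pattern is
+reduced to a single prefix, suffix or substring test. The function
+below is only an illustration, not the kernel's implementation:
+
+	#include <string.h>
+
+	/* Illustrative matcher: type 1 = "<match>*", 2 = "*<match>",
+	 * 3 = "*<match>*", anything else = exact match. */
+	static int glob_matches(const char *name, const char *match, int type)
+	{
+		size_t nlen = strlen(name), mlen = strlen(match);
+
+		switch (type) {
+		case 1: return strncmp(name, match, mlen) == 0;
+		case 2: return nlen >= mlen &&
+			       strcmp(name + nlen - mlen, match) == 0;
+		case 3: return strstr(name, match) != NULL;
+		default: return strcmp(name, match) == 0;
+		}
+	}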
|
|
|
+
|
|
|
+Note: It is better to use quotes to enclose the wild cards,
|
|
|
+ otherwise the shell may expand the parameters into names
|
|
|
+ of files in the local directory.
|
|
|
+
|
|
|
+ # echo 'hrtimer_*' > /debug/tracing/set_ftrace_filter
|
|
|
+
|
|
|
+Produces:
|
|
|
+
|
|
|
+# tracer: ftrace
|
|
|
+#
|
|
|
+# TASK-PID CPU# TIMESTAMP FUNCTION
|
|
|
+# | | | | |
|
|
|
+ bash-4003 [00] 1480.611794: hrtimer_init <-copy_process
|
|
|
+ bash-4003 [00] 1480.611941: hrtimer_start <-hrtick_set
|
|
|
+ bash-4003 [00] 1480.611956: hrtimer_cancel <-hrtick_clear
|
|
|
+ bash-4003 [00] 1480.611956: hrtimer_try_to_cancel <-hrtimer_cancel
|
|
|
+ <idle>-0 [00] 1480.612019: hrtimer_get_next_event <-get_next_timer_interrupt
|
|
|
+ <idle>-0 [00] 1480.612025: hrtimer_get_next_event <-get_next_timer_interrupt
|
|
|
+ <idle>-0 [00] 1480.612032: hrtimer_get_next_event <-get_next_timer_interrupt
|
|
|
+ <idle>-0 [00] 1480.612037: hrtimer_get_next_event <-get_next_timer_interrupt
|
|
|
+ <idle>-0 [00] 1480.612382: hrtimer_get_next_event <-get_next_timer_interrupt
|
|
|
+
|
|
|
+
|
|
|
+Notice that we lost sys_nanosleep.
|
|
|
+
|
|
|
+ # cat /debug/tracing/set_ftrace_filter
|
|
|
+hrtimer_run_queues
|
|
|
+hrtimer_run_pending
|
|
|
+hrtimer_init
|
|
|
+hrtimer_cancel
|
|
|
+hrtimer_try_to_cancel
|
|
|
+hrtimer_forward
|
|
|
+hrtimer_start
|
|
|
+hrtimer_reprogram
|
|
|
+hrtimer_force_reprogram
|
|
|
+hrtimer_get_next_event
|
|
|
+hrtimer_interrupt
|
|
|
+hrtimer_nanosleep
|
|
|
+hrtimer_wakeup
|
|
|
+hrtimer_get_remaining
|
|
|
+hrtimer_get_res
|
|
|
+hrtimer_init_sleeper
|
|
|
+
|
|
|
+
|
|
|
+This is because the '>' and '>>' act just like they do in bash.
|
|
|
+To overwrite the filters, use '>'.
|
|
|
+To append to the filters, use '>>'.
|
|
|
+
|
|
|
+To clear out a filter so that all functions will be recorded
|
|
|
+again:
|
|
|
+
|
|
|
+ # echo > /debug/tracing/set_ftrace_filter
|
|
|
+ # cat /debug/tracing/set_ftrace_filter
|
|
|
+ #
|
|
|
+
|
|
|
+Now we want to append to the filter instead.
|
|
|
+
|
|
|
+ # echo sys_nanosleep > /debug/tracing/set_ftrace_filter
|
|
|
+ # cat /debug/tracing/set_ftrace_filter
|
|
|
+sys_nanosleep
|
|
|
+ # echo 'hrtimer_*' >> /debug/tracing/set_ftrace_filter
|
|
|
+ # cat /debug/tracing/set_ftrace_filter
|
|
|
+hrtimer_run_queues
|
|
|
+hrtimer_run_pending
|
|
|
+hrtimer_init
|
|
|
+hrtimer_cancel
|
|
|
+hrtimer_try_to_cancel
|
|
|
+hrtimer_forward
|
|
|
+hrtimer_start
|
|
|
+hrtimer_reprogram
|
|
|
+hrtimer_force_reprogram
|
|
|
+hrtimer_get_next_event
|
|
|
+hrtimer_interrupt
|
|
|
+sys_nanosleep
|
|
|
+hrtimer_nanosleep
|
|
|
+hrtimer_wakeup
|
|
|
+hrtimer_get_remaining
|
|
|
+hrtimer_get_res
|
|
|
+hrtimer_init_sleeper
|
|
|
+
|
|
|
+
|
|
|
+The set_ftrace_notrace file prevents the listed functions from being
|
|
|
+traced.
|
|
|
+
|
|
|
+ # echo '*preempt*' '*lock*' > /debug/tracing/set_ftrace_notrace
|
|
|
+
|
|
|
+Produces:
|
|
|
+
|
|
|
+# tracer: ftrace
|
|
|
+#
|
|
|
+# TASK-PID CPU# TIMESTAMP FUNCTION
|
|
|
+# | | | | |
|
|
|
+ bash-4043 [01] 115.281644: finish_task_switch <-schedule
|
|
|
+ bash-4043 [01] 115.281645: hrtick_set <-schedule
|
|
|
+ bash-4043 [01] 115.281645: hrtick_clear <-hrtick_set
|
|
|
+ bash-4043 [01] 115.281646: wait_for_completion <-__stop_machine_run
|
|
|
+ bash-4043 [01] 115.281647: wait_for_common <-wait_for_completion
|
|
|
+ bash-4043 [01] 115.281647: kthread_stop <-stop_machine_run
|
|
|
+ bash-4043 [01] 115.281648: init_waitqueue_head <-kthread_stop
|
|
|
+ bash-4043 [01] 115.281648: wake_up_process <-kthread_stop
|
|
|
+ bash-4043 [01] 115.281649: try_to_wake_up <-wake_up_process
|
|
|
+
|
|
|
+We can see that there's no more lock or preempt tracing.
|
|
|
+
|
|
|
+
|
|
|
+Dynamic ftrace with the function graph tracer
|
|
|
+---------------------------------------------
|
|
|
+
|
|
|
+Although what has been explained above concerns both the
|
|
|
+function tracer and the function graph tracer, there are some
|
|
|
+special features only available in the function-graph tracer.
|
|
|
+
|
|
|
+If you want to trace only one function and all of its children,
|
|
|
+you just have to echo its name into set_graph_function:
|
|
|
+
|
|
|
+ echo __do_fault > set_graph_function
|
|
|
+
|
|
|
+will produce the following "expanded" trace of the __do_fault()
|
|
|
+function:
|
|
|
+
|
|
|
+ 0) | __do_fault() {
|
|
|
+ 0) | filemap_fault() {
|
|
|
+ 0) | find_lock_page() {
|
|
|
+ 0) 0.804 us | find_get_page();
|
|
|
+ 0) | __might_sleep() {
|
|
|
+ 0) 1.329 us | }
|
|
|
+ 0) 3.904 us | }
|
|
|
+ 0) 4.979 us | }
|
|
|
+ 0) 0.653 us | _spin_lock();
|
|
|
+ 0) 0.578 us | page_add_file_rmap();
|
|
|
+ 0) 0.525 us | native_set_pte_at();
|
|
|
+ 0) 0.585 us | _spin_unlock();
|
|
|
+ 0) | unlock_page() {
|
|
|
+ 0) 0.541 us | page_waitqueue();
|
|
|
+ 0) 0.639 us | __wake_up_bit();
|
|
|
+ 0) 2.786 us | }
|
|
|
+ 0) + 14.237 us | }
|
|
|
+ 0) | __do_fault() {
|
|
|
+ 0) | filemap_fault() {
|
|
|
+ 0) | find_lock_page() {
|
|
|
+ 0) 0.698 us | find_get_page();
|
|
|
+ 0) | __might_sleep() {
|
|
|
+ 0) 1.412 us | }
|
|
|
+ 0) 3.950 us | }
|
|
|
+ 0) 5.098 us | }
|
|
|
+ 0) 0.631 us | _spin_lock();
|
|
|
+ 0) 0.571 us | page_add_file_rmap();
|
|
|
+ 0) 0.526 us | native_set_pte_at();
|
|
|
+ 0) 0.586 us | _spin_unlock();
|
|
|
+ 0) | unlock_page() {
|
|
|
+ 0) 0.533 us | page_waitqueue();
|
|
|
+ 0) 0.638 us | __wake_up_bit();
|
|
|
+ 0) 2.793 us | }
|
|
|
+ 0) + 14.012 us | }
|
|
|
+
|
|
|
+You can also expand several functions at once:
|
|
|
+
|
|
|
+ echo sys_open > set_graph_function
|
|
|
+ echo sys_close >> set_graph_function
|
|
|
+
|
|
|
+Now if you want to go back to tracing all functions you can clear
|
|
|
+this special filter via:
|
|
|
+
|
|
|
+ echo > set_graph_function
|
|
|
+
|
|
|
+
|
|
|
+trace_pipe
|
|
|
+----------
|
|
|
+
|
|
|
+The trace_pipe outputs the same content as the trace file, but
|
|
|
+the effect on the tracing is different. Every read from
|
|
|
+trace_pipe is consumed. This means that subsequent reads will be
|
|
|
+different. The trace is live.
|
|
|
+
|
|
|
+ # echo function > /debug/tracing/current_tracer
|
|
|
+ # cat /debug/tracing/trace_pipe > /tmp/trace.out &
|
|
|
+[1] 4153
|
|
|
+ # echo 1 > /debug/tracing/tracing_enabled
|
|
|
+ # usleep 1
|
|
|
+ # echo 0 > /debug/tracing/tracing_enabled
|
|
|
+ # cat /debug/tracing/trace
|
|
|
+# tracer: function
|
|
|
+#
|
|
|
+# TASK-PID CPU# TIMESTAMP FUNCTION
|
|
|
+# | | | | |
|
|
|
+
|
|
|
+ #
|
|
|
+ # cat /tmp/trace.out
|
|
|
+ bash-4043 [00] 41.267106: finish_task_switch <-schedule
|
|
|
+ bash-4043 [00] 41.267106: hrtick_set <-schedule
|
|
|
+ bash-4043 [00] 41.267107: hrtick_clear <-hrtick_set
|
|
|
+ bash-4043 [00] 41.267108: wait_for_completion <-__stop_machine_run
|
|
|
+ bash-4043 [00] 41.267108: wait_for_common <-wait_for_completion
|
|
|
+ bash-4043 [00] 41.267109: kthread_stop <-stop_machine_run
|
|
|
+ bash-4043 [00] 41.267109: init_waitqueue_head <-kthread_stop
|
|
|
+ bash-4043 [00] 41.267110: wake_up_process <-kthread_stop
|
|
|
+ bash-4043 [00] 41.267110: try_to_wake_up <-wake_up_process
|
|
|
+ bash-4043 [00] 41.267111: select_task_rq_rt <-try_to_wake_up
|
|
|
+
|
|
|
+
|
|
|
+Note, reading the trace_pipe file will block until more input is
|
|
|
+added. Changing the tracer causes trace_pipe to issue an EOF; this
|
|
|
+is why we needed to set the function tracer _before_ we "cat" the
|
|
|
+trace_pipe file.
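+
+As a small illustration of this blocking behaviour, a minimal
+userspace consumer could look like the sketch below (the mount point
+and buffer size are assumptions; nothing here is ftrace-specific
+beyond the file path):
+
+	#include <stdio.h>
+	#include <fcntl.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		char buf[4096];
+		ssize_t n;
+		int fd = open("/debug/tracing/trace_pipe", O_RDONLY);
+
+		if (fd < 0) {
+			perror("open");
+			return 1;
+		}
+		/* read() blocks whenever the tracer has nothing new */
+		while ((n = read(fd, buf, sizeof(buf))) > 0)
+			write(STDOUT_FILENO, buf, n);
+		close(fd);
+		return 0;
+	}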
|
|
|
+
|
|
|
+
|
|
|
+trace entries
|
|
|
+-------------
|
|
|
+
|
|
|
+Having too much or not enough data can be troublesome in
|
|
|
+diagnosing an issue in the kernel. The file buffer_size_kb is
|
|
|
+used to modify the size of the internal trace buffers. The
|
|
|
+number listed is the size in kilobytes of each per-CPU buffer.
|
|
|
+To know the full size, multiply the number of possible CPUs by
|
|
|
+this size (e.g. 4 CPUs at 1408 kB each is about 5632 kB in total).
|
|
|
+
|
|
|
+ # cat /debug/tracing/buffer_size_kb
|
|
|
+1408 (units kilobytes)
|
|
|
+
|
|
|
+Note, to modify this, you must have tracing completely disabled.
|
|
|
+To do that, echo "nop" into the current_tracer. If the
|
|
|
+current_tracer is not set to "nop", an EINVAL error will be
|
|
|
+returned.
|
|
|
+
|
|
|
+ # echo nop > /debug/tracing/current_tracer
|
|
|
+ # echo 10000 > /debug/tracing/buffer_size_kb
|
|
|
+ # cat /debug/tracing/buffer_size_kb
|
|
|
+10000 (units kilobytes)
|
|
|
+
|
|
|
+The number of pages which will be allocated is limited to a
|
|
|
+percentage of available memory. Allocating too much will produce
|
|
|
+an error.
|
|
|
+
|
|
|
+ # echo 1000000000000 > /debug/tracing/buffer_size_kb
|
|
|
+-bash: echo: write error: Cannot allocate memory
|
|
|
+ # cat /debug/tracing/buffer_size_kb
|
|
|
+85
|
|
|
+
|
|
|
+-----------
|
|
|
+
|
|
|
+More details can be found in the source code, in the
|
|
|
+kernel/trace/*.c files.
|