
Merge branch 'for-2.6.35' of git://git.kernel.dk/linux-2.6-block

* 'for-2.6.35' of git://git.kernel.dk/linux-2.6-block: (86 commits)
  pipe: set lower and upper limit on max pages in the pipe page array
  pipe: add support for shrinking and growing pipes
  drbd: This is now equivalent to drbd release 8.3.8rc1
  drbd: Do not free p_uuid early, this is done in the exit code of the receiver
  drbd: Null pointer deref fix to the large "multi bio rewrite"
  drbd: Fix: Do not detach, if a bio with a barrier fails
  drbd: Ensure to not trigger late-new-UUID creation multiple times
  drbd: Do not Oops when C_STANDALONE when uuid gets generated
  writeback: fix mixed up arguments to bdi_start_writeback()
  writeback: fix problem with !CONFIG_BLOCK compilation
  block: improve automatic native capacity unlocking
  block: use struct parsed_partitions *state universally in partition check code
  block,ide: simplify bdops->set_capacity() to ->unlock_native_capacity()
  block: restart partition scan after resizing a device
  buffer: make invalidate_bdev() drain all percpu LRU add caches
  block: remove all rcu head initializations
  writeback: fixups for !dirty_writeback_centisecs
  writeback: bdi_writeback_task() must set task state before calling schedule()
  writeback: ensure that WB_SYNC_NONE writeback with sb pinned is sync
  drivers/block/drbd: Use kzalloc
  ...
Linus Torvalds authored 15 years ago (commit 6e80e8ed5e)
93 changed files with 3423 additions and 1241 deletions
  1. Documentation/cgroups/blkio-controller.txt (+133, -18)
  2. block/Kconfig (+0, -23)
  3. block/Kconfig.iosched (+5, -11)
  4. block/Makefile (+1, -1)
  5. block/blk-barrier.c (+25, -122)
  6. block/blk-cgroup.c (+731, -60)
  7. block/blk-cgroup.h (+153, -25)
  8. block/blk-core.c (+27, -4)
  9. block/blk-lib.c (+233, -0)
  10. block/cfq-iosched.c (+64, -17)
  11. block/elevator.c (+11, -0)
  12. block/genhd.c (+1, -1)
  13. block/ioctl.c (+1, -1)
  14. drivers/block/Kconfig (+11, -11)
  15. drivers/block/drbd/drbd_bitmap.c (+14, -7)
  16. drivers/block/drbd/drbd_int.h (+122, -29)
  17. drivers/block/drbd/drbd_main.c (+139, -19)
  18. drivers/block/drbd/drbd_nl.c (+30, -22)
  19. drivers/block/drbd/drbd_proc.c (+17, -2)
  20. drivers/block/drbd/drbd_receiver.c (+428, -238)
  21. drivers/block/drbd/drbd_req.c (+27, -13)
  22. drivers/block/drbd/drbd_strings.c (+1, -1)
  23. drivers/block/drbd/drbd_worker.c (+124, -82)
  24. drivers/block/drbd/drbd_wrappers.h (+1, -15)
  25. drivers/ide/ide-disk.c (+16, -24)
  26. drivers/ide/ide-gd.c (+4, -7)
  27. fs/block_dev.c (+218, -39)
  28. fs/btrfs/extent-tree.c (+1, -1)
  29. fs/buffer.c (+1, -0)
  30. fs/ext3/fsync.c (+2, -1)
  31. fs/ext4/fsync.c (+4, -2)
  32. fs/fcntl.c (+5, -0)
  33. fs/fs-writeback.c (+76, -22)
  34. fs/gfs2/rgrp.c (+3, -2)
  35. fs/jbd2/checkpoint.c (+2, -1)
  36. fs/jbd2/commit.c (+4, -2)
  37. fs/nilfs2/the_nilfs.c (+2, -2)
  38. fs/partitions/acorn.c (+32, -36)
  39. fs/partitions/acorn.h (+5, -5)
  40. fs/partitions/amiga.c (+6, -7)
  41. fs/partitions/amiga.h (+1, -1)
  42. fs/partitions/atari.c (+4, -4)
  43. fs/partitions/atari.h (+1, -1)
  44. fs/partitions/check.c (+61, -23)
  45. fs/partitions/check.h (+12, -0)
  46. fs/partitions/efi.c (+43, -48)
  47. fs/partitions/efi.h (+1, -1)
  48. fs/partitions/ibm.c (+10, -11)
  49. fs/partitions/ibm.h (+1, -1)
  50. fs/partitions/karma.c (+2, -2)
  51. fs/partitions/karma.h (+1, -1)
  52. fs/partitions/ldm.c (+44, -45)
  53. fs/partitions/ldm.h (+1, -1)
  54. fs/partitions/mac.c (+6, -5)
  55. fs/partitions/mac.h (+1, -1)
  56. fs/partitions/msdos.c (+37, -48)
  57. fs/partitions/msdos.h (+1, -1)
  58. fs/partitions/osf.c (+2, -2)
  59. fs/partitions/osf.h (+1, -1)
  60. fs/partitions/sgi.c (+3, -3)
  61. fs/partitions/sgi.h (+1, -1)
  62. fs/partitions/sun.c (+3, -3)
  63. fs/partitions/sun.h (+1, -1)
  64. fs/partitions/sysv68.c (+3, -3)
  65. fs/partitions/sysv68.h (+1, -1)
  66. fs/partitions/ultrix.c (+2, -2)
  67. fs/partitions/ultrix.h (+1, -1)
  68. fs/pipe.c (+110, -12)
  69. fs/reiserfs/file.c (+2, -1)
  70. fs/splice.c (+105, -46)
  71. fs/sync.c (+1, -1)
  72. fs/xfs/linux-2.6/xfs_super.c (+2, -1)
  73. include/linux/backing-dev.h (+5, -1)
  74. include/linux/blkdev.h (+58, -12)
  75. include/linux/drbd.h (+3, -2)
  76. include/linux/drbd_limits.h (+16, -0)
  77. include/linux/drbd_nl.h (+5, -0)
  78. include/linux/elevator.h (+6, -0)
  79. include/linux/fcntl.h (+6, -0)
  80. include/linux/fs.h (+1, -0)
  81. include/linux/ide.h (+1, -1)
  82. include/linux/pipe_fs_i.h (+9, -4)
  83. include/linux/splice.h (+7, -0)
  84. include/linux/writeback.h (+17, -1)
  85. init/Kconfig (+27, -0)
  86. kernel/relay.c (+10, -5)
  87. kernel/sched_clock.c (+1, -0)
  88. kernel/sysctl.c (+9, -0)
  89. kernel/trace/trace.c (+36, -24)
  90. mm/backing-dev.c (+10, -5)
  91. mm/page-writeback.c (+24, -20)
  92. mm/swapfile.c (+6, -3)
  93. net/core/skbuff.c (+21, -17)

+ 133 - 18
Documentation/cgroups/blkio-controller.txt

@@ -17,6 +17,9 @@ HOWTO
 You can do a very simple testing of running two dd threads in two different
 cgroups. Here is what you can do.
 
 
+- Enable Block IO controller
+	CONFIG_BLK_CGROUP=y
+
 - Enable group scheduling in CFQ
 	CONFIG_CFQ_GROUP_IOSCHED=y
 
 
@@ -54,32 +57,52 @@ cgroups. Here is what you can do.
 
 
 Various user visible config options
 ===================================
-CONFIG_CFQ_GROUP_IOSCHED
-	- Enables group scheduling in CFQ. Currently only 1 level of group
-	  creation is allowed.
-
-CONFIG_DEBUG_CFQ_IOSCHED
-	- Enables some debugging messages in blktrace. Also creates extra
-	  cgroup file blkio.dequeue.
-
-Config options selected automatically
-=====================================
-These config options are not user visible and are selected/deselected
-automatically based on IO scheduler configuration.
-
 CONFIG_BLK_CGROUP
-	- Block IO controller. Selected by CONFIG_CFQ_GROUP_IOSCHED.
+	- Block IO controller.
 
 
 CONFIG_DEBUG_BLK_CGROUP
-	- Debug help. Selected by CONFIG_DEBUG_CFQ_IOSCHED.
+	- Debug help. Right now some additional stats file show up in cgroup
+	  if this option is enabled.
+
+CONFIG_CFQ_GROUP_IOSCHED
+	- Enables group scheduling in CFQ. Currently only 1 level of group
+	  creation is allowed.
 
 
 Details of cgroup files
 =======================
 - blkio.weight
-	- Specifies per cgroup weight.
-
+	- Specifies per cgroup weight. This is default weight of the group
+	  on all the devices until and unless overridden by per device rule.
+	  (See blkio.weight_device).
 	  Currently allowed range of weights is from 100 to 1000.
 
 
+- blkio.weight_device
+	- One can specify per cgroup per device rules using this interface.
+	  These rules override the default value of group weight as specified
+	  by blkio.weight.
+
+	  Following is the format.
+
+	  #echo dev_maj:dev_minor weight > /path/to/cgroup/blkio.weight_device
+	  Configure weight=300 on /dev/sdb (8:16) in this cgroup
+	  # echo 8:16 300 > blkio.weight_device
+	  # cat blkio.weight_device
+	  dev     weight
+	  8:16    300
+
+	  Configure weight=500 on /dev/sda (8:0) in this cgroup
+	  # echo 8:0 500 > blkio.weight_device
+	  # cat blkio.weight_device
+	  dev     weight
+	  8:0     500
+	  8:16    300
+
+	  Remove specific weight for /dev/sda in this cgroup
+	  # echo 8:0 0 > blkio.weight_device
+	  # cat blkio.weight_device
+	  dev     weight
+	  8:16    300
+
 - blkio.time
 	- disk time allocated to cgroup per device in milliseconds. First
 	  two fields specify the major and minor number of the device and
@@ -92,13 +115,105 @@ Details of cgroup files
 	  third field specifies the number of sectors transferred by the
 	  group to/from the device.
 
 
+- blkio.io_service_bytes
+	- Number of bytes transferred to/from the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of bytes.
+
+- blkio.io_serviced
+	- Number of IOs completed to/from the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of IOs.
+
+- blkio.io_service_time
+	- Total amount of time between request dispatch and request completion
+	  for the IOs done by this cgroup. This is in nanoseconds to make it
+	  meaningful for flash devices too. For devices with queue depth of 1,
+	  this time represents the actual service time. When queue_depth > 1,
+	  that is no longer true as requests may be served out of order. This
+	  may cause the service time for a given IO to include the service time
+	  of multiple IOs when served out of order which may result in total
+	  io_service_time > actual time elapsed. This time is further divided by
+	  the type of operation - read or write, sync or async. First two fields
+	  specify the major and minor number of the device, third field
+	  specifies the operation type and the fourth field specifies the
+	  io_service_time in ns.
+
+- blkio.io_wait_time
+	- Total amount of time the IOs for this cgroup spent waiting in the
+	  scheduler queues for service. This can be greater than the total time
+	  elapsed since it is cumulative io_wait_time for all IOs. It is not a
+	  measure of total time the cgroup spent waiting but rather a measure of
+	  the wait_time for its individual IOs. For devices with queue_depth > 1
+	  this metric does not include the time spent waiting for service once
+	  the IO is dispatched to the device but till it actually gets serviced
+	  (there might be a time lag here due to re-ordering of requests by the
+	  device). This is in nanoseconds to make it meaningful for flash
+	  devices too. This time is further divided by the type of operation -
+	  read or write, sync or async. First two fields specify the major and
+	  minor number of the device, third field specifies the operation type
+	  and the fourth field specifies the io_wait_time in ns.
+
+- blkio.io_merged
+	- Total number of bios/requests merged into requests belonging to this
+	  cgroup. This is further divided by the type of operation - read or
+	  write, sync or async.
+
+- blkio.io_queued
+	- Total number of requests queued up at any given instant for this
+	  cgroup. This is further divided by the type of operation - read or
+	  write, sync or async.
+
+- blkio.avg_queue_size
+	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+	  The average queue size for this cgroup over the entire time of this
+	  cgroup's existence. Queue size samples are taken each time one of the
+	  queues of this cgroup gets a timeslice.
+
+- blkio.group_wait_time
+	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+	  This is the amount of time the cgroup had to wait since it became busy
+	  (i.e., went from 0 to 1 request queued) to get a timeslice for one of
+	  its queues. This is different from the io_wait_time which is the
+	  cumulative total of the amount of time spent by each IO in that cgroup
+	  waiting in the scheduler queue. This is in nanoseconds. If this is
+	  read when the cgroup is in a waiting (for timeslice) state, the stat
+	  will only report the group_wait_time accumulated till the last time it
+	  got a timeslice and will not include the current delta.
+
+- blkio.empty_time
+	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+	  This is the amount of time a cgroup spends without any pending
+	  requests when not being served, i.e., it does not include any time
+	  spent idling for one of the queues of the cgroup. This is in
+	  nanoseconds. If this is read when the cgroup is in an empty state,
+	  the stat will only report the empty_time accumulated till the last
+	  time it had a pending request and will not include the current delta.
+
+- blkio.idle_time
+	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+	  This is the amount of time spent by the IO scheduler idling for a
+	  given cgroup in anticipation of a better request than the exising ones
+	  from other queues/cgroups. This is in nanoseconds. If this is read
+	  when the cgroup is in an idling state, the stat will only report the
+	  idle_time accumulated till the last idle period and will not include
+	  the current delta.
+
 - blkio.dequeue
-	- Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This
+	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This
 	  gives the statistics about how many a times a group was dequeued
 	  from service tree of the device. First two fields specify the major
 	  and minor number of the device and third field specifies the number
 	  of times a group was dequeued from a particular device.
 
 
+- blkio.reset_stats
+	- Writing an int to this file will result in resetting all the stats
+	  for that cgroup.
+
 CFQ sysfs tunable
 =================
 /sys/block/<disk>/queue/iosched/group_isolation
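
As an aside (not part of the patch text): the per-operation statistics files described above (io_service_bytes, io_serviced, io_service_time, io_wait_time, io_merged, io_queued) all use a "major:minor <operation> <value>" layout, with a cgroup-wide "Total" line appended. A read of blkio.io_service_bytes for a cgroup doing IO to a single disk might look like the following, entirely hypothetical, output (device numbers and byte counts are made up):

  # cat blkio.io_service_bytes
  8:16 Read 1310720
  8:16 Write 524288
  8:16 Sync 524288
  8:16 Async 1310720
  8:16 Total 1835008
  Total 1835008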

+ 0 - 23
block/Kconfig

@@ -77,29 +77,6 @@ config BLK_DEV_INTEGRITY
 	T10/SCSI Data Integrity Field or the T13/ATA External Path
 	Protection.  If in doubt, say N.
 
 
-config BLK_CGROUP
-	tristate "Block cgroup support"
-	depends on CGROUPS
-	depends on CFQ_GROUP_IOSCHED
-	default n
-	---help---
-	Generic block IO controller cgroup interface. This is the common
-	cgroup interface which should be used by various IO controlling
-	policies.
-
-	Currently, CFQ IO scheduler uses it to recognize task groups and
-	control disk bandwidth allocation (proportional time slice allocation)
-	to such task groups.
-
-config DEBUG_BLK_CGROUP
-	bool
-	depends on BLK_CGROUP
-	default n
-	---help---
-	Enable some debugging help. Currently it stores the cgroup path
-	in the blk group which can be used by cfq for tracing various
-	group related activity.
-
 endif # BLOCK
 
 
 config BLOCK_COMPAT

+ 5 - 11
block/Kconfig.iosched

@@ -23,7 +23,8 @@ config IOSCHED_DEADLINE
 
 
 config IOSCHED_CFQ
 	tristate "CFQ I/O scheduler"
-	select BLK_CGROUP if CFQ_GROUP_IOSCHED
+	# If BLK_CGROUP is a module, CFQ has to be built as module.
+	depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y
 	default y
 	---help---
 	  The CFQ I/O scheduler tries to distribute bandwidth equally
@@ -33,22 +34,15 @@ config IOSCHED_CFQ
 
 
 	  This is the default I/O scheduler.
 
 
+	  Note: If BLK_CGROUP=m, then CFQ can be built only as module.
+
 config CFQ_GROUP_IOSCHED
 	bool "CFQ Group Scheduling support"
-	depends on IOSCHED_CFQ && CGROUPS
+	depends on IOSCHED_CFQ && BLK_CGROUP
 	default n
 	---help---
 	  Enable group IO scheduling in CFQ.
 
 
-config DEBUG_CFQ_IOSCHED
-	bool "Debug CFQ Scheduling"
-	depends on CFQ_GROUP_IOSCHED
-	select DEBUG_BLK_CGROUP
-	default n
-	---help---
-	  Enable CFQ IO scheduling debugging in CFQ. Currently it makes
-	  blktrace output more verbose.
-
 choice
 	prompt "Default I/O scheduler"
 	default DEFAULT_CFQ
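
Not part of the patch, just orientation: with the reworked dependencies, CFQ group scheduling now sits on top of the block cgroup controller, so a configuration that uses it would contain something like the hypothetical fragment below, and (per the new Kconfig note) CONFIG_BLK_CGROUP=m would in turn force CFQ itself to be built as a module:

  CONFIG_BLK_CGROUP=y
  CONFIG_IOSCHED_CFQ=y
  CONFIG_CFQ_GROUP_IOSCHED=y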

+ 1 - 1
block/Makefile

@@ -5,7 +5,7 @@
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
-			blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o
+			blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
 
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o

+ 25 - 122
block/blk-barrier.c

@@ -286,26 +286,31 @@ static void bio_end_empty_barrier(struct bio *bio, int err)
 			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
 	}
-
-	complete(bio->bi_private);
+	if (bio->bi_private)
+		complete(bio->bi_private);
+	bio_put(bio);
 }
 
 
 /**
  * blkdev_issue_flush - queue a flush
  * @bdev:	blockdev to issue flush for
+ * @gfp_mask:	memory allocation flags (for bio_alloc)
  * @error_sector:	error sector
+ * @flags:	BLKDEV_IFL_* flags to control behaviour
  *
  * Description:
  *    Issue a flush for the block device in question. Caller can supply
  *    room for storing the error offset in case of a flush error, if they
- *    wish to.
+ *    wish to. If WAIT flag is not passed then caller may check only what
+ *    request was pushed in some internal queue for later handling.
  */
-int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
+int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
+		sector_t *error_sector, unsigned long flags)
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
 	struct request_queue *q;
 	struct bio *bio;
-	int ret;
+	int ret = 0;
 
 
 	if (bdev->bd_disk == NULL)
 		return -ENXIO;
@@ -314,23 +319,25 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
 	if (!q)
 		return -ENXIO;
 
 
-	bio = bio_alloc(GFP_KERNEL, 0);
+	bio = bio_alloc(gfp_mask, 0);
 	bio->bi_end_io = bio_end_empty_barrier;
-	bio->bi_private = &wait;
 	bio->bi_bdev = bdev;
-	submit_bio(WRITE_BARRIER, bio);
-
-	wait_for_completion(&wait);
+	if (test_bit(BLKDEV_WAIT, &flags))
+		bio->bi_private = &wait;
 
 
-	/*
-	 * The driver must store the error location in ->bi_sector, if
-	 * it supports it. For non-stacked drivers, this should be copied
-	 * from blk_rq_pos(rq).
-	 */
-	if (error_sector)
-		*error_sector = bio->bi_sector;
+	bio_get(bio);
+	submit_bio(WRITE_BARRIER, bio);
+	if (test_bit(BLKDEV_WAIT, &flags)) {
+		wait_for_completion(&wait);
+		/*
+		 * The driver must store the error location in ->bi_sector, if
+		 * it supports it. For non-stacked drivers, this should be
+		 * copied from blk_rq_pos(rq).
+		 */
+		if (error_sector)
+			*error_sector = bio->bi_sector;
+	}
 
 
-	ret = 0;
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
 		ret = -EOPNOTSUPP;
 	else if (!bio_flagged(bio, BIO_UPTODATE))
@@ -340,107 +347,3 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
 	return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_flush);
-
-static void blkdev_discard_end_io(struct bio *bio, int err)
-{
-	if (err) {
-		if (err == -EOPNOTSUPP)
-			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
-		clear_bit(BIO_UPTODATE, &bio->bi_flags);
-	}
-
-	if (bio->bi_private)
-		complete(bio->bi_private);
-	__free_page(bio_page(bio));
-
-	bio_put(bio);
-}
-
-/**
- * blkdev_issue_discard - queue a discard
- * @bdev:	blockdev to issue discard for
- * @sector:	start sector
- * @nr_sects:	number of sectors to discard
- * @gfp_mask:	memory allocation flags (for bio_alloc)
- * @flags:	DISCARD_FL_* flags to control behaviour
- *
- * Description:
- *    Issue a discard request for the sectors in question.
- */
-int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
-		sector_t nr_sects, gfp_t gfp_mask, int flags)
-{
-	DECLARE_COMPLETION_ONSTACK(wait);
-	struct request_queue *q = bdev_get_queue(bdev);
-	int type = flags & DISCARD_FL_BARRIER ?
-		DISCARD_BARRIER : DISCARD_NOBARRIER;
-	struct bio *bio;
-	struct page *page;
-	int ret = 0;
-
-	if (!q)
-		return -ENXIO;
-
-	if (!blk_queue_discard(q))
-		return -EOPNOTSUPP;
-
-	while (nr_sects && !ret) {
-		unsigned int sector_size = q->limits.logical_block_size;
-		unsigned int max_discard_sectors =
-			min(q->limits.max_discard_sectors, UINT_MAX >> 9);
-
-		bio = bio_alloc(gfp_mask, 1);
-		if (!bio)
-			goto out;
-		bio->bi_sector = sector;
-		bio->bi_end_io = blkdev_discard_end_io;
-		bio->bi_bdev = bdev;
-		if (flags & DISCARD_FL_WAIT)
-			bio->bi_private = &wait;
-
-		/*
-		 * Add a zeroed one-sector payload as that's what
-		 * our current implementations need.  If we'll ever need
-		 * more the interface will need revisiting.
-		 */
-		page = alloc_page(gfp_mask | __GFP_ZERO);
-		if (!page)
-			goto out_free_bio;
-		if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
-			goto out_free_page;
-
-		/*
-		 * And override the bio size - the way discard works we
-		 * touch many more blocks on disk than the actual payload
-		 * length.
-		 */
-		if (nr_sects > max_discard_sectors) {
-			bio->bi_size = max_discard_sectors << 9;
-			nr_sects -= max_discard_sectors;
-			sector += max_discard_sectors;
-		} else {
-			bio->bi_size = nr_sects << 9;
-			nr_sects = 0;
-		}
-
-		bio_get(bio);
-		submit_bio(type, bio);
-
-		if (flags & DISCARD_FL_WAIT)
-			wait_for_completion(&wait);
-
-		if (bio_flagged(bio, BIO_EOPNOTSUPP))
-			ret = -EOPNOTSUPP;
-		else if (!bio_flagged(bio, BIO_UPTODATE))
-			ret = -EIO;
-		bio_put(bio);
-	}
-	return ret;
-out_free_page:
-	__free_page(page);
-out_free_bio:
-	bio_put(bio);
-out:
-	return -ENOMEM;
-}
-EXPORT_SYMBOL(blkdev_issue_discard);
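
For orientation (not part of the patch): a caller that wants the old synchronous flush behaviour now passes an allocation mask and asks for the wait explicitly. The sketch below is a hypothetical caller, assuming the BLKDEV_IFL_WAIT mask referred to by the kernel-doc above (built from the BLKDEV_WAIT bit the new code tests):

  /* Hypothetical caller: flush 'bdev' and wait for the result. */
  static int example_sync_flush(struct block_device *bdev)
  {
  	sector_t error_sector;

  	/* old call was: blkdev_issue_flush(bdev, &error_sector); */
  	return blkdev_issue_flush(bdev, GFP_KERNEL, &error_sector,
  				  BLKDEV_IFL_WAIT);
  }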

+ 731 - 60
block/blk-cgroup.c

@@ -15,8 +15,12 @@
 #include <linux/kdev_t.h>
 #include <linux/kdev_t.h>
 #include <linux/module.h>
 #include <linux/module.h>
 #include <linux/err.h>
 #include <linux/err.h>
+#include <linux/blkdev.h>
 #include <linux/slab.h>
 #include <linux/slab.h>
 #include "blk-cgroup.h"
 #include "blk-cgroup.h"
+#include <linux/genhd.h>
+
+#define MAX_KEY_LEN 100
 
 
 static DEFINE_SPINLOCK(blkio_list_lock);
 static DEFINE_SPINLOCK(blkio_list_lock);
 static LIST_HEAD(blkio_list);
 static LIST_HEAD(blkio_list);
@@ -49,6 +53,32 @@ struct cgroup_subsys blkio_subsys = {
 };
 };
 EXPORT_SYMBOL_GPL(blkio_subsys);
 EXPORT_SYMBOL_GPL(blkio_subsys);
 
 
+static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
+					    struct blkio_policy_node *pn)
+{
+	list_add(&pn->node, &blkcg->policy_list);
+}
+
+/* Must be called with blkcg->lock held */
+static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
+{
+	list_del(&pn->node);
+}
+
+/* Must be called with blkcg->lock held */
+static struct blkio_policy_node *
+blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev)
+{
+	struct blkio_policy_node *pn;
+
+	list_for_each_entry(pn, &blkcg->policy_list, node) {
+		if (pn->dev == dev)
+			return pn;
+	}
+
+	return NULL;
+}
+
 struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
 struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
 {
 {
 	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
 	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
@@ -56,13 +86,259 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
 }
 }
 EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
 EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
 
 
-void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
-			unsigned long time, unsigned long sectors)
+/*
+ * Add to the appropriate stat variable depending on the request type.
+ * This should be called with the blkg->stats_lock held.
+ */
+static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
+				bool sync)
+{
+	if (direction)
+		stat[BLKIO_STAT_WRITE] += add;
+	else
+		stat[BLKIO_STAT_READ] += add;
+	if (sync)
+		stat[BLKIO_STAT_SYNC] += add;
+	else
+		stat[BLKIO_STAT_ASYNC] += add;
+}
+
+/*
+ * Decrements the appropriate stat variable if non-zero depending on the
+ * request type. Panics on value being zero.
+ * This should be called with the blkg->stats_lock held.
+ */
+static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
+{
+	if (direction) {
+		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
+		stat[BLKIO_STAT_WRITE]--;
+	} else {
+		BUG_ON(stat[BLKIO_STAT_READ] == 0);
+		stat[BLKIO_STAT_READ]--;
+	}
+	if (sync) {
+		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
+		stat[BLKIO_STAT_SYNC]--;
+	} else {
+		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
+		stat[BLKIO_STAT_ASYNC]--;
+	}
+}
+
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+/* This should be called with the blkg->stats_lock held. */
+static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
+						struct blkio_group *curr_blkg)
+{
+	if (blkio_blkg_waiting(&blkg->stats))
+		return;
+	if (blkg == curr_blkg)
+		return;
+	blkg->stats.start_group_wait_time = sched_clock();
+	blkio_mark_blkg_waiting(&blkg->stats);
+}
+
+/* This should be called with the blkg->stats_lock held. */
+static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
+{
+	unsigned long long now;
+
+	if (!blkio_blkg_waiting(stats))
+		return;
+
+	now = sched_clock();
+	if (time_after64(now, stats->start_group_wait_time))
+		stats->group_wait_time += now - stats->start_group_wait_time;
+	blkio_clear_blkg_waiting(stats);
+}
+
+/* This should be called with the blkg->stats_lock held. */
+static void blkio_end_empty_time(struct blkio_group_stats *stats)
+{
+	unsigned long long now;
+
+	if (!blkio_blkg_empty(stats))
+		return;
+
+	now = sched_clock();
+	if (time_after64(now, stats->start_empty_time))
+		stats->empty_time += now - stats->start_empty_time;
+	blkio_clear_blkg_empty(stats);
+}
+
+void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&blkg->stats_lock, flags);
+	BUG_ON(blkio_blkg_idling(&blkg->stats));
+	blkg->stats.start_idle_time = sched_clock();
+	blkio_mark_blkg_idling(&blkg->stats);
+	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
+
+void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
+{
+	unsigned long flags;
+	unsigned long long now;
+	struct blkio_group_stats *stats;
+
+	spin_lock_irqsave(&blkg->stats_lock, flags);
+	stats = &blkg->stats;
+	if (blkio_blkg_idling(stats)) {
+		now = sched_clock();
+		if (time_after64(now, stats->start_idle_time))
+			stats->idle_time += now - stats->start_idle_time;
+		blkio_clear_blkg_idling(stats);
+	}
+	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
+
+void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
+{
+	unsigned long flags;
+	struct blkio_group_stats *stats;
+
+	spin_lock_irqsave(&blkg->stats_lock, flags);
+	stats = &blkg->stats;
+	stats->avg_queue_size_sum +=
+			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
+			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
+	stats->avg_queue_size_samples++;
+	blkio_update_group_wait_time(stats);
+	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
+
+void blkiocg_set_start_empty_time(struct blkio_group *blkg)
+{
+	unsigned long flags;
+	struct blkio_group_stats *stats;
+
+	spin_lock_irqsave(&blkg->stats_lock, flags);
+	stats = &blkg->stats;
+
+	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
+			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
+		spin_unlock_irqrestore(&blkg->stats_lock, flags);
+		return;
+	}
+
+	/*
+	 * group is already marked empty. This can happen if cfqq got new
+	 * request in parent group and moved to this group while being added
+	 * to service tree. Just ignore the event and move on.
+	 */
+	if(blkio_blkg_empty(stats)) {
+		spin_unlock_irqrestore(&blkg->stats_lock, flags);
+		return;
+	}
+
+	stats->start_empty_time = sched_clock();
+	blkio_mark_blkg_empty(stats);
+	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
+
+void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
+			unsigned long dequeue)
+{
+	blkg->stats.dequeue += dequeue;
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
+#else
+static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
+					struct blkio_group *curr_blkg) {}
+static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
+#endif
+
+void blkiocg_update_io_add_stats(struct blkio_group *blkg,
+			struct blkio_group *curr_blkg, bool direction,
+			bool sync)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&blkg->stats_lock, flags);
+	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
+			sync);
+	blkio_end_empty_time(&blkg->stats);
+	blkio_set_start_group_wait_time(blkg, curr_blkg);
+	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
+
+void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
+						bool direction, bool sync)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&blkg->stats_lock, flags);
+	blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
+					direction, sync);
+	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
+
+void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&blkg->stats_lock, flags);
+	blkg->stats.time += time;
+	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
+
+void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
+				uint64_t bytes, bool direction, bool sync)
 {
 {
-	blkg->time += time;
-	blkg->sectors += sectors;
+	struct blkio_group_stats *stats;
+	unsigned long flags;
+
+	spin_lock_irqsave(&blkg->stats_lock, flags);
+	stats = &blkg->stats;
+	stats->sectors += bytes >> 9;
+	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
+			sync);
+	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
+			direction, sync);
+	spin_unlock_irqrestore(&blkg->stats_lock, flags);
 }
 }
-EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats);
+EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
+
+void blkiocg_update_completion_stats(struct blkio_group *blkg,
+	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
+{
+	struct blkio_group_stats *stats;
+	unsigned long flags;
+	unsigned long long now = sched_clock();
+
+	spin_lock_irqsave(&blkg->stats_lock, flags);
+	stats = &blkg->stats;
+	if (time_after64(now, io_start_time))
+		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
+				now - io_start_time, direction, sync);
+	if (time_after64(io_start_time, start_time))
+		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
+				io_start_time - start_time, direction, sync);
+	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
+
+void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
+					bool sync)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&blkg->stats_lock, flags);
+	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
+			sync);
+	spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
 
 
 void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
 void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
 			struct blkio_group *blkg, void *key, dev_t dev)
 			struct blkio_group *blkg, void *key, dev_t dev)
@@ -70,14 +346,13 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
 	unsigned long flags;
 	unsigned long flags;
 
 
 	spin_lock_irqsave(&blkcg->lock, flags);
 	spin_lock_irqsave(&blkcg->lock, flags);
+	spin_lock_init(&blkg->stats_lock);
 	rcu_assign_pointer(blkg->key, key);
 	rcu_assign_pointer(blkg->key, key);
 	blkg->blkcg_id = css_id(&blkcg->css);
 	blkg->blkcg_id = css_id(&blkcg->css);
 	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
 	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
 	spin_unlock_irqrestore(&blkcg->lock, flags);
 	spin_unlock_irqrestore(&blkcg->lock, flags);
-#ifdef CONFIG_DEBUG_BLK_CGROUP
 	/* Need to take css reference ? */
 	/* Need to take css reference ? */
 	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
 	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
-#endif
 	blkg->dev = dev;
 	blkg->dev = dev;
 }
 }
 EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
 EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
@@ -101,17 +376,16 @@ int blkiocg_del_blkio_group(struct blkio_group *blkg)
 
 
 	rcu_read_lock();
 	rcu_read_lock();
 	css = css_lookup(&blkio_subsys, blkg->blkcg_id);
 	css = css_lookup(&blkio_subsys, blkg->blkcg_id);
-	if (!css)
-		goto out;
-
-	blkcg = container_of(css, struct blkio_cgroup, css);
-	spin_lock_irqsave(&blkcg->lock, flags);
-	if (!hlist_unhashed(&blkg->blkcg_node)) {
-		__blkiocg_del_blkio_group(blkg);
-		ret = 0;
+	if (css) {
+		blkcg = container_of(css, struct blkio_cgroup, css);
+		spin_lock_irqsave(&blkcg->lock, flags);
+		if (!hlist_unhashed(&blkg->blkcg_node)) {
+			__blkiocg_del_blkio_group(blkg);
+			ret = 0;
+		}
+		spin_unlock_irqrestore(&blkcg->lock, flags);
 	}
 	}
-	spin_unlock_irqrestore(&blkcg->lock, flags);
-out:
+
 	rcu_read_unlock();
 	rcu_read_unlock();
 	return ret;
 	return ret;
 }
 }
@@ -154,6 +428,7 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 	struct blkio_group *blkg;
 	struct blkio_group *blkg;
 	struct hlist_node *n;
 	struct hlist_node *n;
 	struct blkio_policy_type *blkiop;
 	struct blkio_policy_type *blkiop;
+	struct blkio_policy_node *pn;
 
 
 	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
 	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
 		return -EINVAL;
 		return -EINVAL;
@@ -162,7 +437,13 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 	spin_lock(&blkio_list_lock);
 	spin_lock(&blkio_list_lock);
 	spin_lock_irq(&blkcg->lock);
 	spin_lock_irq(&blkcg->lock);
 	blkcg->weight = (unsigned int)val;
 	blkcg->weight = (unsigned int)val;
+
 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
 	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+		pn = blkio_policy_search_node(blkcg, blkg->dev);
+
+		if (pn)
+			continue;
+
 		list_for_each_entry(blkiop, &blkio_list, list)
 		list_for_each_entry(blkiop, &blkio_list, list)
 			blkiop->ops.blkio_update_group_weight_fn(blkg,
 			blkiop->ops.blkio_update_group_weight_fn(blkg,
 					blkcg->weight);
 					blkcg->weight);
@@ -172,13 +453,154 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
 	return 0;
 	return 0;
 }
 }
 
 
-#define SHOW_FUNCTION_PER_GROUP(__VAR)					\
+static int
+blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
+{
+	struct blkio_cgroup *blkcg;
+	struct blkio_group *blkg;
+	struct blkio_group_stats *stats;
+	struct hlist_node *n;
+	uint64_t queued[BLKIO_STAT_TOTAL];
+	int i;
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+	bool idling, waiting, empty;
+	unsigned long long now = sched_clock();
+#endif
+
+	blkcg = cgroup_to_blkio_cgroup(cgroup);
+	spin_lock_irq(&blkcg->lock);
+	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+		spin_lock(&blkg->stats_lock);
+		stats = &blkg->stats;
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+		idling = blkio_blkg_idling(stats);
+		waiting = blkio_blkg_waiting(stats);
+		empty = blkio_blkg_empty(stats);
+#endif
+		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
+			queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
+		memset(stats, 0, sizeof(struct blkio_group_stats));
+		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
+			stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+		if (idling) {
+			blkio_mark_blkg_idling(stats);
+			stats->start_idle_time = now;
+		}
+		if (waiting) {
+			blkio_mark_blkg_waiting(stats);
+			stats->start_group_wait_time = now;
+		}
+		if (empty) {
+			blkio_mark_blkg_empty(stats);
+			stats->start_empty_time = now;
+		}
+#endif
+		spin_unlock(&blkg->stats_lock);
+	}
+	spin_unlock_irq(&blkcg->lock);
+	return 0;
+}
+
+static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
+				int chars_left, bool diskname_only)
+{
+	snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
+	chars_left -= strlen(str);
+	if (chars_left <= 0) {
+		printk(KERN_WARNING
+			"Possibly incorrect cgroup stat display format");
+		return;
+	}
+	if (diskname_only)
+		return;
+	switch (type) {
+	case BLKIO_STAT_READ:
+		strlcat(str, " Read", chars_left);
+		break;
+	case BLKIO_STAT_WRITE:
+		strlcat(str, " Write", chars_left);
+		break;
+	case BLKIO_STAT_SYNC:
+		strlcat(str, " Sync", chars_left);
+		break;
+	case BLKIO_STAT_ASYNC:
+		strlcat(str, " Async", chars_left);
+		break;
+	case BLKIO_STAT_TOTAL:
+		strlcat(str, " Total", chars_left);
+		break;
+	default:
+		strlcat(str, " Invalid", chars_left);
+	}
+}
+
+static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
+				struct cgroup_map_cb *cb, dev_t dev)
+{
+	blkio_get_key_name(0, dev, str, chars_left, true);
+	cb->fill(cb, str, val);
+	return val;
+}
+
+/* This should be called with blkg->stats_lock held */
+static uint64_t blkio_get_stat(struct blkio_group *blkg,
+		struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
+{
+	uint64_t disk_total;
+	char key_str[MAX_KEY_LEN];
+	enum stat_sub_type sub_type;
+
+	if (type == BLKIO_STAT_TIME)
+		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+					blkg->stats.time, cb, dev);
+	if (type == BLKIO_STAT_SECTORS)
+		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+					blkg->stats.sectors, cb, dev);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
+		uint64_t sum = blkg->stats.avg_queue_size_sum;
+		uint64_t samples = blkg->stats.avg_queue_size_samples;
+		if (samples)
+			do_div(sum, samples);
+		else
+			sum = 0;
+		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
+	}
+	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
+		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+					blkg->stats.group_wait_time, cb, dev);
+	if (type == BLKIO_STAT_IDLE_TIME)
+		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+					blkg->stats.idle_time, cb, dev);
+	if (type == BLKIO_STAT_EMPTY_TIME)
+		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+					blkg->stats.empty_time, cb, dev);
+	if (type == BLKIO_STAT_DEQUEUE)
+		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+					blkg->stats.dequeue, cb, dev);
+#endif
+
+	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
+			sub_type++) {
+		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
+		cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
+	}
+	disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
+			blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
+	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
+	cb->fill(cb, key_str, disk_total);
+	return disk_total;
+}
+
+#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total)		\
 static int blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
 static int blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
-			struct cftype *cftype, struct seq_file *m)	\
+		struct cftype *cftype, struct cgroup_map_cb *cb)	\
 {									\
 {									\
 	struct blkio_cgroup *blkcg;					\
 	struct blkio_cgroup *blkcg;					\
 	struct blkio_group *blkg;					\
 	struct blkio_group *blkg;					\
 	struct hlist_node *n;						\
 	struct hlist_node *n;						\
+	uint64_t cgroup_total = 0;					\
 									\
 									\
 	if (!cgroup_lock_live_group(cgroup))				\
 	if (!cgroup_lock_live_group(cgroup))				\
 		return -ENODEV;						\
 		return -ENODEV;						\
@@ -186,32 +608,231 @@ static int blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
 	blkcg = cgroup_to_blkio_cgroup(cgroup);				\
 	blkcg = cgroup_to_blkio_cgroup(cgroup);				\
 	rcu_read_lock();						\
 	rcu_read_lock();						\
 	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
 	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
-		if (blkg->dev)						\
-			seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev),	\
-				 MINOR(blkg->dev), blkg->__VAR);	\
+		if (blkg->dev) {					\
+			spin_lock_irq(&blkg->stats_lock);		\
+			cgroup_total += blkio_get_stat(blkg, cb,	\
+						blkg->dev, type);	\
+			spin_unlock_irq(&blkg->stats_lock);		\
+		}							\
 	}								\
 	}								\
+	if (show_total)							\
+		cb->fill(cb, "Total", cgroup_total);			\
 	rcu_read_unlock();						\
 	rcu_read_unlock();						\
 	cgroup_unlock();						\
 	cgroup_unlock();						\
 	return 0;							\
 	return 0;							\
 }
 }
 
 
-SHOW_FUNCTION_PER_GROUP(time);
-SHOW_FUNCTION_PER_GROUP(sectors);
+SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
+SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
+SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
+SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
+SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
+SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
+SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
+SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-SHOW_FUNCTION_PER_GROUP(dequeue);
+SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
+SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
+SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
+SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
+SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
 #endif
 #endif
 #undef SHOW_FUNCTION_PER_GROUP
 #undef SHOW_FUNCTION_PER_GROUP
 
 
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
-			unsigned long dequeue)
+static int blkio_check_dev_num(dev_t dev)
 {
 {
-	blkg->dequeue += dequeue;
+	int part = 0;
+	struct gendisk *disk;
+
+	disk = get_gendisk(dev, &part);
+	if (!disk || part)
+		return -ENODEV;
+
+	return 0;
+}
+
+static int blkio_policy_parse_and_set(char *buf,
+				      struct blkio_policy_node *newpn)
+{
+	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
+	int ret;
+	unsigned long major, minor, temp;
+	int i = 0;
+	dev_t dev;
+
+	memset(s, 0, sizeof(s));
+
+	while ((p = strsep(&buf, " ")) != NULL) {
+		if (!*p)
+			continue;
+
+		s[i++] = p;
+
+		/* Prevent from inputing too many things */
+		if (i == 3)
+			break;
+	}
+
+	if (i != 2)
+		return -EINVAL;
+
+	p = strsep(&s[0], ":");
+	if (p != NULL)
+		major_s = p;
+	else
+		return -EINVAL;
+
+	minor_s = s[0];
+	if (!minor_s)
+		return -EINVAL;
+
+	ret = strict_strtoul(major_s, 10, &major);
+	if (ret)
+		return -EINVAL;
+
+	ret = strict_strtoul(minor_s, 10, &minor);
+	if (ret)
+		return -EINVAL;
+
+	dev = MKDEV(major, minor);
+
+	ret = blkio_check_dev_num(dev);
+	if (ret)
+		return ret;
+
+	newpn->dev = dev;
+
+	if (s[1] == NULL)
+		return -EINVAL;
+
+	ret = strict_strtoul(s[1], 10, &temp);
+	if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
+	    temp > BLKIO_WEIGHT_MAX)
+		return -EINVAL;
+
+	newpn->weight =  temp;
+
+	return 0;
+}
+
+unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
+			      dev_t dev)
+{
+	struct blkio_policy_node *pn;
+
+	pn = blkio_policy_search_node(blkcg, dev);
+	if (pn)
+		return pn->weight;
+	else
+		return blkcg->weight;
+}
+EXPORT_SYMBOL_GPL(blkcg_get_weight);
+
+
+static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
+				       const char *buffer)
+{
+	int ret = 0;
+	char *buf;
+	struct blkio_policy_node *newpn, *pn;
+	struct blkio_cgroup *blkcg;
+	struct blkio_group *blkg;
+	int keep_newpn = 0;
+	struct hlist_node *n;
+	struct blkio_policy_type *blkiop;
+
+	buf = kstrdup(buffer, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
+	if (!newpn) {
+		ret = -ENOMEM;
+		goto free_buf;
+	}
+
+	ret = blkio_policy_parse_and_set(buf, newpn);
+	if (ret)
+		goto free_newpn;
+
+	blkcg = cgroup_to_blkio_cgroup(cgrp);
+
+	spin_lock_irq(&blkcg->lock);
+
+	pn = blkio_policy_search_node(blkcg, newpn->dev);
+	if (!pn) {
+		if (newpn->weight != 0) {
+			blkio_policy_insert_node(blkcg, newpn);
+			keep_newpn = 1;
+		}
+		spin_unlock_irq(&blkcg->lock);
+		goto update_io_group;
+	}
+
+	if (newpn->weight == 0) {
+		/* weight == 0 means deleteing a specific weight */
+		blkio_policy_delete_node(pn);
+		spin_unlock_irq(&blkcg->lock);
+		goto update_io_group;
+	}
+	spin_unlock_irq(&blkcg->lock);
+
+	pn->weight = newpn->weight;
+
+update_io_group:
+	/* update weight for each cfqg */
+	spin_lock(&blkio_list_lock);
+	spin_lock_irq(&blkcg->lock);
+
+	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+		if (newpn->dev == blkg->dev) {
+			list_for_each_entry(blkiop, &blkio_list, list)
+				blkiop->ops.blkio_update_group_weight_fn(blkg,
+							 newpn->weight ?
+							 newpn->weight :
+							 blkcg->weight);
+		}
+	}
+
+	spin_unlock_irq(&blkcg->lock);
+	spin_unlock(&blkio_list_lock);
+
+free_newpn:
+	if (!keep_newpn)
+		kfree(newpn);
+free_buf:
+	kfree(buf);
+	return ret;
+}
+
+static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
+				      struct seq_file *m)
+{
+	struct blkio_cgroup *blkcg;
+	struct blkio_policy_node *pn;
+
+	seq_printf(m, "dev\tweight\n");
+
+	blkcg = cgroup_to_blkio_cgroup(cgrp);
+	if (!list_empty(&blkcg->policy_list)) {
+		spin_lock_irq(&blkcg->lock);
+		list_for_each_entry(pn, &blkcg->policy_list, node) {
+			seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
+				   MINOR(pn->dev), pn->weight);
+		}
+		spin_unlock_irq(&blkcg->lock);
+	}
+
+	return 0;
 }
 }
-EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
-#endif
 
 
 struct cftype blkio_files[] = {
 struct cftype blkio_files[] = {
+	{
+		.name = "weight_device",
+		.read_seq_string = blkiocg_weight_device_read,
+		.write_string = blkiocg_weight_device_write,
+		.max_write_len = 256,
+	},
 	{
 	{
 		.name = "weight",
 		.name = "weight",
 		.read_u64 = blkiocg_weight_read,
 		.read_u64 = blkiocg_weight_read,
@@ -219,17 +840,61 @@ struct cftype blkio_files[] = {
 	},
 	},
 	{
 	{
 		.name = "time",
 		.name = "time",
-		.read_seq_string = blkiocg_time_read,
+		.read_map = blkiocg_time_read,
 	},
 	},
 	{
 	{
 		.name = "sectors",
 		.name = "sectors",
-		.read_seq_string = blkiocg_sectors_read,
+		.read_map = blkiocg_sectors_read,
+	},
+	{
+		.name = "io_service_bytes",
+		.read_map = blkiocg_io_service_bytes_read,
+	},
+	{
+		.name = "io_serviced",
+		.read_map = blkiocg_io_serviced_read,
+	},
+	{
+		.name = "io_service_time",
+		.read_map = blkiocg_io_service_time_read,
+	},
+	{
+		.name = "io_wait_time",
+		.read_map = blkiocg_io_wait_time_read,
+	},
+	{
+		.name = "io_merged",
+		.read_map = blkiocg_io_merged_read,
+	},
+	{
+		.name = "io_queued",
+		.read_map = blkiocg_io_queued_read,
+	},
+	{
+		.name = "reset_stats",
+		.write_u64 = blkiocg_reset_stats,
 	},
 	},
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-       {
+	{
+		.name = "avg_queue_size",
+		.read_map = blkiocg_avg_queue_size_read,
+	},
+	{
+		.name = "group_wait_time",
+		.read_map = blkiocg_group_wait_time_read,
+	},
+	{
+		.name = "idle_time",
+		.read_map = blkiocg_idle_time_read,
+	},
+	{
+		.name = "empty_time",
+		.read_map = blkiocg_empty_time_read,
+	},
+	{
 		.name = "dequeue",
 		.name = "dequeue",
-		.read_seq_string = blkiocg_dequeue_read,
-       },
+		.read_map = blkiocg_dequeue_read,
+	},
 #endif
 #endif
 };
 };
 
 
@@ -246,37 +911,42 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 	struct blkio_group *blkg;
 	struct blkio_group *blkg;
 	void *key;
 	void *key;
 	struct blkio_policy_type *blkiop;
 	struct blkio_policy_type *blkiop;
+	struct blkio_policy_node *pn, *pntmp;
 
 
 	rcu_read_lock();
 	rcu_read_lock();
-remove_entry:
-	spin_lock_irqsave(&blkcg->lock, flags);
+	do {
+		spin_lock_irqsave(&blkcg->lock, flags);
+
+		if (hlist_empty(&blkcg->blkg_list)) {
+			spin_unlock_irqrestore(&blkcg->lock, flags);
+			break;
+		}
+
+		blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
+					blkcg_node);
+		key = rcu_dereference(blkg->key);
+		__blkiocg_del_blkio_group(blkg);
 
 
-	if (hlist_empty(&blkcg->blkg_list)) {
 		spin_unlock_irqrestore(&blkcg->lock, flags);
 		spin_unlock_irqrestore(&blkcg->lock, flags);
-		goto done;
-	}
 
 
-	blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
-				blkcg_node);
-	key = rcu_dereference(blkg->key);
-	__blkiocg_del_blkio_group(blkg);
+		/*
+		 * This blkio_group is being unlinked as associated cgroup is
+		 * going away. Let all the IO controlling policies know about
+		 * this event. Currently this is static call to one io
+		 * controlling policy. Once we have more policies in place, we
+		 * need some dynamic registration of callback function.
+		 */
+		spin_lock(&blkio_list_lock);
+		list_for_each_entry(blkiop, &blkio_list, list)
+			blkiop->ops.blkio_unlink_group_fn(key, blkg);
+		spin_unlock(&blkio_list_lock);
+	} while (1);
 
 
-	spin_unlock_irqrestore(&blkcg->lock, flags);
+	list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
+		blkio_policy_delete_node(pn);
+		kfree(pn);
+	}
 
 
-	/*
-	 * This blkio_group is being unlinked as associated cgroup is going
-	 * away. Let all the IO controlling policies know about this event.
-	 *
-	 * Currently this is static call to one io controlling policy. Once
-	 * we have more policies in place, we need some dynamic registration
-	 * of callback function.
-	 */
-	spin_lock(&blkio_list_lock);
-	list_for_each_entry(blkiop, &blkio_list, list)
-		blkiop->ops.blkio_unlink_group_fn(key, blkg);
-	spin_unlock(&blkio_list_lock);
-	goto remove_entry;
-done:
 	free_css_id(&blkio_subsys, &blkcg->css);
 	free_css_id(&blkio_subsys, &blkcg->css);
 	rcu_read_unlock();
 	rcu_read_unlock();
 	if (blkcg != &blkio_root_cgroup)
 	if (blkcg != &blkio_root_cgroup)
@@ -307,6 +977,7 @@ done:
 	spin_lock_init(&blkcg->lock);
 	spin_lock_init(&blkcg->lock);
 	INIT_HLIST_HEAD(&blkcg->blkg_list);
 	INIT_HLIST_HEAD(&blkcg->blkg_list);
 
 
+	INIT_LIST_HEAD(&blkcg->policy_list);
 	return &blkcg->css;
 	return &blkcg->css;
 }
 }
 
 

+ 153 - 25
block/blk-cgroup.h

@@ -23,11 +23,84 @@ extern struct cgroup_subsys blkio_subsys;
 #define blkio_subsys_id blkio_subsys.subsys_id
 #define blkio_subsys_id blkio_subsys.subsys_id
 #endif
 #endif
 
 
+enum stat_type {
+	/* Total time spent (in ns) between request dispatch to the driver and
+	 * request completion for IOs doen by this cgroup. This may not be
+	 * accurate when NCQ is turned on. */
+	BLKIO_STAT_SERVICE_TIME = 0,
+	/* Total bytes transferred */
+	BLKIO_STAT_SERVICE_BYTES,
+	/* Total IOs serviced, post merge */
+	BLKIO_STAT_SERVICED,
+	/* Total time spent waiting in scheduler queue in ns */
+	BLKIO_STAT_WAIT_TIME,
+	/* Number of IOs merged */
+	BLKIO_STAT_MERGED,
+	/* Number of IOs queued up */
+	BLKIO_STAT_QUEUED,
+	/* All the single valued stats go below this */
+	BLKIO_STAT_TIME,
+	BLKIO_STAT_SECTORS,
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+	BLKIO_STAT_AVG_QUEUE_SIZE,
+	BLKIO_STAT_IDLE_TIME,
+	BLKIO_STAT_EMPTY_TIME,
+	BLKIO_STAT_GROUP_WAIT_TIME,
+	BLKIO_STAT_DEQUEUE
+#endif
+};
+
+enum stat_sub_type {
+	BLKIO_STAT_READ = 0,
+	BLKIO_STAT_WRITE,
+	BLKIO_STAT_SYNC,
+	BLKIO_STAT_ASYNC,
+	BLKIO_STAT_TOTAL
+};
+
+/* blkg state flags */
+enum blkg_state_flags {
+	BLKG_waiting = 0,
+	BLKG_idling,
+	BLKG_empty,
+};
+
 struct blkio_cgroup {
 struct blkio_cgroup {
 	struct cgroup_subsys_state css;
 	struct cgroup_subsys_state css;
 	unsigned int weight;
 	unsigned int weight;
 	spinlock_t lock;
 	spinlock_t lock;
 	struct hlist_head blkg_list;
 	struct hlist_head blkg_list;
+	struct list_head policy_list; /* list of blkio_policy_node */
+};
+
+struct blkio_group_stats {
+	/* total disk time and nr sectors dispatched by this group */
+	uint64_t time;
+	uint64_t sectors;
+	uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL];
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+	/* Sum of number of IOs queued across all samples */
+	uint64_t avg_queue_size_sum;
+	/* Count of samples taken for average */
+	uint64_t avg_queue_size_samples;
+	/* How many times this group has been removed from service tree */
+	unsigned long dequeue;
+
+	/* Total time spent waiting for it to be assigned a timeslice. */
+	uint64_t group_wait_time;
+	uint64_t start_group_wait_time;
+
+	/* Time spent idling for this blkio_group */
+	uint64_t idle_time;
+	uint64_t start_idle_time;
+	/*
+	 * Total time when we have requests queued and do not contain the
+	 * current active queue.
+	 */
+	uint64_t empty_time;
+	uint64_t start_empty_time;
+	uint16_t flags;
+#endif
 };
 };
 
 
 struct blkio_group {
 struct blkio_group {
@@ -35,20 +108,25 @@ struct blkio_group {
 	void *key;
 	void *key;
 	struct hlist_node blkcg_node;
 	struct hlist_node blkcg_node;
 	unsigned short blkcg_id;
 	unsigned short blkcg_id;
-#ifdef CONFIG_DEBUG_BLK_CGROUP
 	/* Store cgroup path */
 	/* Store cgroup path */
 	char path[128];
 	char path[128];
-	/* How many times this group has been removed from service tree */
-	unsigned long dequeue;
-#endif
 	/* The device MKDEV(major, minor), this group has been created for */
 	/* The device MKDEV(major, minor), this group has been created for */
-	dev_t   dev;
+	dev_t dev;
 
 
-	/* total disk time and nr sectors dispatched by this group */
-	unsigned long time;
-	unsigned long sectors;
+	/* Need to serialize the stats in the case of reset/update */
+	spinlock_t stats_lock;
+	struct blkio_group_stats stats;
 };
 };
 
 
+struct blkio_policy_node {
+	struct list_head node;
+	dev_t dev;
+	unsigned int weight;
+};
+
+extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
+				     dev_t dev);
+
 typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
 typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
 typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
 typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
 						unsigned int weight);
 						unsigned int weight);
@@ -67,6 +145,11 @@ struct blkio_policy_type {
 extern void blkio_policy_register(struct blkio_policy_type *);
 extern void blkio_policy_register(struct blkio_policy_type *);
 extern void blkio_policy_unregister(struct blkio_policy_type *);
 extern void blkio_policy_unregister(struct blkio_policy_type *);
 
 
+static inline char *blkg_path(struct blkio_group *blkg)
+{
+	return blkg->path;
+}
+
 #else
 #else
 
 
 struct blkio_group {
 struct blkio_group {
@@ -78,6 +161,8 @@ struct blkio_policy_type {
 static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
 static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
 static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
 static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
 
 
+static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
+
 #endif
 #endif
 
 
 #define BLKIO_WEIGHT_MIN	100
 #define BLKIO_WEIGHT_MIN	100
@@ -85,16 +170,42 @@ static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
 #define BLKIO_WEIGHT_DEFAULT	500
 #define BLKIO_WEIGHT_DEFAULT	500
 
 
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-static inline char *blkg_path(struct blkio_group *blkg)
-{
-	return blkg->path;
-}
-void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
+void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg);
+void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
 				unsigned long dequeue);
+void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg);
+void blkiocg_update_idle_time_stats(struct blkio_group *blkg);
+void blkiocg_set_start_empty_time(struct blkio_group *blkg);
+
+#define BLKG_FLAG_FNS(name)						\
+static inline void blkio_mark_blkg_##name(				\
+		struct blkio_group_stats *stats)			\
+{									\
+	stats->flags |= (1 << BLKG_##name);				\
+}									\
+static inline void blkio_clear_blkg_##name(				\
+		struct blkio_group_stats *stats)			\
+{									\
+	stats->flags &= ~(1 << BLKG_##name);				\
+}									\
+static inline int blkio_blkg_##name(struct blkio_group_stats *stats)	\
+{									\
+	return (stats->flags & (1 << BLKG_##name)) != 0;		\
+}									\
+
+BLKG_FLAG_FNS(waiting)
+BLKG_FLAG_FNS(idling)
+BLKG_FLAG_FNS(empty)
+#undef BLKG_FLAG_FNS
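For reference, BLKG_FLAG_FNS(waiting) expands to the three helpers shown below, assuming a BLKG_waiting bit constant defined alongside these macros; the idling and empty variants are analogous:

static inline void blkio_mark_blkg_waiting(struct blkio_group_stats *stats)
{
	stats->flags |= (1 << BLKG_waiting);
}
static inline void blkio_clear_blkg_waiting(struct blkio_group_stats *stats)
{
	stats->flags &= ~(1 << BLKG_waiting);
}
static inline int blkio_blkg_waiting(struct blkio_group_stats *stats)
{
	return (stats->flags & (1 << BLKG_waiting)) != 0;
}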
 #else
-static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
-static inline void blkiocg_update_blkio_group_dequeue_stats(
-			struct blkio_group *blkg, unsigned long dequeue) {}
+static inline void blkiocg_update_avg_queue_size_stats(
+						struct blkio_group *blkg) {}
+static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
+						unsigned long dequeue) {}
+static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
+{}
+static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {}
+static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
 #endif
 
 #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
@@ -105,26 +216,43 @@ extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
 extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
 extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
 						void *key);
-void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
-			unsigned long time, unsigned long sectors);
+void blkiocg_update_timeslice_used(struct blkio_group *blkg,
+					unsigned long time);
+void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes,
+						bool direction, bool sync);
+void blkiocg_update_completion_stats(struct blkio_group *blkg,
+	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync);
+void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
+					bool sync);
+void blkiocg_update_io_add_stats(struct blkio_group *blkg,
+		struct blkio_group *curr_blkg, bool direction, bool sync);
+void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
+					bool direction, bool sync);
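The start_time/io_start_time pair taken by blkiocg_update_completion_stats() comes from the request timestamps that blk-core now records (set_start_time_ns() at allocation, set_io_start_time_ns() at dispatch; see the blk-core.c hunks below). A hedged sketch of how a scheduler might feed a finished request into the hook (the wrapper is illustrative; CFQ's real call site is in cfq_completed_request()):

static void example_account_completion(struct blkio_group *blkg,
				       struct request *rq)
{
	/* charge wait time and service time for this request to its group */
	blkiocg_update_completion_stats(blkg, rq_start_time_ns(rq),
					rq_io_start_time_ns(rq),
					rq_data_dir(rq), rq_is_sync(rq));
}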
 #else
 struct cgroup;
 static inline struct blkio_cgroup *
 cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
 
 static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
-			struct blkio_group *blkg, void *key, dev_t dev)
-{
-}
+			struct blkio_group *blkg, void *key, dev_t dev) {}
 
 static inline int
 blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
 
 static inline struct blkio_group *
 blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
-static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
-			unsigned long time, unsigned long sectors)
-{
-}
+static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
+						unsigned long time) {}
+static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
+				uint64_t bytes, bool direction, bool sync) {}
+static inline void blkiocg_update_completion_stats(struct blkio_group *blkg,
+		uint64_t start_time, uint64_t io_start_time, bool direction,
+		bool sync) {}
+static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
+						bool direction, bool sync) {}
+static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg,
+		struct blkio_group *curr_blkg, bool direction, bool sync) {}
+static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
+						bool direction, bool sync) {}
 #endif
 #endif /* _BLK_CGROUP_H */

+ 27 - 4
block/blk-core.c

@@ -127,6 +127,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	rq->tag = -1;
 	rq->ref_count = 1;
 	rq->start_time = jiffies;
+	set_start_time_ns(rq);
 }
 EXPORT_SYMBOL(blk_rq_init);
 
@@ -450,6 +451,7 @@ void blk_cleanup_queue(struct request_queue *q)
 	 */
 	blk_sync_queue(q);
 
+	del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
 	mutex_lock(&q->sysfs_lock);
 	queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
 	mutex_unlock(&q->sysfs_lock);
@@ -510,6 +512,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 		return NULL;
 	}
 
+	setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
+		    laptop_mode_timer_fn, (unsigned long) q);
 	init_timer(&q->unplug_timer);
 	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
 	INIT_LIST_HEAD(&q->timeout_list);
@@ -568,6 +572,22 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 {
 	struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
 
+	return blk_init_allocated_queue_node(q, rfn, lock, node_id);
+}
+EXPORT_SYMBOL(blk_init_queue_node);
+
+struct request_queue *
+blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
+			 spinlock_t *lock)
+{
+	return blk_init_allocated_queue_node(q, rfn, lock, -1);
+}
+EXPORT_SYMBOL(blk_init_allocated_queue);
+
+struct request_queue *
+blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
+			      spinlock_t *lock, int node_id)
+{
 	if (!q)
 		return NULL;
 
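Splitting blk_init_queue_node() into blk_alloc_queue_node() plus blk_init_allocated_queue{,_node}() lets a driver allocate its queue first and attach the request function later. A hedged sketch of a caller using the new entry point (names illustrative):

static struct request_queue *example_setup_queue(request_fn_proc *do_request,
						 spinlock_t *lock)
{
	struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, -1);

	if (!q)
		return NULL;
	/* returns NULL on failure, in which case the queue reference is put */
	return blk_init_allocated_queue(q, do_request, lock);
}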
@@ -601,7 +621,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 	blk_put_queue(q);
 	return NULL;
 }
-EXPORT_SYMBOL(blk_init_queue_node);
+EXPORT_SYMBOL(blk_init_allocated_queue_node);
 
 int blk_get_queue(struct request_queue *q)
 {
@@ -1198,6 +1218,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 		if (!blk_rq_cpu_valid(req))
 			req->cpu = bio->bi_comp_cpu;
 		drive_stat_acct(req, 0);
+		elv_bio_merged(q, req, bio);
 		if (!attempt_back_merge(q, req))
 			elv_merged_request(q, req, el_ret);
 		goto out;
@@ -1231,6 +1252,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 		if (!blk_rq_cpu_valid(req))
 			req->cpu = bio->bi_comp_cpu;
 		drive_stat_acct(req, 0);
+		elv_bio_merged(q, req, bio);
 		if (!attempt_front_merge(q, req))
 			elv_merged_request(q, req, el_ret);
 		goto out;
@@ -1855,8 +1877,10 @@ void blk_dequeue_request(struct request *rq)
 	 * and to it is freed is accounted as io that is in progress at
 	 * the driver side.
 	 */
-	if (blk_account_rq(rq))
+	if (blk_account_rq(rq)) {
 		q->in_flight[rq_is_sync(rq)]++;
+		set_io_start_time_ns(rq);
+	}
 }
 
 /**
@@ -2098,7 +2122,7 @@ static void blk_finish_request(struct request *req, int error)
 	BUG_ON(blk_queued_rq(req));
 
 	if (unlikely(laptop_mode) && blk_fs_request(req))
-		laptop_io_completion();
+		laptop_io_completion(&req->q->backing_dev_info);
 
 	blk_delete_timer(req);
 
@@ -2517,4 +2541,3 @@ int __init blk_dev_init(void)
 
 	return 0;
 }
-

+ 233 - 0
block/blk-lib.c

@@ -0,0 +1,233 @@
+/*
+ * Functions related to generic helper functions
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/scatterlist.h>
+
+#include "blk.h"
+
+static void blkdev_discard_end_io(struct bio *bio, int err)
+{
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+	}
+
+	if (bio->bi_private)
+		complete(bio->bi_private);
+	__free_page(bio_page(bio));
+
+	bio_put(bio);
+}
+
+/**
+ * blkdev_issue_discard - queue a discard
+ * @bdev:	blockdev to issue discard for
+ * @sector:	start sector
+ * @nr_sects:	number of sectors to discard
+ * @gfp_mask:	memory allocation flags (for bio_alloc)
+ * @flags:	BLKDEV_IFL_* flags to control behaviour
+ *
+ * Description:
+ *    Issue a discard request for the sectors in question.
+ */
+int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
+{
+	DECLARE_COMPLETION_ONSTACK(wait);
+	struct request_queue *q = bdev_get_queue(bdev);
+	int type = flags & BLKDEV_IFL_BARRIER ?
+		DISCARD_BARRIER : DISCARD_NOBARRIER;
+	struct bio *bio;
+	struct page *page;
+	int ret = 0;
+
+	if (!q)
+		return -ENXIO;
+
+	if (!blk_queue_discard(q))
+		return -EOPNOTSUPP;
+
+	while (nr_sects && !ret) {
+		unsigned int sector_size = q->limits.logical_block_size;
+		unsigned int max_discard_sectors =
+			min(q->limits.max_discard_sectors, UINT_MAX >> 9);
+
+		bio = bio_alloc(gfp_mask, 1);
+		if (!bio)
+			goto out;
+		bio->bi_sector = sector;
+		bio->bi_end_io = blkdev_discard_end_io;
+		bio->bi_bdev = bdev;
+		if (flags & BLKDEV_IFL_WAIT)
+			bio->bi_private = &wait;
+
+		/*
+		 * Add a zeroed one-sector payload as that's what
+		 * our current implementations need.  If we'll ever need
+		 * more the interface will need revisiting.
+		 */
+		page = alloc_page(gfp_mask | __GFP_ZERO);
+		if (!page)
+			goto out_free_bio;
+		if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
+			goto out_free_page;
+
+		/*
+		 * And override the bio size - the way discard works we
+		 * touch many more blocks on disk than the actual payload
+		 * length.
+		 */
+		if (nr_sects > max_discard_sectors) {
+			bio->bi_size = max_discard_sectors << 9;
+			nr_sects -= max_discard_sectors;
+			sector += max_discard_sectors;
+		} else {
+			bio->bi_size = nr_sects << 9;
+			nr_sects = 0;
+		}
+
+		bio_get(bio);
+		submit_bio(type, bio);
+
+		if (flags & BLKDEV_IFL_WAIT)
+			wait_for_completion(&wait);
+
+		if (bio_flagged(bio, BIO_EOPNOTSUPP))
+			ret = -EOPNOTSUPP;
+		else if (!bio_flagged(bio, BIO_UPTODATE))
+			ret = -EIO;
+		bio_put(bio);
+	}
+	return ret;
+out_free_page:
+	__free_page(page);
+out_free_bio:
+	bio_put(bio);
+out:
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(blkdev_issue_discard);
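A hedged usage sketch for the helper above: discard an entire block device and wait for completion, much as blk_ioctl_discard() does in the block/ioctl.c hunk further down (the wrapper name is illustrative):

static int example_discard_whole_device(struct block_device *bdev)
{
	sector_t nr_sects = bdev->bd_inode->i_size >> 9;

	return blkdev_issue_discard(bdev, 0, nr_sects, GFP_KERNEL,
				    BLKDEV_IFL_WAIT);
}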
+
+struct bio_batch
+{
+	atomic_t 		done;
+	unsigned long 		flags;
+	struct completion 	*wait;
+	bio_end_io_t		*end_io;
+};
+
+static void bio_batch_end_io(struct bio *bio, int err)
+{
+	struct bio_batch *bb = bio->bi_private;
+
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			set_bit(BIO_EOPNOTSUPP, &bb->flags);
+		else
+			clear_bit(BIO_UPTODATE, &bb->flags);
+	}
+	if (bb) {
+		if (bb->end_io)
+			bb->end_io(bio, err);
+		atomic_inc(&bb->done);
+		complete(bb->wait);
+	}
+	bio_put(bio);
+}
+
+/**
+ * blkdev_issue_zeroout - generate a number of zero-filled write bios
+ * @bdev:	blockdev to issue
+ * @sector:	start sector
+ * @nr_sects:	number of sectors to write
+ * @gfp_mask:	memory allocation flags (for bio_alloc)
+ * @flags:	BLKDEV_IFL_* flags to control behaviour
+ *
+ * Description:
+ *  Generate and issue a number of bios with zero-filled pages.
+ *  Send a barrier at the beginning and at the end if requested; this
+ *  guarantees correct request ordering. An empty barrier allows us to
+ *  avoid a post-queue flush.
+ */
+
+int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
+			sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
+{
+	int ret = 0;
+	struct bio *bio;
+	struct bio_batch bb;
+	unsigned int sz, issued = 0;
+	DECLARE_COMPLETION_ONSTACK(wait);
+
+	atomic_set(&bb.done, 0);
+	bb.flags = 1 << BIO_UPTODATE;
+	bb.wait = &wait;
+	bb.end_io = NULL;
+
+	if (flags & BLKDEV_IFL_BARRIER) {
+		/* issue async barrier before the data */
+		ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0);
+		if (ret)
+			return ret;
+	}
+submit:
+	while (nr_sects != 0) {
+		bio = bio_alloc(gfp_mask,
+				min(nr_sects, (sector_t)BIO_MAX_PAGES));
+		if (!bio)
+			break;
+
+		bio->bi_sector = sector;
+		bio->bi_bdev   = bdev;
+		bio->bi_end_io = bio_batch_end_io;
+		if (flags & BLKDEV_IFL_WAIT)
+			bio->bi_private = &bb;
+
+		while (nr_sects != 0) {
+			sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
+			if (sz == 0)
+				/* bio has maximum size possible */
+				break;
+			ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
+			nr_sects -= ret >> 9;
+			sector += ret >> 9;
+			if (ret < (sz << 9))
+				break;
+		}
+		issued++;
+		submit_bio(WRITE, bio);
+	}
+	/*
+	 * When all data bios are in flight, send the final barrier if requested.
+	 */
+	if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER)
+		ret = blkdev_issue_flush(bdev, gfp_mask, NULL,
+					flags & BLKDEV_IFL_WAIT);
+
+
+	if (flags & BLKDEV_IFL_WAIT)
+		/* Wait for bios in-flight */
+		while ( issued != atomic_read(&bb.done))
+			wait_for_completion(&wait);
+
+	if (!test_bit(BIO_UPTODATE, &bb.flags))
+		/* One of the bios in the batch completed with an error. */
+		ret = -EIO;
+
+	if (ret)
+		goto out;
+
+	if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+	if (nr_sects != 0)
+		goto submit;
+out:
+	return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_zeroout);
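A matching hedged sketch for blkdev_issue_zeroout(): zero a range synchronously, bracketed by the optional barriers (wrapper name illustrative):

static int example_zero_range(struct block_device *bdev, sector_t start,
			      sector_t nr_sects)
{
	return blkdev_issue_zeroout(bdev, start, nr_sects, GFP_NOIO,
				    BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
}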

+ 64 - 17
block/cfq-iosched.c

@@ -55,6 +55,7 @@ static const int cfq_hist_divisor = 4;
 #define RQ_CIC(rq)		\
 #define RQ_CIC(rq)		\
 	((struct cfq_io_context *) (rq)->elevator_private)
 	((struct cfq_io_context *) (rq)->elevator_private)
 #define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elevator_private2)
 #define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elevator_private2)
+#define RQ_CFQG(rq)		(struct cfq_group *) ((rq)->elevator_private3)
 
 
 static struct kmem_cache *cfq_pool;
 static struct kmem_cache *cfq_pool;
 static struct kmem_cache *cfq_ioc_pool;
 static struct kmem_cache *cfq_ioc_pool;
@@ -143,8 +144,6 @@ struct cfq_queue {
 	struct cfq_queue *new_cfqq;
 	struct cfq_queue *new_cfqq;
 	struct cfq_group *cfqg;
 	struct cfq_group *cfqg;
 	struct cfq_group *orig_cfqg;
 	struct cfq_group *orig_cfqg;
-	/* Sectors dispatched in current dispatch round */
-	unsigned long nr_sectors;
 };
 };
 
 
 /*
 /*
@@ -346,7 +345,7 @@ CFQ_CFQQ_FNS(deep);
 CFQ_CFQQ_FNS(wait_busy);
 CFQ_CFQQ_FNS(wait_busy);
 #undef CFQ_CFQQ_FNS
 #undef CFQ_CFQQ_FNS
 
 
-#ifdef CONFIG_DEBUG_CFQ_IOSCHED
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
 	blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
 	blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
 			cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
 			cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
@@ -858,7 +857,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	if (!RB_EMPTY_NODE(&cfqg->rb_node))
 	if (!RB_EMPTY_NODE(&cfqg->rb_node))
 		cfq_rb_erase(&cfqg->rb_node, st);
 		cfq_rb_erase(&cfqg->rb_node, st);
 	cfqg->saved_workload_slice = 0;
 	cfqg->saved_workload_slice = 0;
-	blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1);
+	blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
 }
 }
 
 
 static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
 static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
@@ -884,8 +883,7 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
 			slice_used = cfqq->allocated_slice;
 			slice_used = cfqq->allocated_slice;
 	}
 	}
 
 
-	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used,
-				cfqq->nr_sectors);
+	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used);
 	return slice_used;
 	return slice_used;
 }
 }
 
 
@@ -919,8 +917,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 
 
 	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
 	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
 					st->min_vdisktime);
 					st->min_vdisktime);
-	blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl,
-						cfqq->nr_sectors);
+	blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
+	blkiocg_set_start_empty_time(&cfqg->blkg);
 }
 }
 
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
@@ -961,7 +959,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
 	if (!cfqg)
 	if (!cfqg)
 		goto done;
 		goto done;
 
 
-	cfqg->weight = blkcg->weight;
 	for_each_cfqg_st(cfqg, i, j, st)
 	for_each_cfqg_st(cfqg, i, j, st)
 		*st = CFQ_RB_ROOT;
 		*st = CFQ_RB_ROOT;
 	RB_CLEAR_NODE(&cfqg->rb_node);
 	RB_CLEAR_NODE(&cfqg->rb_node);
@@ -978,6 +975,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
 	sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
 	sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
 	blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
 	blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
 					MKDEV(major, minor));
 					MKDEV(major, minor));
+	cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
 
 
 	/* Add group on cfqd list */
 	/* Add group on cfqd list */
 	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
 	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
@@ -1004,6 +1002,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
 	return cfqg;
 	return cfqg;
 }
 }
 
 
+static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
+{
+	atomic_inc(&cfqg->ref);
+	return cfqg;
+}
+
 static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
 static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
 {
 {
 	/* Currently, all async queues are mapped to root group */
 	/* Currently, all async queues are mapped to root group */
@@ -1087,6 +1091,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
 {
 {
 	return &cfqd->root_group;
 	return &cfqd->root_group;
 }
 }
+
+static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
+{
+	return cfqg;
+}
+
 static inline void
 static inline void
 cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
 cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
 	cfqq->cfqg = cfqg;
 	cfqq->cfqg = cfqg;
@@ -1389,7 +1399,12 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
 {
 {
 	elv_rb_del(&cfqq->sort_list, rq);
 	elv_rb_del(&cfqq->sort_list, rq);
 	cfqq->queued[rq_is_sync(rq)]--;
 	cfqq->queued[rq_is_sync(rq)]--;
+	blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq),
+						rq_is_sync(rq));
 	cfq_add_rq_rb(rq);
 	cfq_add_rq_rb(rq);
+	blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
+			&cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
+			rq_is_sync(rq));
 }
 }
 
 
 static struct request *
 static struct request *
@@ -1445,6 +1460,8 @@ static void cfq_remove_request(struct request *rq)
 	cfq_del_rq_rb(rq);
 	cfq_del_rq_rb(rq);
 
 
 	cfqq->cfqd->rq_queued--;
 	cfqq->cfqd->rq_queued--;
+	blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq),
+						rq_is_sync(rq));
 	if (rq_is_meta(rq)) {
 	if (rq_is_meta(rq)) {
 		WARN_ON(!cfqq->meta_pending);
 		WARN_ON(!cfqq->meta_pending);
 		cfqq->meta_pending--;
 		cfqq->meta_pending--;
@@ -1476,6 +1493,13 @@ static void cfq_merged_request(struct request_queue *q, struct request *req,
 	}
 	}
 }
 }
 
 
+static void cfq_bio_merged(struct request_queue *q, struct request *req,
+				struct bio *bio)
+{
+	blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, bio_data_dir(bio),
+					cfq_bio_sync(bio));
+}
+
 static void
 static void
 cfq_merged_requests(struct request_queue *q, struct request *rq,
 cfq_merged_requests(struct request_queue *q, struct request *rq,
 		    struct request *next)
 		    struct request *next)
@@ -1493,6 +1517,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
 	if (cfqq->next_rq == next)
 	if (cfqq->next_rq == next)
 		cfqq->next_rq = rq;
 		cfqq->next_rq = rq;
 	cfq_remove_request(next);
 	cfq_remove_request(next);
+	blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(next),
+					rq_is_sync(next));
 }
 }
 
 
 static int cfq_allow_merge(struct request_queue *q, struct request *rq,
 static int cfq_allow_merge(struct request_queue *q, struct request *rq,
@@ -1520,18 +1546,24 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
 	return cfqq == RQ_CFQQ(rq);
 	return cfqq == RQ_CFQQ(rq);
 }
 }
 
 
+static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+	del_timer(&cfqd->idle_slice_timer);
+	blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
+}
+
 static void __cfq_set_active_queue(struct cfq_data *cfqd,
 static void __cfq_set_active_queue(struct cfq_data *cfqd,
 				   struct cfq_queue *cfqq)
 				   struct cfq_queue *cfqq)
 {
 {
 	if (cfqq) {
 	if (cfqq) {
 		cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
 		cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
 				cfqd->serving_prio, cfqd->serving_type);
 				cfqd->serving_prio, cfqd->serving_type);
+		blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
 		cfqq->slice_start = 0;
 		cfqq->slice_start = 0;
 		cfqq->dispatch_start = jiffies;
 		cfqq->dispatch_start = jiffies;
 		cfqq->allocated_slice = 0;
 		cfqq->allocated_slice = 0;
 		cfqq->slice_end = 0;
 		cfqq->slice_end = 0;
 		cfqq->slice_dispatch = 0;
 		cfqq->slice_dispatch = 0;
-		cfqq->nr_sectors = 0;
 
 
 		cfq_clear_cfqq_wait_request(cfqq);
 		cfq_clear_cfqq_wait_request(cfqq);
 		cfq_clear_cfqq_must_dispatch(cfqq);
 		cfq_clear_cfqq_must_dispatch(cfqq);
@@ -1539,7 +1571,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
 		cfq_clear_cfqq_fifo_expire(cfqq);
 		cfq_clear_cfqq_fifo_expire(cfqq);
 		cfq_mark_cfqq_slice_new(cfqq);
 		cfq_mark_cfqq_slice_new(cfqq);
 
 
-		del_timer(&cfqd->idle_slice_timer);
+		cfq_del_timer(cfqd, cfqq);
 	}
 	}
 
 
 	cfqd->active_queue = cfqq;
 	cfqd->active_queue = cfqq;
@@ -1555,7 +1587,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
 	cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
 
 
 	if (cfq_cfqq_wait_request(cfqq))
 	if (cfq_cfqq_wait_request(cfqq))
-		del_timer(&cfqd->idle_slice_timer);
+		cfq_del_timer(cfqd, cfqq);
 
 
 	cfq_clear_cfqq_wait_request(cfqq);
 	cfq_clear_cfqq_wait_request(cfqq);
 	cfq_clear_cfqq_wait_busy(cfqq);
 	cfq_clear_cfqq_wait_busy(cfqq);
@@ -1857,6 +1889,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 	sl = cfqd->cfq_slice_idle;
 	sl = cfqd->cfq_slice_idle;
 
 
 	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
 	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
+	blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
 	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
 	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
 }
 }
 
 
@@ -1876,7 +1909,8 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
 	elv_dispatch_sort(q, rq);
 	elv_dispatch_sort(q, rq);
 
 
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
-	cfqq->nr_sectors += blk_rq_sectors(rq);
+	blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
+					rq_data_dir(rq), rq_is_sync(rq));
 }
 }
 
 
 /*
 /*
@@ -3185,11 +3219,14 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		if (cfq_cfqq_wait_request(cfqq)) {
 		if (cfq_cfqq_wait_request(cfqq)) {
 			if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
 			if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
 			    cfqd->busy_queues > 1) {
 			    cfqd->busy_queues > 1) {
-				del_timer(&cfqd->idle_slice_timer);
+				cfq_del_timer(cfqd, cfqq);
 				cfq_clear_cfqq_wait_request(cfqq);
 				cfq_clear_cfqq_wait_request(cfqq);
 				__blk_run_queue(cfqd->queue);
 				__blk_run_queue(cfqd->queue);
-			} else
+			} else {
+				blkiocg_update_idle_time_stats(
+						&cfqq->cfqg->blkg);
 				cfq_mark_cfqq_must_dispatch(cfqq);
 				cfq_mark_cfqq_must_dispatch(cfqq);
+			}
 		}
 		}
 	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
 	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
 		/*
 		/*
@@ -3214,7 +3251,9 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
 	rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
 	rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
 	list_add_tail(&rq->queuelist, &cfqq->fifo);
 	list_add_tail(&rq->queuelist, &cfqq->fifo);
 	cfq_add_rq_rb(rq);
 	cfq_add_rq_rb(rq);
-
+	blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
+			&cfqd->serving_group->blkg, rq_data_dir(rq),
+			rq_is_sync(rq));
 	cfq_rq_enqueued(cfqd, cfqq, rq);
 	cfq_rq_enqueued(cfqd, cfqq, rq);
 }
 }
 
 
@@ -3300,6 +3339,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	WARN_ON(!cfqq->dispatched);
 	WARN_ON(!cfqq->dispatched);
 	cfqd->rq_in_driver--;
 	cfqd->rq_in_driver--;
 	cfqq->dispatched--;
 	cfqq->dispatched--;
+	blkiocg_update_completion_stats(&cfqq->cfqg->blkg, rq_start_time_ns(rq),
+			rq_io_start_time_ns(rq), rq_data_dir(rq),
+			rq_is_sync(rq));
 
 
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
 
 
@@ -3440,6 +3482,10 @@ static void cfq_put_request(struct request *rq)
 		rq->elevator_private = NULL;
 		rq->elevator_private = NULL;
 		rq->elevator_private2 = NULL;
 		rq->elevator_private2 = NULL;
 
 
+		/* Put down rq reference on cfqg */
+		cfq_put_cfqg(RQ_CFQG(rq));
+		rq->elevator_private3 = NULL;
+
 		cfq_put_queue(cfqq);
 		cfq_put_queue(cfqq);
 	}
 	}
 }
 }
@@ -3528,6 +3574,7 @@ new_queue:
 
 
 	rq->elevator_private = cic;
 	rq->elevator_private = cic;
 	rq->elevator_private2 = cfqq;
 	rq->elevator_private2 = cfqq;
+	rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
 	return 0;
 	return 0;
 
 
 queue_fail:
 queue_fail:
@@ -3743,7 +3790,6 @@ static void *cfq_init_queue(struct request_queue *q)
 	 * second, in order to have larger depth for async operations.
 	 * second, in order to have larger depth for async operations.
 	 */
 	 */
 	cfqd->last_delayed_sync = jiffies - HZ;
 	cfqd->last_delayed_sync = jiffies - HZ;
-	INIT_RCU_HEAD(&cfqd->rcu);
 	return cfqd;
 	return cfqd;
 }
 }
 
 
@@ -3872,6 +3918,7 @@ static struct elevator_type iosched_cfq = {
 		.elevator_merged_fn =		cfq_merged_request,
 		.elevator_merged_fn =		cfq_merged_request,
 		.elevator_merge_req_fn =	cfq_merged_requests,
 		.elevator_merge_req_fn =	cfq_merged_requests,
 		.elevator_allow_merge_fn =	cfq_allow_merge,
 		.elevator_allow_merge_fn =	cfq_allow_merge,
+		.elevator_bio_merged_fn =	cfq_bio_merged,
 		.elevator_dispatch_fn =		cfq_dispatch_requests,
 		.elevator_dispatch_fn =		cfq_dispatch_requests,
 		.elevator_add_req_fn =		cfq_insert_request,
 		.elevator_add_req_fn =		cfq_insert_request,
 		.elevator_activate_req_fn =	cfq_activate_request,
 		.elevator_activate_req_fn =	cfq_activate_request,

+ 11 - 0
block/elevator.c

@@ -539,6 +539,15 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
 	q->last_merge = rq;
 }
 
+void elv_bio_merged(struct request_queue *q, struct request *rq,
+			struct bio *bio)
+{
+	struct elevator_queue *e = q->elevator;
+
+	if (e->ops->elevator_bio_merged_fn)
+		e->ops->elevator_bio_merged_fn(q, rq, bio);
+}
+
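elv_bio_merged() is called from __make_request() whenever a bio is merged into an existing request (see the blk-core.c hunks above); a scheduler opts in by filling the elevator_bio_merged_fn slot in its ops table, as CFQ does below with cfq_bio_merged. A hedged sketch (names illustrative):

static void example_bio_merged_fn(struct request_queue *q, struct request *rq,
				  struct bio *bio)
{
	/* per-cgroup accounting of the merged bio would go here */
}

static struct elevator_ops example_ops = {
	.elevator_bio_merged_fn	= example_bio_merged_fn,
	/* remaining hooks omitted */
};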
 void elv_requeue_request(struct request_queue *q, struct request *rq)
 {
 	/*
@@ -921,6 +930,7 @@ int elv_register_queue(struct request_queue *q)
 	}
 	return error;
 }
+EXPORT_SYMBOL(elv_register_queue);
 
 static void __elv_unregister_queue(struct elevator_queue *e)
 {
@@ -933,6 +943,7 @@ void elv_unregister_queue(struct request_queue *q)
 	if (q)
 		__elv_unregister_queue(q->elevator);
 }
+EXPORT_SYMBOL(elv_unregister_queue);
 
 void elv_register(struct elevator_type *e)
 {

+ 1 - 1
block/genhd.c

@@ -596,6 +596,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
 
 	return disk;
 }
+EXPORT_SYMBOL(get_gendisk);
 
 /**
  * bdget_disk - do bdget() by gendisk and partition number
@@ -987,7 +988,6 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno)
 	if (!new_ptbl)
 		return -ENOMEM;
 
-	INIT_RCU_HEAD(&new_ptbl->rcu_head);
 	new_ptbl->len = target;
 
 	for (i = 0; i < len; i++)

+ 1 - 1
block/ioctl.c

@@ -126,7 +126,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 	if (start + len > (bdev->bd_inode->i_size >> 9))
 		return -EINVAL;
 	return blkdev_issue_discard(bdev, start, len, GFP_KERNEL,
-				    DISCARD_FL_WAIT);
+				    BLKDEV_IFL_WAIT);
 }
 
 static int put_ushort(unsigned long arg, unsigned short val)

+ 11 - 11
drivers/block/Kconfig

@@ -76,6 +76,17 @@ config BLK_DEV_XD
 
 	  It's pretty unlikely that you have one of these: say N.
 
+config GDROM
+	tristate "SEGA Dreamcast GD-ROM drive"
+	depends on SH_DREAMCAST
+	help
+	  A standard SEGA Dreamcast comes with a modified CD ROM drive called a
+	  "GD-ROM" by SEGA to signify it is capable of reading special disks
+	  with up to 1 GB of data. This drive will also read standard CD ROM
+	  disks. Select this option to access any disks in your GD ROM drive.
+	  Most users will want to say "Y" here.
+	  You can also build this as a module which will be called gdrom.
+
 config PARIDE
 	tristate "Parallel port IDE device support"
 	depends on PARPORT_PC
@@ -103,17 +114,6 @@ config PARIDE
 	  "MicroSolutions backpack protocol", "DataStor Commuter protocol"
 	  etc.).
 
-config GDROM
-	tristate "SEGA Dreamcast GD-ROM drive"
-	depends on SH_DREAMCAST
-	help
-	  A standard SEGA Dreamcast comes with a modified CD ROM drive called a
-	  "GD-ROM" by SEGA to signify it is capable of reading special disks
-	  with up to 1 GB of data. This drive will also read standard CD ROM
-	  disks. Select this option to access any disks in your GD ROM drive.
-	  Most users will want to say "Y" here.
-	  You can also build this as a module which will be called gdrom.
-
 source "drivers/block/paride/Kconfig"
 
 config BLK_CPQ_DA

+ 14 - 7
drivers/block/drbd/drbd_bitmap.c

@@ -84,6 +84,9 @@ struct drbd_bitmap {
 #define BM_MD_IO_ERROR  1
 #define BM_MD_IO_ERROR  1
 #define BM_P_VMALLOCED  2
 #define BM_P_VMALLOCED  2
 
 
+static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
+			       unsigned long e, int val, const enum km_type km);
+
 static int bm_is_locked(struct drbd_bitmap *b)
 static int bm_is_locked(struct drbd_bitmap *b)
 {
 {
 	return test_bit(BM_LOCKED, &b->bm_flags);
 	return test_bit(BM_LOCKED, &b->bm_flags);
@@ -441,7 +444,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
  * In case this is actually a resize, we copy the old bitmap into the new one.
  * In case this is actually a resize, we copy the old bitmap into the new one.
  * Otherwise, the bitmap is initialized to all bits set.
  * Otherwise, the bitmap is initialized to all bits set.
  */
  */
-int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity)
+int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
 {
 {
 	struct drbd_bitmap *b = mdev->bitmap;
 	struct drbd_bitmap *b = mdev->bitmap;
 	unsigned long bits, words, owords, obits, *p_addr, *bm;
 	unsigned long bits, words, owords, obits, *p_addr, *bm;
@@ -516,7 +519,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity)
 	obits  = b->bm_bits;
 	obits  = b->bm_bits;
 
 
 	growing = bits > obits;
 	growing = bits > obits;
-	if (opages)
+	if (opages && growing && set_new_bits)
 		bm_set_surplus(b);
 		bm_set_surplus(b);
 
 
 	b->bm_pages = npages;
 	b->bm_pages = npages;
@@ -526,8 +529,12 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity)
 	b->bm_dev_capacity = capacity;
 	b->bm_dev_capacity = capacity;
 
 
 	if (growing) {
 	if (growing) {
-		bm_memset(b, owords, 0xff, words-owords);
-		b->bm_set += bits - obits;
+		if (set_new_bits) {
+			bm_memset(b, owords, 0xff, words-owords);
+			b->bm_set += bits - obits;
+		} else
+			bm_memset(b, owords, 0x00, words-owords);
+
 	}
 	}
 
 
 	if (want < have) {
 	if (want < have) {
@@ -773,7 +780,7 @@ static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int
 	/* nothing to do, on disk == in memory */
 	/* nothing to do, on disk == in memory */
 # define bm_cpu_to_lel(x) ((void)0)
 # define bm_cpu_to_lel(x) ((void)0)
 # else
 # else
-void bm_cpu_to_lel(struct drbd_bitmap *b)
+static void bm_cpu_to_lel(struct drbd_bitmap *b)
 {
 {
 	/* need to cpu_to_lel all the pages ...
 	/* need to cpu_to_lel all the pages ...
 	 * this may be optimized by using
 	 * this may be optimized by using
@@ -1015,7 +1022,7 @@ unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_f
  * wants bitnr, not sector.
  * wants bitnr, not sector.
  * expected to be called for only a few bits (e - s about BITS_PER_LONG).
  * expected to be called for only a few bits (e - s about BITS_PER_LONG).
  * Must hold bitmap lock already. */
  * Must hold bitmap lock already. */
-int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
+static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
 	unsigned long e, int val, const enum km_type km)
 	unsigned long e, int val, const enum km_type km)
 {
 {
 	struct drbd_bitmap *b = mdev->bitmap;
 	struct drbd_bitmap *b = mdev->bitmap;
@@ -1053,7 +1060,7 @@ int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
  * for val != 0, we change 0 -> 1, return code positive
  * for val != 0, we change 0 -> 1, return code positive
  * for val == 0, we change 1 -> 0, return code negative
  * for val == 0, we change 1 -> 0, return code negative
  * wants bitnr, not sector */
  * wants bitnr, not sector */
-int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
+static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
 	const unsigned long e, int val)
 	const unsigned long e, int val)
 {
 {
 	unsigned long flags;
 	unsigned long flags;

+ 122 - 29
drivers/block/drbd/drbd_int.h

@@ -132,6 +132,7 @@ enum {
 	DRBD_FAULT_DT_RA = 6,	/* data read ahead */
 	DRBD_FAULT_DT_RA = 6,	/* data read ahead */
 	DRBD_FAULT_BM_ALLOC = 7,	/* bitmap allocation */
 	DRBD_FAULT_BM_ALLOC = 7,	/* bitmap allocation */
 	DRBD_FAULT_AL_EE = 8,	/* alloc ee */
 	DRBD_FAULT_AL_EE = 8,	/* alloc ee */
+	DRBD_FAULT_RECEIVE = 9, /* Changes some bytes upon receiving a [rs]data block */
 
 
 	DRBD_FAULT_MAX,
 	DRBD_FAULT_MAX,
 };
 };
@@ -208,8 +209,11 @@ enum drbd_packets {
 	P_RS_IS_IN_SYNC	      = 0x22, /* meta socket */
 	P_RS_IS_IN_SYNC	      = 0x22, /* meta socket */
 	P_SYNC_PARAM89	      = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
 	P_SYNC_PARAM89	      = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
 	P_COMPRESSED_BITMAP   = 0x24, /* compressed or otherwise encoded bitmap transfer */
 	P_COMPRESSED_BITMAP   = 0x24, /* compressed or otherwise encoded bitmap transfer */
+	/* P_CKPT_FENCE_REQ      = 0x25, * currently reserved for protocol D */
+	/* P_CKPT_DISABLE_REQ    = 0x26, * currently reserved for protocol D */
+	P_DELAY_PROBE         = 0x27, /* is used on BOTH sockets */
 
 
-	P_MAX_CMD	      = 0x25,
+	P_MAX_CMD	      = 0x28,
 	P_MAY_IGNORE	      = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
 	P_MAY_IGNORE	      = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
 	P_MAX_OPT_CMD	      = 0x101,
 	P_MAX_OPT_CMD	      = 0x101,
 
 
@@ -264,6 +268,7 @@ static inline const char *cmdname(enum drbd_packets cmd)
 		[P_CSUM_RS_REQUEST]     = "CsumRSRequest",
 		[P_CSUM_RS_REQUEST]     = "CsumRSRequest",
 		[P_RS_IS_IN_SYNC]	= "CsumRSIsInSync",
 		[P_RS_IS_IN_SYNC]	= "CsumRSIsInSync",
 		[P_COMPRESSED_BITMAP]   = "CBitmap",
 		[P_COMPRESSED_BITMAP]   = "CBitmap",
+		[P_DELAY_PROBE]         = "DelayProbe",
 		[P_MAX_CMD]	        = NULL,
 		[P_MAX_CMD]	        = NULL,
 	};
 	};
 
 
@@ -481,7 +486,8 @@ struct p_sizes {
 	u64	    u_size;  /* user requested size */
 	u64	    u_size;  /* user requested size */
 	u64	    c_size;  /* current exported size */
 	u64	    c_size;  /* current exported size */
 	u32	    max_segment_size;  /* Maximal size of a BIO */
 	u32	    max_segment_size;  /* Maximal size of a BIO */
-	u32	    queue_order_type;
+	u16	    queue_order_type;  /* not yet implemented in DRBD*/
+	u16	    dds_flags; /* use enum dds_flags here. */
 } __packed;
 } __packed;
 
 
 struct p_state {
 struct p_state {
@@ -538,6 +544,18 @@ struct p_compressed_bm {
 	u8 code[0];
 	u8 code[0];
 } __packed;
 } __packed;
 
 
+struct p_delay_probe {
+	struct p_header head;
+	u32	seq_num; /* sequence number to match the two probe packets */
+	u32	offset;	 /* usecs the probe got sent after the reference time point */
+} __packed;
+
+struct delay_probe {
+	struct list_head list;
+	unsigned int seq_num;
+	struct timeval time;
+};
+
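Each probe carries a sequence number and the microsecond offset from the sender's reference time; by matching the two probes received for the same seq_num on the meta and data sockets, the peer can estimate how far the data socket lags behind (the data_delay field added to struct drbd_conf below). A purely illustrative sketch of that arithmetic (the real handling is in drbd_receiver.c):

static int example_data_delay_usecs(const struct p_delay_probe *on_meta,
				    const struct p_delay_probe *on_data)
{
	/* both offsets are measured from the same reference time point */
	return (int)be32_to_cpu(on_data->offset) -
	       (int)be32_to_cpu(on_meta->offset);
}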
 /* DCBP: Drbd Compressed Bitmap Packet ... */
 /* DCBP: Drbd Compressed Bitmap Packet ... */
 static inline enum drbd_bitmap_code
 static inline enum drbd_bitmap_code
 DCBP_get_code(struct p_compressed_bm *p)
 DCBP_get_code(struct p_compressed_bm *p)
@@ -722,22 +740,6 @@ enum epoch_event {
 	EV_CLEANUP = 32, /* used as flag */
 	EV_CLEANUP = 32, /* used as flag */
 };
 };
 
 
-struct drbd_epoch_entry {
-	struct drbd_work    w;
-	struct drbd_conf *mdev;
-	struct bio *private_bio;
-	struct hlist_node colision;
-	sector_t sector;
-	unsigned int size;
-	struct drbd_epoch *epoch;
-
-	/* up to here, the struct layout is identical to drbd_request;
-	 * we might be able to use that to our advantage...  */
-
-	unsigned int flags;
-	u64    block_id;
-};
-
 struct drbd_wq_barrier {
 struct drbd_wq_barrier {
 	struct drbd_work w;
 	struct drbd_work w;
 	struct completion done;
 	struct completion done;
@@ -748,17 +750,49 @@ struct digest_info {
 	void *digest;
 	void *digest;
 };
 };
 
 
-/* ee flag bits */
+struct drbd_epoch_entry {
+	struct drbd_work w;
+	struct hlist_node colision;
+	struct drbd_epoch *epoch;
+	struct drbd_conf *mdev;
+	struct page *pages;
+	atomic_t pending_bios;
+	unsigned int size;
+	/* see comments on ee flag bits below */
+	unsigned long flags;
+	sector_t sector;
+	u64 block_id;
+};
+
+/* ee flag bits.
+ * While corresponding bios are in flight, the only modification will be
+ * set_bit WAS_ERROR, which has to be atomic.
+ * If no bios are in flight yet, or all have been completed,
+ * non-atomic modification to ee->flags is ok.
+ */
 enum {
 enum {
 	__EE_CALL_AL_COMPLETE_IO,
 	__EE_CALL_AL_COMPLETE_IO,
-	__EE_CONFLICT_PENDING,
 	__EE_MAY_SET_IN_SYNC,
 	__EE_MAY_SET_IN_SYNC,
+
+	/* This epoch entry closes an epoch using a barrier.
+	 * On successful completion, the epoch is released,
+	 * and the P_BARRIER_ACK is sent. */
 	__EE_IS_BARRIER,
 	__EE_IS_BARRIER,
+
+	/* In case a barrier failed,
+	 * we need to resubmit without the barrier flag. */
+	__EE_RESUBMITTED,
+
+	/* we may have several bios per epoch entry.
+	 * if any of those fail, we set this flag atomically
+	 * from the endio callback */
+	__EE_WAS_ERROR,
 };
 };
 #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
 #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
-#define EE_CONFLICT_PENDING    (1<<__EE_CONFLICT_PENDING)
 #define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
 #define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
 #define EE_IS_BARRIER          (1<<__EE_IS_BARRIER)
 #define EE_IS_BARRIER          (1<<__EE_IS_BARRIER)
+#define	EE_RESUBMITTED         (1<<__EE_RESUBMITTED)
+#define EE_WAS_ERROR           (1<<__EE_WAS_ERROR)
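As the comment block above states, the only modification allowed while bios are in flight is an atomic set of the error bit, so an endio path would mark a failed epoch entry roughly like this (illustrative; the real callbacks are in the drbd receiver/worker code):

static inline void example_mark_ee_error(struct drbd_epoch_entry *e)
{
	/* atomic: may race with other bios of the same entry completing */
	set_bit(__EE_WAS_ERROR, &e->flags);
}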
 
 
 /* global flag bits */
 /* global flag bits */
 enum {
 enum {
@@ -908,9 +942,12 @@ struct drbd_conf {
 	unsigned int ko_count;
 	unsigned int ko_count;
 	struct drbd_work  resync_work,
 	struct drbd_work  resync_work,
 			  unplug_work,
 			  unplug_work,
-			  md_sync_work;
+			  md_sync_work,
+			  delay_probe_work,
+			  uuid_work;
 	struct timer_list resync_timer;
 	struct timer_list resync_timer;
 	struct timer_list md_sync_timer;
 	struct timer_list md_sync_timer;
+	struct timer_list delay_probe_timer;
 
 
 	/* Used after attach while negotiating new disk state. */
 	/* Used after attach while negotiating new disk state. */
 	union drbd_state new_state_tmp;
 	union drbd_state new_state_tmp;
@@ -1026,6 +1063,13 @@ struct drbd_conf {
 	u64 ed_uuid; /* UUID of the exposed data */
 	u64 ed_uuid; /* UUID of the exposed data */
 	struct mutex state_mutex;
 	struct mutex state_mutex;
 	char congestion_reason;  /* Why we where congested... */
 	char congestion_reason;  /* Why we where congested... */
+	struct list_head delay_probes; /* protected by peer_seq_lock */
+	int data_delay;   /* Delay of packets on the data-sock behind meta-sock */
+	unsigned int delay_seq; /* To generate sequence numbers of delay probes */
+	struct timeval dps_time; /* delay-probes-start-time */
+	unsigned int dp_volume_last;  /* send_cnt of last delay probe */
+	int c_sync_rate; /* current resync rate after delay_probe magic */
+	atomic_t new_c_uuid;
 };
 };
 
 
 static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
 static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@ -1081,6 +1125,11 @@ enum chg_state_flags {
 	CS_ORDERED      = CS_WAIT_COMPLETE + CS_SERIALIZE,
 	CS_ORDERED      = CS_WAIT_COMPLETE + CS_SERIALIZE,
 };
 };
 
 
+enum dds_flags {
+	DDSF_FORCED    = 1,
+	DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */
+};
+
 extern void drbd_init_set_defaults(struct drbd_conf *mdev);
 extern void drbd_init_set_defaults(struct drbd_conf *mdev);
 extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
 extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
 			union drbd_state mask, union drbd_state val);
 			union drbd_state mask, union drbd_state val);
@@ -1113,7 +1162,7 @@ extern int drbd_send_protocol(struct drbd_conf *mdev);
 extern int drbd_send_uuids(struct drbd_conf *mdev);
 extern int drbd_send_uuids(struct drbd_conf *mdev);
 extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
 extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
 extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val);
 extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val);
-extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply);
+extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags);
 extern int _drbd_send_state(struct drbd_conf *mdev);
 extern int _drbd_send_state(struct drbd_conf *mdev);
 extern int drbd_send_state(struct drbd_conf *mdev);
 extern int drbd_send_state(struct drbd_conf *mdev);
 extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
 extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
@@ -1311,7 +1360,7 @@ struct bm_extent {
 #define APP_R_HSIZE 15
 #define APP_R_HSIZE 15
 
 
 extern int  drbd_bm_init(struct drbd_conf *mdev);
 extern int  drbd_bm_init(struct drbd_conf *mdev);
-extern int  drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors);
+extern int  drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits);
 extern void drbd_bm_cleanup(struct drbd_conf *mdev);
 extern void drbd_bm_cleanup(struct drbd_conf *mdev);
 extern void drbd_bm_set_all(struct drbd_conf *mdev);
 extern void drbd_bm_set_all(struct drbd_conf *mdev);
 extern void drbd_bm_clear_all(struct drbd_conf *mdev);
 extern void drbd_bm_clear_all(struct drbd_conf *mdev);
@@ -1383,7 +1432,7 @@ extern void drbd_resume_io(struct drbd_conf *mdev);
 extern char *ppsize(char *buf, unsigned long long size);
 extern char *ppsize(char *buf, unsigned long long size);
 extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int);
 extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int);
 enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
 enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
-extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, int force) __must_hold(local);
+extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local);
 extern void resync_after_online_grow(struct drbd_conf *);
 extern void resync_after_online_grow(struct drbd_conf *);
 extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
 extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
 extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
 extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
@@ -1414,7 +1463,8 @@ static inline void ov_oos_print(struct drbd_conf *mdev)
 }
 }
 
 
 
 
-extern void drbd_csum(struct drbd_conf *, struct crypto_hash *, struct bio *, void *);
+extern void drbd_csum_bio(struct drbd_conf *, struct crypto_hash *, struct bio *, void *);
+extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *, struct drbd_epoch_entry *, void *);
 /* worker callbacks */
 /* worker callbacks */
 extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int);
 extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int);
 extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int);
 extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int);
@@ -1438,6 +1488,8 @@ extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
 extern void resync_timer_fn(unsigned long data);
 extern void resync_timer_fn(unsigned long data);
 
 
 /* drbd_receiver.c */
 /* drbd_receiver.c */
+extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
+		const unsigned rw, const int fault_type);
 extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
 extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
 extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
 extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
 					    u64 id,
 					    u64 id,
@@ -1593,6 +1645,41 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
  * inline helper functions
  * inline helper functions
  *************************/
  *************************/
 
 
+/* see also page_chain_add and friends in drbd_receiver.c */
+static inline struct page *page_chain_next(struct page *page)
+{
+	return (struct page *)page_private(page);
+}
+#define page_chain_for_each(page) \
+	for (; page && ({ prefetch(page_chain_next(page)); 1; }); \
+			page = page_chain_next(page))
+#define page_chain_for_each_safe(page, n) \
+	for (; page && ({ n = page_chain_next(page); 1; }); page = n)
+
+static inline int drbd_bio_has_active_page(struct bio *bio)
+{
+	struct bio_vec *bvec;
+	int i;
+
+	__bio_for_each_segment(bvec, bio, i, 0) {
+		if (page_count(bvec->bv_page) > 1)
+			return 1;
+	}
+
+	return 0;
+}
+
+static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
+{
+	struct page *page = e->pages;
+	page_chain_for_each(page) {
+		if (page_count(page) > 1)
+			return 1;
+	}
+	return 0;
+}
+
+
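The page_chain_* iterators above walk pages linked through page_private(); a hedged sketch of a simple traversal (helper name illustrative):

static inline int example_page_chain_count(struct page *chain)
{
	struct page *page = chain;
	int n = 0;

	page_chain_for_each(page)
		n++;
	return n;
}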
 static inline void drbd_state_lock(struct drbd_conf *mdev)
 static inline void drbd_state_lock(struct drbd_conf *mdev)
 {
 {
 	wait_event(mdev->misc_wait,
 	wait_event(mdev->misc_wait,
@@ -2132,13 +2219,15 @@ static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
 		return 0;
 		return 0;
 	if (test_bit(BITMAP_IO, &mdev->flags))
 	if (test_bit(BITMAP_IO, &mdev->flags))
 		return 0;
 		return 0;
+	if (atomic_read(&mdev->new_c_uuid))
+		return 0;
 	return 1;
 	return 1;
 }
 }
 
 
 /* I'd like to use wait_event_lock_irq,
 /* I'd like to use wait_event_lock_irq,
  * but I'm not sure when it got introduced,
  * but I'm not sure when it got introduced,
  * and not sure when it has 3 or 4 arguments */
  * and not sure when it has 3 or 4 arguments */
-static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two)
+static inline void inc_ap_bio(struct drbd_conf *mdev, int count)
 {
 {
 	/* compare with after_state_ch,
 	/* compare with after_state_ch,
 	 * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */
 	 * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */
@@ -2152,6 +2241,9 @@ static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two)
 	 * to avoid races with the reconnect code,
 	 * to avoid races with the reconnect code,
 	 * we need to atomic_inc within the spinlock. */
 	 * we need to atomic_inc within the spinlock. */
 
 
+	if (atomic_read(&mdev->new_c_uuid) && atomic_add_unless(&mdev->new_c_uuid, -1, 1))
+		drbd_queue_work_front(&mdev->data.work, &mdev->uuid_work);
+
 	spin_lock_irq(&mdev->req_lock);
 	spin_lock_irq(&mdev->req_lock);
 	while (!__inc_ap_bio_cond(mdev)) {
 	while (!__inc_ap_bio_cond(mdev)) {
 		prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
 		prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
@@ -2160,7 +2252,7 @@ static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two)
 		finish_wait(&mdev->misc_wait, &wait);
 		finish_wait(&mdev->misc_wait, &wait);
 		spin_lock_irq(&mdev->req_lock);
 		spin_lock_irq(&mdev->req_lock);
 	}
 	}
-	atomic_add(one_or_two, &mdev->ap_bio_cnt);
+	atomic_add(count, &mdev->ap_bio_cnt);
 	spin_unlock_irq(&mdev->req_lock);
 	spin_unlock_irq(&mdev->req_lock);
 }
 }
 
 
@@ -2251,7 +2343,8 @@ static inline void drbd_md_flush(struct drbd_conf *mdev)
 	if (test_bit(MD_NO_BARRIER, &mdev->flags))
 	if (test_bit(MD_NO_BARRIER, &mdev->flags))
 		return;
 		return;
 
 
-	r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL);
+	r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL,
+			BLKDEV_IFL_WAIT);
 	if (r) {
 	if (r) {
 		set_bit(MD_NO_BARRIER, &mdev->flags);
 		set_bit(MD_NO_BARRIER, &mdev->flags);
 		dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
 		dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);

+ 139 - 19
drivers/block/drbd/drbd_main.c

@@ -684,6 +684,9 @@ static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
 	else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
 	else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
 		rv = SS_NO_REMOTE_DISK;
 		rv = SS_NO_REMOTE_DISK;
 
 
+	else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
+		rv = SS_NO_UP_TO_DATE_DISK;
+
 	else if ((ns.conn == C_CONNECTED ||
 	else if ((ns.conn == C_CONNECTED ||
 		  ns.conn == C_WF_BITMAP_S ||
 		  ns.conn == C_WF_BITMAP_S ||
 		  ns.conn == C_SYNC_SOURCE ||
 		  ns.conn == C_SYNC_SOURCE ||
@@ -840,7 +843,12 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
 			break;
 			break;
 		case C_WF_BITMAP_S:
 		case C_WF_BITMAP_S:
 		case C_PAUSED_SYNC_S:
 		case C_PAUSED_SYNC_S:
-			ns.pdsk = D_OUTDATED;
+			/* remap any consistent state to D_OUTDATED,
+			 * but disallow "upgrade" of not even consistent states.
+			 */
+			ns.pdsk =
+				(D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED)
+				? os.pdsk : D_OUTDATED;
 			break;
 			break;
 		case C_SYNC_SOURCE:
 		case C_SYNC_SOURCE:
 			ns.pdsk = D_INCONSISTENT;
 			ns.pdsk = D_INCONSISTENT;
@@ -1205,21 +1213,20 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	&&  (ns.pdsk < D_INCONSISTENT ||
 	&&  (ns.pdsk < D_INCONSISTENT ||
 	     ns.pdsk == D_UNKNOWN ||
 	     ns.pdsk == D_UNKNOWN ||
 	     ns.pdsk == D_OUTDATED)) {
 	     ns.pdsk == D_OUTDATED)) {
-		kfree(mdev->p_uuid);
-		mdev->p_uuid = NULL;
 		if (get_ldev(mdev)) {
 		if (get_ldev(mdev)) {
 			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
 			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
-			    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
-				drbd_uuid_new_current(mdev);
-				drbd_send_uuids(mdev);
-			}
+			    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE &&
+			    !atomic_read(&mdev->new_c_uuid))
+				atomic_set(&mdev->new_c_uuid, 2);
 			put_ldev(mdev);
 			put_ldev(mdev);
 		}
 		}
 	}
 	}
 
 
 	if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
 	if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
-		if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0)
-			drbd_uuid_new_current(mdev);
+		/* Diskless peer becomes primary, or we got connected to a diskless, primary peer. */
+		if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0 &&
+		    !atomic_read(&mdev->new_c_uuid))
+			atomic_set(&mdev->new_c_uuid, 2);
 
 
 		/* D_DISKLESS Peer becomes secondary */
 		/* D_DISKLESS Peer becomes secondary */
 		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
 		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
@@ -1232,7 +1239,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	    os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
 	    os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
 		kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */
 		kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */
 		mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */
 		mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */
-		drbd_send_sizes(mdev, 0);  /* to start sync... */
+		drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
 		drbd_send_uuids(mdev);
 		drbd_send_uuids(mdev);
 		drbd_send_state(mdev);
 		drbd_send_state(mdev);
 	}
 	}
@@ -1343,6 +1350,24 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	drbd_md_sync(mdev);
 	drbd_md_sync(mdev);
 }
 }
 
 
+static int w_new_current_uuid(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	if (get_ldev(mdev)) {
+		if (mdev->ldev->md.uuid[UI_BITMAP] == 0) {
+			drbd_uuid_new_current(mdev);
+			if (get_net_conf(mdev)) {
+				drbd_send_uuids(mdev);
+				put_net_conf(mdev);
+			}
+			drbd_md_sync(mdev);
+		}
+		put_ldev(mdev);
+	}
+	atomic_dec(&mdev->new_c_uuid);
+	wake_up(&mdev->misc_wait);
+
+	return 1;
+}
 
 
 static int drbd_thread_setup(void *arg)
 static int drbd_thread_setup(void *arg)
 {
 {
@@ -1755,7 +1780,7 @@ int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
 			     (struct p_header *)&p, sizeof(p));
 }
 
-int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply)
+int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
 {
 	struct p_sizes p;
 	sector_t d_size, u_size;
@@ -1767,7 +1792,6 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply)
 		d_size = drbd_get_max_capacity(mdev->ldev);
 		u_size = mdev->ldev->dc.disk_size;
 		q_order_type = drbd_queue_order_type(mdev);
-		p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev));
 		put_ldev(mdev);
 	} else {
 		d_size = 0;
@@ -1779,7 +1803,8 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply)
 	p.u_size = cpu_to_be64(u_size);
 	p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
 	p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
-	p.queue_order_type = cpu_to_be32(q_order_type);
+	p.queue_order_type = cpu_to_be16(q_order_type);
+	p.dds_flags = cpu_to_be16(flags);
 
 	ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
 			   (struct p_header *)&p, sizeof(p));
@@ -2180,6 +2205,43 @@ int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
 	return ok;
 }
 
+static int drbd_send_delay_probe(struct drbd_conf *mdev, struct drbd_socket *ds)
+{
+	struct p_delay_probe dp;
+	int offset, ok = 0;
+	struct timeval now;
+
+	mutex_lock(&ds->mutex);
+	if (likely(ds->socket)) {
+		do_gettimeofday(&now);
+		offset = now.tv_usec - mdev->dps_time.tv_usec +
+			 (now.tv_sec - mdev->dps_time.tv_sec) * 1000000;
+		dp.seq_num  = cpu_to_be32(mdev->delay_seq);
+		dp.offset   = cpu_to_be32(offset);
+
+		ok = _drbd_send_cmd(mdev, ds->socket, P_DELAY_PROBE,
+				    (struct p_header *)&dp, sizeof(dp), 0);
+	}
+	mutex_unlock(&ds->mutex);
+
+	return ok;
+}
+
+static int drbd_send_delay_probes(struct drbd_conf *mdev)
+{
+	int ok;
+
+	mdev->delay_seq++;
+	do_gettimeofday(&mdev->dps_time);
+	ok = drbd_send_delay_probe(mdev, &mdev->meta);
+	ok = ok && drbd_send_delay_probe(mdev, &mdev->data);
+
+	mdev->dp_volume_last = mdev->send_cnt;
+	mod_timer(&mdev->delay_probe_timer, jiffies + mdev->sync_conf.dp_interval * HZ / 10);
+
+	return ok;
+}
+
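
The offset carried by each probe is simply "microseconds elapsed since the probe pair was stamped in drbd_send_delay_probes()". A minimal helper expressing the same arithmetic (illustrative only, not part of the patch):

	static inline int tv_usec_diff(const struct timeval *then, const struct timeval *now)
	{
		/* microseconds from *then to *now; assumes the span fits in an int */
		return (now->tv_sec - then->tv_sec) * 1000000 +
		       (now->tv_usec - then->tv_usec);
	}

The receiving side subtracts this offset from the arrival time again, so time the probe spent waiting for the socket mutex on the sender essentially does not count towards the measured delay.
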
 /* called on sndtimeo
  * returns FALSE if we should retry,
  * TRUE if we think connection is dead
@@ -2309,6 +2371,44 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
 	return 1;
 }
 
+static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
+{
+	struct page *page = e->pages;
+	unsigned len = e->size;
+	page_chain_for_each(page) {
+		unsigned l = min_t(unsigned, len, PAGE_SIZE);
+		if (!_drbd_send_page(mdev, page, 0, l))
+			return 0;
+		len -= l;
+	}
+	return 1;
+}
+
+static void consider_delay_probes(struct drbd_conf *mdev)
+{
+	if (mdev->state.conn != C_SYNC_SOURCE || mdev->agreed_pro_version < 93)
+		return;
+
+	if (mdev->dp_volume_last + mdev->sync_conf.dp_volume * 2 < mdev->send_cnt)
+		drbd_send_delay_probes(mdev);
+}
+
+static int w_delay_probes(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	if (!cancel && mdev->state.conn == C_SYNC_SOURCE)
+		drbd_send_delay_probes(mdev);
+
+	return 1;
+}
+
+static void delay_probe_timer_fn(unsigned long data)
+{
+	struct drbd_conf *mdev = (struct drbd_conf *) data;
+
+	if (list_empty(&mdev->delay_probe_work.list))
+		drbd_queue_work(&mdev->data.work, &mdev->delay_probe_work);
+}
+
 /* Used to send write requests
  * R_PRIMARY -> Peer	(P_DATA)
  */
@@ -2360,7 +2460,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
 		drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE));
 	if (ok && dgs) {
 		dgb = mdev->int_dig_out;
-		drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
+		drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
 		ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
 	}
 	if (ok) {
@@ -2371,6 +2471,10 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
 	}
 
 	drbd_put_data_sock(mdev);
+
+	if (ok)
+		consider_delay_probes(mdev);
+
 	return ok;
 }
 
@@ -2409,13 +2513,17 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
 					sizeof(p), MSG_MORE);
 	if (ok && dgs) {
 		dgb = mdev->int_dig_out;
-		drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb);
+		drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
 		ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
 	}
 	if (ok)
-		ok = _drbd_send_zc_bio(mdev, e->private_bio);
+		ok = _drbd_send_zc_ee(mdev, e);
 
 	drbd_put_data_sock(mdev);
+
+	if (ok)
+		consider_delay_probes(mdev);
+
 	return ok;
 }
 
@@ -2600,6 +2708,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
 	atomic_set(&mdev->net_cnt, 0);
 	atomic_set(&mdev->packet_seq, 0);
 	atomic_set(&mdev->pp_in_use, 0);
+	atomic_set(&mdev->new_c_uuid, 0);
 
 	mutex_init(&mdev->md_io_mutex);
 	mutex_init(&mdev->data.mutex);
@@ -2628,16 +2737,26 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
 	INIT_LIST_HEAD(&mdev->unplug_work.list);
 	INIT_LIST_HEAD(&mdev->md_sync_work.list);
 	INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
+	INIT_LIST_HEAD(&mdev->delay_probes);
+	INIT_LIST_HEAD(&mdev->delay_probe_work.list);
+	INIT_LIST_HEAD(&mdev->uuid_work.list);
+
 	mdev->resync_work.cb  = w_resync_inactive;
 	mdev->unplug_work.cb  = w_send_write_hint;
 	mdev->md_sync_work.cb = w_md_sync;
 	mdev->bm_io_work.w.cb = w_bitmap_io;
+	mdev->delay_probe_work.cb = w_delay_probes;
+	mdev->uuid_work.cb = w_new_current_uuid;
 	init_timer(&mdev->resync_timer);
 	init_timer(&mdev->md_sync_timer);
+	init_timer(&mdev->delay_probe_timer);
 	mdev->resync_timer.function = resync_timer_fn;
 	mdev->resync_timer.data = (unsigned long) mdev;
 	mdev->md_sync_timer.function = md_sync_timer_fn;
 	mdev->md_sync_timer.data = (unsigned long) mdev;
+	mdev->delay_probe_timer.function = delay_probe_timer_fn;
+	mdev->delay_probe_timer.data = (unsigned long) mdev;
+
 
 	init_waitqueue_head(&mdev->misc_wait);
 	init_waitqueue_head(&mdev->state_wait);
@@ -2680,7 +2799,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
 	drbd_set_my_capacity(mdev, 0);
 	if (mdev->bitmap) {
 		/* maybe never allocated. */
-		drbd_bm_resize(mdev, 0);
+		drbd_bm_resize(mdev, 0, 1);
 		drbd_bm_cleanup(mdev);
 	}
 
@@ -3129,7 +3248,7 @@ int __init drbd_init(void)
 	if (err)
 		goto Enomem;
 
-	drbd_proc = proc_create("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops);
+	drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
 	if (!drbd_proc)	{
 		printk(KERN_ERR "drbd: unable to register proc file\n");
 		goto Enomem;
@@ -3660,7 +3779,8 @@ _drbd_fault_str(unsigned int type) {
 		[DRBD_FAULT_DT_RD] = "Data read",
 		[DRBD_FAULT_DT_RA] = "Data read ahead",
 		[DRBD_FAULT_BM_ALLOC] = "BM allocation",
-		[DRBD_FAULT_AL_EE] = "EE allocation"
+		[DRBD_FAULT_AL_EE] = "EE allocation",
+		[DRBD_FAULT_RECEIVE] = "receive data corruption",
 	};
 
 	return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
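
The new DRBD_FAULT_RECEIVE entry pairs with the fault-injection hook added to read_in_block() in drbd_receiver.c below, which flips all bits of the first word of a received block (data[0] ^= -1) so the integrity-digest failure path can be exercised deliberately.
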

+ 30 - 22
drivers/block/drbd/drbd_nl.c

@@ -510,7 +510,7 @@ void drbd_resume_io(struct drbd_conf *mdev)
  * Returns 0 on success, negative return values indicate errors.
  * You should call drbd_md_sync() after calling this function.
  */
-enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, int force) __must_hold(local)
+enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local)
 {
 	sector_t prev_first_sect, prev_size; /* previous meta location */
 	sector_t la_size;
@@ -541,12 +541,12 @@ enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, int force
 	/* TODO: should only be some assert here, not (re)init... */
 	drbd_md_set_sector_offsets(mdev, mdev->ldev);
 
-	size = drbd_new_dev_size(mdev, mdev->ldev, force);
+	size = drbd_new_dev_size(mdev, mdev->ldev, flags & DDSF_FORCED);
 
 	if (drbd_get_capacity(mdev->this_bdev) != size ||
 	    drbd_bm_capacity(mdev) != size) {
 		int err;
-		err = drbd_bm_resize(mdev, size);
+		err = drbd_bm_resize(mdev, size, !(flags & DDSF_NO_RESYNC));
 		if (unlikely(err)) {
 			/* currently there is only one error: ENOMEM! */
 			size = drbd_bm_capacity(mdev)>>1;
@@ -704,9 +704,6 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu
 	struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
 	int max_segments = mdev->ldev->dc.max_bio_bvecs;
 
-	if (b->merge_bvec_fn && !mdev->ldev->dc.use_bmbv)
-		max_seg_s = PAGE_SIZE;
-
 	max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s);
 
 	blk_queue_max_hw_sectors(q, max_seg_s >> 9);
@@ -1199,13 +1196,12 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 	}
 
 	/* allocation not in the IO path, cqueue thread context */
-	new_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
+	new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
 	if (!new_conf) {
 		retcode = ERR_NOMEM;
 		goto fail;
 	}
 
-	memset(new_conf, 0, sizeof(struct net_conf));
 	new_conf->timeout	   = DRBD_TIMEOUT_DEF;
 	new_conf->try_connect_int  = DRBD_CONNECT_INT_DEF;
 	new_conf->ping_int	   = DRBD_PING_INT_DEF;
@@ -1477,8 +1473,8 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 {
 	struct resize rs;
 	int retcode = NO_ERROR;
-	int ldsc = 0; /* local disk size changed */
 	enum determine_dev_size dd;
+	enum dds_flags ddsf;
 
 	memset(&rs, 0, sizeof(struct resize));
 	if (!resize_from_tags(mdev, nlp->tag_list, &rs)) {
@@ -1502,13 +1498,17 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 		goto fail;
 	}
 
-	if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
-		mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
-		ldsc = 1;
+	if (rs.no_resync && mdev->agreed_pro_version < 93) {
+		retcode = ERR_NEED_APV_93;
+		goto fail;
 	}
 
+	if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
+		mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
+
 	mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
-	dd = drbd_determin_dev_size(mdev, rs.resize_force);
+	ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
+	dd = drbd_determin_dev_size(mdev, ddsf);
 	drbd_md_sync(mdev);
 	put_ldev(mdev);
 	if (dd == dev_size_error) {
@@ -1516,12 +1516,12 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 		goto fail;
 	}
 
-	if (mdev->state.conn == C_CONNECTED && (dd != unchanged || ldsc)) {
+	if (mdev->state.conn == C_CONNECTED) {
 		if (dd == grew)
 			set_bit(RESIZE_PENDING, &mdev->flags);
 
 		drbd_send_uuids(mdev);
-		drbd_send_sizes(mdev, 1);
+		drbd_send_sizes(mdev, 1, ddsf);
 	}
 
  fail:
@@ -1551,6 +1551,10 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
 		sc.rate       = DRBD_RATE_DEF;
 		sc.after      = DRBD_AFTER_DEF;
 		sc.al_extents = DRBD_AL_EXTENTS_DEF;
+		sc.dp_volume  = DRBD_DP_VOLUME_DEF;
+		sc.dp_interval = DRBD_DP_INTERVAL_DEF;
+		sc.throttle_th = DRBD_RS_THROTTLE_TH_DEF;
+		sc.hold_off_th = DRBD_RS_HOLD_OFF_TH_DEF;
 	} else
 		memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
 
@@ -2207,9 +2211,9 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
 {
 	struct cn_msg *cn_reply;
 	struct drbd_nl_cfg_reply *reply;
-	struct bio_vec *bvec;
 	unsigned short *tl;
-	int i;
+	struct page *page;
+	unsigned len;
 
 	if (!e)
 		return;
@@ -2247,11 +2251,15 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
 	put_unaligned(T_ee_data, tl++);
 	put_unaligned(e->size, tl++);
 
-	__bio_for_each_segment(bvec, e->private_bio, i, 0) {
-		void *d = kmap(bvec->bv_page);
-		memcpy(tl, d + bvec->bv_offset, bvec->bv_len);
-		kunmap(bvec->bv_page);
-		tl=(unsigned short*)((char*)tl + bvec->bv_len);
+	len = e->size;
+	page = e->pages;
+	page_chain_for_each(page) {
+		void *d = kmap_atomic(page, KM_USER0);
+		unsigned l = min_t(unsigned, len, PAGE_SIZE);
+		memcpy(tl, d, l);
+		kunmap_atomic(d, KM_USER0);
+		tl = (unsigned short*)((char*)tl + l);
+		len -= l;
 	}
 	put_unaligned(TT_END, tl++); /* Close the tag list */
 
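
Because the payload now lives in a page chain rather than a bio, the broadcast helper copies it out in at most PAGE_SIZE pieces, counting len down to zero across the loop; with 4 KiB pages, an e->size of 9000 bytes is emitted as 4096 + 4096 + 808 bytes.
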

+ 17 - 2
drivers/block/drbd/drbd_proc.c

@@ -73,14 +73,21 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
 	seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10);
 	/* if more than 1 GB display in MB */
 	if (mdev->rs_total > 0x100000L)
-		seq_printf(seq, "(%lu/%lu)M\n\t",
+		seq_printf(seq, "(%lu/%lu)M",
 			    (unsigned long) Bit2KB(rs_left >> 10),
 			    (unsigned long) Bit2KB(mdev->rs_total >> 10));
 	else
-		seq_printf(seq, "(%lu/%lu)K\n\t",
+		seq_printf(seq, "(%lu/%lu)K",
 			    (unsigned long) Bit2KB(rs_left),
 			    (unsigned long) Bit2KB(mdev->rs_total));
 
+	if (mdev->state.conn == C_SYNC_TARGET)
+		seq_printf(seq, " queue_delay: %d.%d ms\n\t",
+			   mdev->data_delay / 1000,
+			   (mdev->data_delay % 1000) / 100);
+	else if (mdev->state.conn == C_SYNC_SOURCE)
+		seq_printf(seq, " delay_probe: %u\n\t", mdev->delay_seq);
+
 	/* see drivers/md/md.c
 	 * We do not want to overflow, so the order of operands and
 	 * the * 100 / 100 trick are important. We do a +1 to be
@@ -128,6 +135,14 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
 	else
 		seq_printf(seq, " (%ld)", dbdt);
 
+	if (mdev->state.conn == C_SYNC_TARGET) {
+		if (mdev->c_sync_rate > 1000)
+			seq_printf(seq, " want: %d,%03d",
+				   mdev->c_sync_rate / 1000, mdev->c_sync_rate % 1000);
+		else
+			seq_printf(seq, " want: %d", mdev->c_sync_rate);
+	}
+
 	seq_printf(seq, " K/sec\n");
 }
 
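
The queue_delay value is kept in microseconds; dividing by 1000 and taking (x % 1000) / 100 yields one decimal of milliseconds, so a data_delay of 12345 is shown as "queue_delay: 12.3 ms". Likewise, a c_sync_rate of 250000 K/sec is printed as "want: 250,000" on the SyncTarget line.
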

+ 428 - 238
drivers/block/drbd/drbd_receiver.c

@@ -80,30 +80,128 @@ static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epo
 
 #define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)
 
-static struct page *drbd_pp_first_page_or_try_alloc(struct drbd_conf *mdev)
+/*
+ * some helper functions to deal with singly linked page lists,
+ * page->private being our "next" pointer.
+ */
+
+/* If at least n pages are linked at head, get n pages off.
+ * Otherwise, don't modify head, and return NULL.
+ * Locking is the responsibility of the caller.
+ */
+static struct page *page_chain_del(struct page **head, int n)
+{
+	struct page *page;
+	struct page *tmp;
+
+	BUG_ON(!n);
+	BUG_ON(!head);
+
+	page = *head;
+
+	if (!page)
+		return NULL;
+
+	while (page) {
+		tmp = page_chain_next(page);
+		if (--n == 0)
+			break; /* found sufficient pages */
+		if (tmp == NULL)
+			/* insufficient pages, don't use any of them. */
+			return NULL;
+		page = tmp;
+	}
+
+	/* add end of list marker for the returned list */
+	set_page_private(page, 0);
+	/* actual return value, and adjustment of head */
+	page = *head;
+	*head = tmp;
+	return page;
+}
+
+/* may be used outside of locks to find the tail of a (usually short)
+ * "private" page chain, before adding it back to a global chain head
+ * with page_chain_add() under a spinlock. */
+static struct page *page_chain_tail(struct page *page, int *len)
+{
+	struct page *tmp;
+	int i = 1;
+	while ((tmp = page_chain_next(page)))
+		++i, page = tmp;
+	if (len)
+		*len = i;
+	return page;
+}
+
+static int page_chain_free(struct page *page)
+{
+	struct page *tmp;
+	int i = 0;
+	page_chain_for_each_safe(page, tmp) {
+		put_page(page);
+		++i;
+	}
+	return i;
+}
+
+static void page_chain_add(struct page **head,
+		struct page *chain_first, struct page *chain_last)
+{
+#if 1
+	struct page *tmp;
+	tmp = page_chain_tail(chain_first, NULL);
+	BUG_ON(tmp != chain_last);
+#endif
+
+	/* add chain to head */
+	set_page_private(chain_last, (unsigned long)*head);
+	*head = chain_first;
+}
+
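
The helpers above and below lean on page_chain_next(), page_chain_for_each() and page_chain_for_each_safe(), which this hunk does not define; they come from elsewhere in the patch set (presumably drbd_int.h). A minimal sketch consistent with how they are used here, treating page->private as the "next" pointer:

	#define page_chain_next(page)	((struct page *)page_private(page))
	#define page_chain_for_each(page) \
		for (; page; page = page_chain_next(page))
	#define page_chain_for_each_safe(page, n) \
		for (; page && ((n = page_chain_next(page)), 1); page = n)

The _safe variant fetches the next pointer before the loop body runs, so the body may put_page() the current page, as page_chain_free() does.
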
+static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int number)
 {
 	struct page *page = NULL;
+	struct page *tmp = NULL;
+	int i = 0;
 
 	/* Yes, testing drbd_pp_vacant outside the lock is racy.
 	 * So what. It saves a spin_lock. */
-	if (drbd_pp_vacant > 0) {
+	if (drbd_pp_vacant >= number) {
 		spin_lock(&drbd_pp_lock);
-		page = drbd_pp_pool;
-		if (page) {
-			drbd_pp_pool = (struct page *)page_private(page);
-			set_page_private(page, 0); /* just to be polite */
-			drbd_pp_vacant--;
-		}
+		page = page_chain_del(&drbd_pp_pool, number);
+		if (page)
+			drbd_pp_vacant -= number;
 		spin_unlock(&drbd_pp_lock);
+		if (page)
+			return page;
 	}
+
 	/* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
 	 * "criss-cross" setup, that might cause write-out on some other DRBD,
 	 * which in turn might block on the other node at this very place.  */
-	if (!page)
-		page = alloc_page(GFP_TRY);
-	if (page)
-		atomic_inc(&mdev->pp_in_use);
-	return page;
+	for (i = 0; i < number; i++) {
+		tmp = alloc_page(GFP_TRY);
+		if (!tmp)
+			break;
+		set_page_private(tmp, (unsigned long)page);
+		page = tmp;
+	}
+
+	if (i == number)
+		return page;
+
+	/* Not enough pages immediately available this time.
+	 * No need to jump around here, drbd_pp_alloc will retry this
+	 * function "soon". */
+	if (page) {
+		tmp = page_chain_tail(page, NULL);
+		spin_lock(&drbd_pp_lock);
+		page_chain_add(&drbd_pp_pool, page, tmp);
+		drbd_pp_vacant += i;
+		spin_unlock(&drbd_pp_lock);
+	}
+	return NULL;
 }
 
 /* kick lower level device, if we have more than (arbitrary number)
@@ -127,7 +225,7 @@ static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed
 
 	list_for_each_safe(le, tle, &mdev->net_ee) {
 		e = list_entry(le, struct drbd_epoch_entry, w.list);
-		if (drbd_bio_has_active_page(e->private_bio))
+		if (drbd_ee_has_active_page(e))
 			break;
 		list_move(le, to_be_freed);
 	}
@@ -148,32 +246,34 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
 }
 
 /**
- * drbd_pp_alloc() - Returns a page, fails only if a signal comes in
+ * drbd_pp_alloc() - Returns @number pages, retries forever (or until signalled)
  * @mdev:	DRBD device.
- * @retry:	whether or not to retry allocation forever (or until signalled)
+ * @number:	number of pages requested
+ * @retry:	whether to retry, if not enough pages are available right now
+ *
+ * Tries to allocate number pages, first from our own page pool, then from
+ * the kernel, unless this allocation would exceed the max_buffers setting.
+ * Possibly retry until DRBD frees sufficient pages somewhere else.
  *
- * Tries to allocate a page, first from our own page pool, then from the
- * kernel, unless this allocation would exceed the max_buffers setting.
- * If @retry is non-zero, retry until DRBD frees a page somewhere else.
+ * Returns a page chain linked via page->private.
  */
-static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry)
+static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool retry)
 {
 	struct page *page = NULL;
 	DEFINE_WAIT(wait);
 
-	if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
-		page = drbd_pp_first_page_or_try_alloc(mdev);
-		if (page)
-			return page;
-	}
+	/* Yes, we may run up to @number over max_buffers. If we
+	 * follow it strictly, the admin will get it wrong anyways. */
+	if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers)
+		page = drbd_pp_first_pages_or_try_alloc(mdev, number);
 
 
-	for (;;) {
+	while (page == NULL) {
 		prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
 		prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
 
 
 		drbd_kick_lo_and_reclaim_net(mdev);
 		drbd_kick_lo_and_reclaim_net(mdev);
 
 
 		if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
 		if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
-			page = drbd_pp_first_page_or_try_alloc(mdev);
+			page = drbd_pp_first_pages_or_try_alloc(mdev, number);
 			if (page)
 			if (page)
 				break;
 				break;
 		}
 		}
@@ -190,62 +290,32 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry)
 	}
 	}
 	finish_wait(&drbd_pp_wait, &wait);
 	finish_wait(&drbd_pp_wait, &wait);
 
 
+	if (page)
+		atomic_add(number, &mdev->pp_in_use);
 	return page;
 	return page;
 }
 }
 
 
 /* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
 /* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
- * Is also used from inside an other spin_lock_irq(&mdev->req_lock) */
+ * Is also used from inside an other spin_lock_irq(&mdev->req_lock);
+ * Either links the page chain back to the global pool,
+ * or returns all pages to the system. */
 static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
 static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
 {
 {
-	int free_it;
-
-	spin_lock(&drbd_pp_lock);
-	if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) {
-		free_it = 1;
-	} else {
-		set_page_private(page, (unsigned long)drbd_pp_pool);
-		drbd_pp_pool = page;
-		drbd_pp_vacant++;
-		free_it = 0;
-	}
-	spin_unlock(&drbd_pp_lock);
-
-	atomic_dec(&mdev->pp_in_use);
-
-	if (free_it)
-		__free_page(page);
-
-	wake_up(&drbd_pp_wait);
-}
-
-static void drbd_pp_free_bio_pages(struct drbd_conf *mdev, struct bio *bio)
-{
-	struct page *p_to_be_freed = NULL;
-	struct page *page;
-	struct bio_vec *bvec;
 	int i;
 	int i;
-
-	spin_lock(&drbd_pp_lock);
-	__bio_for_each_segment(bvec, bio, i, 0) {
-		if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) {
-			set_page_private(bvec->bv_page, (unsigned long)p_to_be_freed);
-			p_to_be_freed = bvec->bv_page;
-		} else {
-			set_page_private(bvec->bv_page, (unsigned long)drbd_pp_pool);
-			drbd_pp_pool = bvec->bv_page;
-			drbd_pp_vacant++;
-		}
-	}
-	spin_unlock(&drbd_pp_lock);
-	atomic_sub(bio->bi_vcnt, &mdev->pp_in_use);
-
-	while (p_to_be_freed) {
-		page = p_to_be_freed;
-		p_to_be_freed = (struct page *)page_private(page);
-		set_page_private(page, 0); /* just to be polite */
-		put_page(page);
+	if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count)
+		i = page_chain_free(page);
+	else {
+		struct page *tmp;
+		tmp = page_chain_tail(page, &i);
+		spin_lock(&drbd_pp_lock);
+		page_chain_add(&drbd_pp_pool, page, tmp);
+		drbd_pp_vacant += i;
+		spin_unlock(&drbd_pp_lock);
 	}
 	}
-
+	atomic_sub(i, &mdev->pp_in_use);
+	i = atomic_read(&mdev->pp_in_use);
+	if (i < 0)
+		dev_warn(DEV, "ASSERTION FAILED: pp_in_use: %d < 0\n", i);
 	wake_up(&drbd_pp_wait);
 	wake_up(&drbd_pp_wait);
 }
 }
 
 
@@ -270,11 +340,9 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
 				     unsigned int data_size,
 				     unsigned int data_size,
 				     gfp_t gfp_mask) __must_hold(local)
 				     gfp_t gfp_mask) __must_hold(local)
 {
 {
-	struct request_queue *q;
 	struct drbd_epoch_entry *e;
 	struct drbd_epoch_entry *e;
 	struct page *page;
 	struct page *page;
-	struct bio *bio;
-	unsigned int ds;
+	unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
 
 
 	if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE))
 	if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE))
 		return NULL;
 		return NULL;
@@ -286,84 +354,32 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
 		return NULL;
 		return NULL;
 	}
 	}
 
 
-	bio = bio_alloc(gfp_mask & ~__GFP_HIGHMEM, div_ceil(data_size, PAGE_SIZE));
-	if (!bio) {
-		if (!(gfp_mask & __GFP_NOWARN))
-			dev_err(DEV, "alloc_ee: Allocation of a bio failed\n");
-		goto fail1;
-	}
-
-	bio->bi_bdev = mdev->ldev->backing_bdev;
-	bio->bi_sector = sector;
-
-	ds = data_size;
-	while (ds) {
-		page = drbd_pp_alloc(mdev, (gfp_mask & __GFP_WAIT));
-		if (!page) {
-			if (!(gfp_mask & __GFP_NOWARN))
-				dev_err(DEV, "alloc_ee: Allocation of a page failed\n");
-			goto fail2;
-		}
-		if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) {
-			drbd_pp_free(mdev, page);
-			dev_err(DEV, "alloc_ee: bio_add_page(s=%llu,"
-			    "data_size=%u,ds=%u) failed\n",
-			    (unsigned long long)sector, data_size, ds);
-
-			q = bdev_get_queue(bio->bi_bdev);
-			if (q->merge_bvec_fn) {
-				struct bvec_merge_data bvm = {
-					.bi_bdev = bio->bi_bdev,
-					.bi_sector = bio->bi_sector,
-					.bi_size = bio->bi_size,
-					.bi_rw = bio->bi_rw,
-				};
-				int l = q->merge_bvec_fn(q, &bvm,
-						&bio->bi_io_vec[bio->bi_vcnt]);
-				dev_err(DEV, "merge_bvec_fn() = %d\n", l);
-			}
-
-			/* dump more of the bio. */
-			dev_err(DEV, "bio->bi_max_vecs = %d\n", bio->bi_max_vecs);
-			dev_err(DEV, "bio->bi_vcnt = %d\n", bio->bi_vcnt);
-			dev_err(DEV, "bio->bi_size = %d\n", bio->bi_size);
-			dev_err(DEV, "bio->bi_phys_segments = %d\n", bio->bi_phys_segments);
-
-			goto fail2;
-			break;
-		}
-		ds -= min_t(int, ds, PAGE_SIZE);
-	}
-
-	D_ASSERT(data_size == bio->bi_size);
-
-	bio->bi_private = e;
-	e->mdev = mdev;
-	e->sector = sector;
-	e->size = bio->bi_size;
+	page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
+	if (!page)
+		goto fail;
 
 
-	e->private_bio = bio;
-	e->block_id = id;
 	INIT_HLIST_NODE(&e->colision);
 	e->epoch = NULL;
+	e->mdev = mdev;
+	e->pages = page;
+	atomic_set(&e->pending_bios, 0);
+	e->size = data_size;
 	e->flags = 0;
+	e->sector = sector;
+	e->block_id = id;
 
 	return e;
 
- fail2:
-	drbd_pp_free_bio_pages(mdev, bio);
-	bio_put(bio);
- fail1:
+ fail:
 	mempool_free(e, drbd_ee_mempool);
 	mempool_free(e, drbd_ee_mempool);
-
 	return NULL;
 	return NULL;
 }
 }
 
 
 void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
 void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
 {
 {
-	struct bio *bio = e->private_bio;
-	drbd_pp_free_bio_pages(mdev, bio);
-	bio_put(bio);
+	drbd_pp_free(mdev, e->pages);
+	D_ASSERT(atomic_read(&e->pending_bios) == 0);
 	D_ASSERT(hlist_unhashed(&e->colision));
 	D_ASSERT(hlist_unhashed(&e->colision));
 	mempool_free(e, drbd_ee_mempool);
 	mempool_free(e, drbd_ee_mempool);
 }
 }
@@ -902,7 +918,7 @@ retry:
 	if (!drbd_send_protocol(mdev))
 	if (!drbd_send_protocol(mdev))
 		return -1;
 		return -1;
 	drbd_send_sync_param(mdev, &mdev->sync_conf);
 	drbd_send_sync_param(mdev, &mdev->sync_conf);
-	drbd_send_sizes(mdev, 0);
+	drbd_send_sizes(mdev, 0, 0);
 	drbd_send_uuids(mdev);
 	drbd_send_uuids(mdev);
 	drbd_send_state(mdev);
 	drbd_send_state(mdev);
 	clear_bit(USE_DEGR_WFC_T, &mdev->flags);
 	clear_bit(USE_DEGR_WFC_T, &mdev->flags);
@@ -946,7 +962,8 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d
 	int rv;
 	int rv;
 
 
 	if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
 	if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
-		rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL);
+		rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
+					NULL, BLKDEV_IFL_WAIT);
 		if (rv) {
 		if (rv) {
 			dev_err(DEV, "local disk flush failed with status %d\n", rv);
 			dev_err(DEV, "local disk flush failed with status %d\n", rv);
 			/* would rather check on EOPNOTSUPP, but that is not reliable.
 			/* would rather check on EOPNOTSUPP, but that is not reliable.
@@ -1119,6 +1136,101 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo)
 		dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
 		dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
 }
 }
 
 
+/**
+ * drbd_submit_ee()
+ * @mdev:	DRBD device.
+ * @e:		epoch entry
+ * @rw:		flag field, see bio->bi_rw
+ */
+/* TODO allocate from our own bio_set. */
+int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
+		const unsigned rw, const int fault_type)
+{
+	struct bio *bios = NULL;
+	struct bio *bio;
+	struct page *page = e->pages;
+	sector_t sector = e->sector;
+	unsigned ds = e->size;
+	unsigned n_bios = 0;
+	unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
+
+	if (atomic_read(&mdev->new_c_uuid)) {
+		if (atomic_add_unless(&mdev->new_c_uuid, -1, 1)) {
+			drbd_uuid_new_current(mdev);
+			drbd_md_sync(mdev);
+
+			atomic_dec(&mdev->new_c_uuid);
+			wake_up(&mdev->misc_wait);
+		}
+		wait_event(mdev->misc_wait, !atomic_read(&mdev->new_c_uuid));
+	}
+
+	/* In most cases, we will only need one bio.  But in case the lower
+	 * level restrictions happen to be different at this offset on this
+	 * side than those of the sending peer, we may need to submit the
+	 * request in more than one bio. */
+next_bio:
+	bio = bio_alloc(GFP_NOIO, nr_pages);
+	if (!bio) {
+		dev_err(DEV, "submit_ee: Allocation of a bio failed\n");
+		goto fail;
+	}
+	/* > e->sector, unless this is the first bio */
+	bio->bi_sector = sector;
+	bio->bi_bdev = mdev->ldev->backing_bdev;
+	/* we special case some flags in the multi-bio case, see below
+	 * (BIO_RW_UNPLUG, BIO_RW_BARRIER) */
+	bio->bi_rw = rw;
+	bio->bi_private = e;
+	bio->bi_end_io = drbd_endio_sec;
+
+	bio->bi_next = bios;
+	bios = bio;
+	++n_bios;
+
+	page_chain_for_each(page) {
+		unsigned len = min_t(unsigned, ds, PAGE_SIZE);
+		if (!bio_add_page(bio, page, len, 0)) {
+			/* a single page must always be possible! */
+			BUG_ON(bio->bi_vcnt == 0);
+			goto next_bio;
+		}
+		ds -= len;
+		sector += len >> 9;
+		--nr_pages;
+	}
+	D_ASSERT(page == NULL);
+	D_ASSERT(ds == 0);
+
+	atomic_set(&e->pending_bios, n_bios);
+	do {
+		bio = bios;
+		bios = bios->bi_next;
+		bio->bi_next = NULL;
+
+		/* strip off BIO_RW_UNPLUG unless it is the last bio */
+		if (bios)
+			bio->bi_rw &= ~(1<<BIO_RW_UNPLUG);
+
+		drbd_generic_make_request(mdev, fault_type, bio);
+
+		/* strip off BIO_RW_BARRIER,
+		 * unless it is the first or last bio */
+		if (bios && bios->bi_next)
+			bios->bi_rw &= ~(1<<BIO_RW_BARRIER);
+	} while (bios);
+	maybe_kick_lo(mdev);
+	return 0;
+
+fail:
+	while (bios) {
+		bio = bios;
+		bios = bios->bi_next;
+		bio_put(bio);
+	}
+	return -ENOMEM;
+}
+
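
nr_pages above is the usual round-up division, (size + PAGE_SIZE - 1) >> PAGE_SHIFT; with 4 KiB pages a 9216-byte request reserves 3 bio vectors, for example. Normally all pages fit into the single bio allocated first; only when bio_add_page() refuses a page because of lower-level queue limits is a second bio started, and the chained bios are then submitted back to back with the unplug/barrier bits kept only where they still make sense (see the comments in the submission loop).
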
 /**
  * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set
  * @mdev:	DRBD device.
@@ -1128,8 +1240,6 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo)
 int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
 int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
 {
 {
 	struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
 	struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
-	struct bio *bio = e->private_bio;
-
 	/* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
 	/* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
 	   (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
 	   (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
 	   so that we can finish that epoch in drbd_may_finish_epoch().
 	   so that we can finish that epoch in drbd_may_finish_epoch().
@@ -1143,33 +1253,17 @@ int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __relea
 	if (previous_epoch(mdev, e->epoch))
 	if (previous_epoch(mdev, e->epoch))
 		dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
 		dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
 
 
-	/* prepare bio for re-submit,
-	 * re-init volatile members */
 	/* we still have a local reference,
 	/* we still have a local reference,
 	 * get_ldev was done in receive_Data. */
 	 * get_ldev was done in receive_Data. */
-	bio->bi_bdev = mdev->ldev->backing_bdev;
-	bio->bi_sector = e->sector;
-	bio->bi_size = e->size;
-	bio->bi_idx = 0;
-
-	bio->bi_flags &= ~(BIO_POOL_MASK - 1);
-	bio->bi_flags |= 1 << BIO_UPTODATE;
-
-	/* don't know whether this is necessary: */
-	bio->bi_phys_segments = 0;
-	bio->bi_next = NULL;
-
-	/* these should be unchanged: */
-	/* bio->bi_end_io = drbd_endio_write_sec; */
-	/* bio->bi_vcnt = whatever; */
 
 
 	e->w.cb = e_end_block;
 	e->w.cb = e_end_block;
-
-	/* This is no longer a barrier request. */
-	bio->bi_rw &= ~(1UL << BIO_RW_BARRIER);
-
-	drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, bio);
-
+	if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_DT_WR) != 0) {
+		/* drbd_submit_ee fails for one reason only:
+		 * if was not able to allocate sufficient bios.
+		 * requeue, try again later. */
+		e->w.cb = w_e_reissue;
+		drbd_queue_work(&mdev->data.work, &e->w);
+	}
 	return 1;
 	return 1;
 }
 }
 
 
@@ -1261,13 +1355,13 @@ static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h)
 static struct drbd_epoch_entry *
 static struct drbd_epoch_entry *
 read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
 read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
 {
 {
+	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
 	struct drbd_epoch_entry *e;
 	struct drbd_epoch_entry *e;
-	struct bio_vec *bvec;
 	struct page *page;
 	struct page *page;
-	struct bio *bio;
-	int dgs, ds, i, rr;
+	int dgs, ds, rr;
 	void *dig_in = mdev->int_dig_in;
 	void *dig_in = mdev->int_dig_in;
 	void *dig_vv = mdev->int_dig_vv;
 	void *dig_vv = mdev->int_dig_vv;
+	unsigned long *data;
 
 
 	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
 	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
 		crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
 		crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
@@ -1286,29 +1380,44 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __
 	ERR_IF(data_size &  0x1ff) return NULL;
 	ERR_IF(data_size &  0x1ff) return NULL;
 	ERR_IF(data_size >  DRBD_MAX_SEGMENT_SIZE) return NULL;
 	ERR_IF(data_size >  DRBD_MAX_SEGMENT_SIZE) return NULL;
 
 
+	/* even though we trust our peer,
+	 * we sometimes have to double check. */
+	if (sector + (data_size>>9) > capacity) {
+		dev_err(DEV, "capacity: %llus < sector: %llus + size: %u\n",
+			(unsigned long long)capacity,
+			(unsigned long long)sector, data_size);
+		return NULL;
+	}
+
 	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
 	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
 	 * "criss-cross" setup, that might cause write-out on some other DRBD,
 	 * "criss-cross" setup, that might cause write-out on some other DRBD,
 	 * which in turn might block on the other node at this very place.  */
 	 * which in turn might block on the other node at this very place.  */
 	e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
 	e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
 	if (!e)
 	if (!e)
 		return NULL;
 		return NULL;
-	bio = e->private_bio;
+
 	ds = data_size;
 	ds = data_size;
-	bio_for_each_segment(bvec, bio, i) {
-		page = bvec->bv_page;
-		rr = drbd_recv(mdev, kmap(page), min_t(int, ds, PAGE_SIZE));
+	page = e->pages;
+	page_chain_for_each(page) {
+		unsigned len = min_t(int, ds, PAGE_SIZE);
+		data = kmap(page);
+		rr = drbd_recv(mdev, data, len);
+		if (FAULT_ACTIVE(mdev, DRBD_FAULT_RECEIVE)) {
+			dev_err(DEV, "Fault injection: Corrupting data on receive\n");
+			data[0] = data[0] ^ (unsigned long)-1;
+		}
 		kunmap(page);
 		kunmap(page);
-		if (rr != min_t(int, ds, PAGE_SIZE)) {
+		if (rr != len) {
 			drbd_free_ee(mdev, e);
 			drbd_free_ee(mdev, e);
 			dev_warn(DEV, "short read receiving data: read %d expected %d\n",
 			dev_warn(DEV, "short read receiving data: read %d expected %d\n",
-			     rr, min_t(int, ds, PAGE_SIZE));
+			     rr, len);
 			return NULL;
 			return NULL;
 		}
 		}
 		ds -= rr;
 		ds -= rr;
 	}
 	}
 
 
 	if (dgs) {
 	if (dgs) {
-		drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv);
+		drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv);
 		if (memcmp(dig_in, dig_vv, dgs)) {
 		if (memcmp(dig_in, dig_vv, dgs)) {
 			dev_err(DEV, "Digest integrity check FAILED.\n");
 			dev_err(DEV, "Digest integrity check FAILED.\n");
 			drbd_bcast_ee(mdev, "digest failed",
 			drbd_bcast_ee(mdev, "digest failed",
@@ -1330,7 +1439,10 @@ static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
 	int rr, rv = 1;
 	int rr, rv = 1;
 	void *data;
 	void *data;
 
 
-	page = drbd_pp_alloc(mdev, 1);
+	if (!data_size)
+		return TRUE;
+
+	page = drbd_pp_alloc(mdev, 1, 1);
 
 
 	data = kmap(page);
 	data = kmap(page);
 	while (data_size) {
 	while (data_size) {
@@ -1394,7 +1506,7 @@ static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
 	}
 	}
 
 
 	if (dgs) {
 	if (dgs) {
-		drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv);
+		drbd_csum_bio(mdev, mdev->integrity_r_tfm, bio, dig_vv);
 		if (memcmp(dig_in, dig_vv, dgs)) {
 		if (memcmp(dig_in, dig_vv, dgs)) {
 			dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
 			dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
 			return 0;
 			return 0;
@@ -1415,7 +1527,7 @@ static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int u
 
 
 	D_ASSERT(hlist_unhashed(&e->colision));
 	D_ASSERT(hlist_unhashed(&e->colision));
 
 
-	if (likely(drbd_bio_uptodate(e->private_bio))) {
+	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
 		drbd_set_in_sync(mdev, sector, e->size);
 		drbd_set_in_sync(mdev, sector, e->size);
 		ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e);
 		ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e);
 	} else {
 	} else {
@@ -1434,30 +1546,28 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si
 	struct drbd_epoch_entry *e;
 	struct drbd_epoch_entry *e;
 
 
 	e = read_in_block(mdev, ID_SYNCER, sector, data_size);
 	e = read_in_block(mdev, ID_SYNCER, sector, data_size);
-	if (!e) {
-		put_ldev(mdev);
-		return FALSE;
-	}
+	if (!e)
+		goto fail;
 
 
 	dec_rs_pending(mdev);
 	dec_rs_pending(mdev);
 
 
-	e->private_bio->bi_end_io = drbd_endio_write_sec;
-	e->private_bio->bi_rw = WRITE;
-	e->w.cb = e_end_resync_block;
-
 	inc_unacked(mdev);
 	inc_unacked(mdev);
 	/* corresponding dec_unacked() in e_end_resync_block()
 	/* corresponding dec_unacked() in e_end_resync_block()
 	 * respective _drbd_clear_done_ee */
 	 * respective _drbd_clear_done_ee */
 
 
+	e->w.cb = e_end_resync_block;
+
 	spin_lock_irq(&mdev->req_lock);
 	spin_lock_irq(&mdev->req_lock);
 	list_add(&e->w.list, &mdev->sync_ee);
 	list_add(&e->w.list, &mdev->sync_ee);
 	spin_unlock_irq(&mdev->req_lock);
 	spin_unlock_irq(&mdev->req_lock);
 
 
-	drbd_generic_make_request(mdev, DRBD_FAULT_RS_WR, e->private_bio);
-	/* accounting done in endio */
+	if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
+		return TRUE;
 
 
-	maybe_kick_lo(mdev);
-	return TRUE;
+	drbd_free_ee(mdev, e);
+fail:
+	put_ldev(mdev);
+	return FALSE;
 }
 }
 
 
 static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
 static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
@@ -1552,7 +1662,7 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 	}
 	}
 
 
 	if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
 	if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
-		if (likely(drbd_bio_uptodate(e->private_bio))) {
+		if (likely((e->flags & EE_WAS_ERROR) == 0)) {
 			pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
 			pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
 				mdev->state.conn <= C_PAUSED_SYNC_T &&
 				mdev->state.conn <= C_PAUSED_SYNC_T &&
 				e->flags & EE_MAY_SET_IN_SYNC) ?
 				e->flags & EE_MAY_SET_IN_SYNC) ?
@@ -1698,7 +1808,6 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
 		return FALSE;
 		return FALSE;
 	}
 	}
 
 
-	e->private_bio->bi_end_io = drbd_endio_write_sec;
 	e->w.cb = e_end_block;
 	e->w.cb = e_end_block;
 
 
 	spin_lock(&mdev->epoch_lock);
 	spin_lock(&mdev->epoch_lock);
@@ -1894,12 +2003,8 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
 		drbd_al_begin_io(mdev, e->sector);
 		drbd_al_begin_io(mdev, e->sector);
 	}
 	}
 
 
-	e->private_bio->bi_rw = rw;
-	drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, e->private_bio);
-	/* accounting done in endio */
-
-	maybe_kick_lo(mdev);
-	return TRUE;
+	if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
+		return TRUE;
 
 
 out_interrupted:
 out_interrupted:
 	/* yes, the epoch_size now is imbalanced.
 	/* yes, the epoch_size now is imbalanced.
@@ -1945,7 +2050,7 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
 			    "no local data.\n");
 			    "no local data.\n");
 		drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY :
 		drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY :
 				 P_NEG_RS_DREPLY , p);
 				 P_NEG_RS_DREPLY , p);
-		return TRUE;
+		return drbd_drain_block(mdev, h->length - brps);
 	}
 	}
 
 
 	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
 	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
@@ -1957,9 +2062,6 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
 		return FALSE;
 		return FALSE;
 	}
 	}
 
 
-	e->private_bio->bi_rw = READ;
-	e->private_bio->bi_end_io = drbd_endio_read_sec;
-
 	switch (h->command) {
 	switch (h->command) {
 	case P_DATA_REQUEST:
 	case P_DATA_REQUEST:
 		e->w.cb = w_e_end_data_req;
 		e->w.cb = w_e_end_data_req;
@@ -2053,10 +2155,8 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
 
 
 	inc_unacked(mdev);
 	inc_unacked(mdev);
 
 
-	drbd_generic_make_request(mdev, fault_type, e->private_bio);
-	maybe_kick_lo(mdev);
-
-	return TRUE;
+	if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
+		return TRUE;
 
 
 out_free_e:
 out_free_e:
 	kfree(di);
 	kfree(di);
@@ -2473,6 +2573,9 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
 		     hg > 0 ? "source" : "target");
 		     hg > 0 ? "source" : "target");
 	}
 	}
 
 
+	if (abs(hg) == 100)
+		drbd_khelper(mdev, "initial-split-brain");
+
 	if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
 	if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
 		int pcount = (mdev->state.role == R_PRIMARY)
 		int pcount = (mdev->state.role == R_PRIMARY)
 			   + (peer_role == R_PRIMARY);
 			   + (peer_role == R_PRIMARY);
@@ -2518,7 +2621,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_rol
 		 * after an attempted attach on a diskless node.
 		 * after an attempted attach on a diskless node.
 		 * We just refuse to attach -- well, we drop the "connection"
 		 * We just refuse to attach -- well, we drop the "connection"
 		 * to that disk, in a way... */
 		 * to that disk, in a way... */
-		dev_alert(DEV, "Split-Brain detected, dropping connection!\n");
+		dev_alert(DEV, "Split-Brain detected but unresolved, dropping connection!\n");
 		drbd_khelper(mdev, "split-brain");
 		drbd_khelper(mdev, "split-brain");
 		return C_MASK;
 		return C_MASK;
 	}
 	}
@@ -2849,7 +2952,7 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
 	unsigned int max_seg_s;
 	unsigned int max_seg_s;
 	sector_t p_size, p_usize, my_usize;
 	sector_t p_size, p_usize, my_usize;
 	int ldsc = 0; /* local disk size changed */
 	int ldsc = 0; /* local disk size changed */
-	enum drbd_conns nconn;
+	enum dds_flags ddsf;
 
 
 	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
 	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
 	if (drbd_recv(mdev, h->payload, h->length) != h->length)
 	if (drbd_recv(mdev, h->payload, h->length) != h->length)
@@ -2905,8 +3008,9 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
 	}
 	}
 #undef min_not_zero
 #undef min_not_zero
 
 
+	ddsf = be16_to_cpu(p->dds_flags);
 	if (get_ldev(mdev)) {
 	if (get_ldev(mdev)) {
-	  dd = drbd_determin_dev_size(mdev, 0);
+		dd = drbd_determin_dev_size(mdev, ddsf);
 		put_ldev(mdev);
 		put_ldev(mdev);
 		if (dd == dev_size_error)
 		if (dd == dev_size_error)
 			return FALSE;
 			return FALSE;
@@ -2916,33 +3020,21 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
 		drbd_set_my_capacity(mdev, p_size);
 		drbd_set_my_capacity(mdev, p_size);
 	}
 	}
 
 
-	if (mdev->p_uuid && mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
-		nconn = drbd_sync_handshake(mdev,
-				mdev->state.peer, mdev->state.pdsk);
-		put_ldev(mdev);
-
-		if (nconn == C_MASK) {
-			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
-			return FALSE;
-		}
-
-		if (drbd_request_state(mdev, NS(conn, nconn)) < SS_SUCCESS) {
-			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
-			return FALSE;
-		}
-	}
-
 	if (get_ldev(mdev)) {
 	if (get_ldev(mdev)) {
 		if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
 		if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
 			mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
 			mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
 			ldsc = 1;
 			ldsc = 1;
 		}
 		}
 
 
-		max_seg_s = be32_to_cpu(p->max_segment_size);
+		if (mdev->agreed_pro_version < 94)
+			max_seg_s = be32_to_cpu(p->max_segment_size);
+		else /* drbd 8.3.8 onwards */
+			max_seg_s = DRBD_MAX_SEGMENT_SIZE;
+
 		if (max_seg_s != queue_max_segment_size(mdev->rq_queue))
 		if (max_seg_s != queue_max_segment_size(mdev->rq_queue))
 			drbd_setup_queue_param(mdev, max_seg_s);
 			drbd_setup_queue_param(mdev, max_seg_s);
 
 
-		drbd_setup_order_type(mdev, be32_to_cpu(p->queue_order_type));
+		drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type));
 		put_ldev(mdev);
 		put_ldev(mdev);
 	}
 	}
 
 
@@ -2951,14 +3043,17 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
 		    drbd_get_capacity(mdev->this_bdev) || ldsc) {
 		    drbd_get_capacity(mdev->this_bdev) || ldsc) {
 			/* we have different sizes, probably peer
 			/* we have different sizes, probably peer
 			 * needs to know my new size... */
 			 * needs to know my new size... */
-			drbd_send_sizes(mdev, 0);
+			drbd_send_sizes(mdev, 0, ddsf);
 		}
 		}
 		if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
 		if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
 		    (dd == grew && mdev->state.conn == C_CONNECTED)) {
 		    (dd == grew && mdev->state.conn == C_CONNECTED)) {
 			if (mdev->state.pdsk >= D_INCONSISTENT &&
 			if (mdev->state.pdsk >= D_INCONSISTENT &&
-			    mdev->state.disk >= D_INCONSISTENT)
-				resync_after_online_grow(mdev);
-			else
+			    mdev->state.disk >= D_INCONSISTENT) {
+				if (ddsf & DDSF_NO_RESYNC)
+					dev_info(DEV, "Resync of new storage suppressed with --assume-clean\n");
+				else
+					resync_after_online_grow(mdev);
+			} else
 				set_bit(RESYNC_AFTER_NEG, &mdev->flags);
 				set_bit(RESYNC_AFTER_NEG, &mdev->flags);
 		}
 		}
 	}
 	}
@@ -3490,6 +3585,92 @@ static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
 	return TRUE;
 	return TRUE;
 }
 }
 
 
+static void timeval_sub_us(struct timeval* tv, unsigned int us)
+{
+	tv->tv_sec -= us / 1000000;
+	us = us % 1000000;
+	if (tv->tv_usec > us) {
+		tv->tv_usec += 1000000;
+		tv->tv_sec--;
+	}
+	tv->tv_usec -= us;
+}
+
+static void got_delay_probe(struct drbd_conf *mdev, int from, struct p_delay_probe *p)
+{
+	struct delay_probe *dp;
+	struct list_head *le;
+	struct timeval now;
+	int seq_num;
+	int offset;
+	int data_delay;
+
+	seq_num = be32_to_cpu(p->seq_num);
+	offset  = be32_to_cpu(p->offset);
+
+	spin_lock(&mdev->peer_seq_lock);
+	if (!list_empty(&mdev->delay_probes)) {
+		if (from == USE_DATA_SOCKET)
+			le = mdev->delay_probes.next;
+		else
+			le = mdev->delay_probes.prev;
+
+		dp = list_entry(le, struct delay_probe, list);
+
+		if (dp->seq_num == seq_num) {
+			list_del(le);
+			spin_unlock(&mdev->peer_seq_lock);
+			do_gettimeofday(&now);
+			timeval_sub_us(&now, offset);
+			data_delay =
+				now.tv_usec - dp->time.tv_usec +
+				(now.tv_sec - dp->time.tv_sec) * 1000000;
+
+			if (data_delay > 0)
+				mdev->data_delay = data_delay;
+
+			kfree(dp);
+			return;
+		}
+
+		if (dp->seq_num > seq_num) {
+			spin_unlock(&mdev->peer_seq_lock);
+			dev_warn(DEV, "Previous allocation failure of struct delay_probe?\n");
+			return; /* Do not allocate a struct delay_probe.... */
+		}
+	}
+	spin_unlock(&mdev->peer_seq_lock);
+
+	dp = kmalloc(sizeof(struct delay_probe), GFP_NOIO);
+	if (!dp) {
+		dev_warn(DEV, "Failed to allocate a struct delay_probe, do not worry.\n");
+		return;
+	}
+
+	dp->seq_num = seq_num;
+	do_gettimeofday(&dp->time);
+	timeval_sub_us(&dp->time, offset);
+
+	spin_lock(&mdev->peer_seq_lock);
+	if (from == USE_DATA_SOCKET)
+		list_add(&dp->list, &mdev->delay_probes);
+	else
+		list_add_tail(&dp->list, &mdev->delay_probes);
+	spin_unlock(&mdev->peer_seq_lock);
+}
+
+static int receive_delay_probe(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_delay_probe *p = (struct p_delay_probe *)h;
+
+	ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
+	if (drbd_recv(mdev, h->payload, h->length) != h->length)
+		return FALSE;
+
+	got_delay_probe(mdev, USE_DATA_SOCKET, p);
+	return TRUE;
+}
+
 typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *);
 
 static drbd_cmd_handler_f drbd_default_handler[] = {
@@ -3513,6 +3694,7 @@ static drbd_cmd_handler_f drbd_default_handler[] = {
 	[P_OV_REQUEST]      = receive_DataRequest,
 	[P_OV_REQUEST]      = receive_DataRequest,
 	[P_OV_REPLY]        = receive_DataRequest,
 	[P_OV_REPLY]        = receive_DataRequest,
 	[P_CSUM_RS_REQUEST]    = receive_DataRequest,
 	[P_CSUM_RS_REQUEST]    = receive_DataRequest,
+	[P_DELAY_PROBE]     = receive_delay_probe,
 	/* anything missing from this table is in
 	/* anything missing from this table is in
 	 * the asender_tbl, see get_asender_cmd */
 	 * the asender_tbl, see get_asender_cmd */
 	[P_MAX_CMD]	    = NULL,
 	[P_MAX_CMD]	    = NULL,
@@ -3739,7 +3921,7 @@ static void drbd_disconnect(struct drbd_conf *mdev)
 		dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
 		dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
 	i = atomic_read(&mdev->pp_in_use);
 	i = atomic_read(&mdev->pp_in_use);
 	if (i)
 	if (i)
-		dev_info(DEV, "pp_in_use = %u, expected 0\n", i);
+		dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
 
 
 	D_ASSERT(list_empty(&mdev->read_ee));
 	D_ASSERT(list_empty(&mdev->read_ee));
 	D_ASSERT(list_empty(&mdev->active_ee));
 	D_ASSERT(list_empty(&mdev->active_ee));
@@ -4232,7 +4414,6 @@ static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
 
 
 	sector = be64_to_cpu(p->sector);
 	sector = be64_to_cpu(p->sector);
 	size = be32_to_cpu(p->blksize);
 	size = be32_to_cpu(p->blksize);
-	D_ASSERT(p->block_id == ID_SYNCER);
 
 
 	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
 	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
 
 
@@ -4290,6 +4471,14 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
 	return TRUE;
 	return TRUE;
 }
 }
 
 
+static int got_delay_probe_m(struct drbd_conf *mdev, struct p_header *h)
+{
+	struct p_delay_probe *p = (struct p_delay_probe *)h;
+
+	got_delay_probe(mdev, USE_META_SOCKET, p);
+	return TRUE;
+}
+
 struct asender_cmd {
 struct asender_cmd {
 	size_t pkt_size;
 	size_t pkt_size;
 	int (*process)(struct drbd_conf *mdev, struct p_header *h);
 	int (*process)(struct drbd_conf *mdev, struct p_header *h);
@@ -4314,6 +4503,7 @@ static struct asender_cmd *get_asender_cmd(int cmd)
 	[P_BARRIER_ACK]	    = { sizeof(struct p_barrier_ack), got_BarrierAck },
 	[P_BARRIER_ACK]	    = { sizeof(struct p_barrier_ack), got_BarrierAck },
 	[P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
 	[P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
 	[P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
 	[P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
+	[P_DELAY_PROBE]     = { sizeof(struct p_delay_probe), got_delay_probe_m },
 	[P_MAX_CMD]	    = { 0, NULL },
 	[P_MAX_CMD]	    = { 0, NULL },
 	};
 	};
 	if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
 	if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
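
Note: the two table entries added above register the same P_DELAY_PROBE packet on both the data and the meta socket; got_delay_probe() (top of this section) pairs the two arrivals of one probe by seq_num and stores the difference as mdev->data_delay. A minimal sketch of that pairing arithmetic follows; the helper and variable names here are illustrative only, and just seq_num and the timeval stamp come from the patch itself.

/* Sketch only -- not driver code.  Pairs a data-socket probe with its
 * meta-socket twin and returns how much later the data path delivered
 * it, in microseconds. */
static inline long tv_delta_us(const struct timeval *late,
			       const struct timeval *early)
{
	return (late->tv_sec - early->tv_sec) * 1000000L +
	       (late->tv_usec - early->tv_usec);
}

static long estimate_data_delay(const struct delay_probe *via_data,
				const struct delay_probe *via_meta)
{
	if (via_data->seq_num != via_meta->seq_num)
		return -1;	/* different probes, nothing to compare */
	return tv_delta_us(&via_data->time, &via_meta->time);
}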

+ 27 - 13
drivers/block/drbd/drbd_req.c

@@ -722,6 +722,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
 	struct drbd_request *req;
 	struct drbd_request *req;
 	int local, remote;
 	int local, remote;
 	int err = -EIO;
 	int err = -EIO;
+	int ret = 0;
 
 
 	/* allocate outside of all locks; */
 	/* allocate outside of all locks; */
 	req = drbd_req_new(mdev, bio);
 	req = drbd_req_new(mdev, bio);
@@ -784,7 +785,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
 			    (mdev->state.pdsk == D_INCONSISTENT &&
 			    (mdev->state.pdsk == D_INCONSISTENT &&
 			     mdev->state.conn >= C_CONNECTED));
 			     mdev->state.conn >= C_CONNECTED));
 
 
-	if (!(local || remote)) {
+	if (!(local || remote) && !mdev->state.susp) {
 		dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
 		dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
 		goto fail_free_complete;
 		goto fail_free_complete;
 	}
 	}
@@ -810,6 +811,16 @@ allocate_barrier:
 	/* GOOD, everything prepared, grab the spin_lock */
 	/* GOOD, everything prepared, grab the spin_lock */
 	spin_lock_irq(&mdev->req_lock);
 	spin_lock_irq(&mdev->req_lock);
 
 
+	if (mdev->state.susp) {
+		/* If we got suspended, use the retry mechanism of
+		   generic_make_request() to restart processing of this
+		   bio. In the next call to drbd_make_request_26
+		   we sleep in inc_ap_bio() */
+		ret = 1;
+		spin_unlock_irq(&mdev->req_lock);
+		goto fail_free_complete;
+	}
+
 	if (remote) {
 	if (remote) {
 		remote = (mdev->state.pdsk == D_UP_TO_DATE ||
 		remote = (mdev->state.pdsk == D_UP_TO_DATE ||
 			    (mdev->state.pdsk == D_INCONSISTENT &&
 			    (mdev->state.pdsk == D_INCONSISTENT &&
@@ -947,12 +958,14 @@ fail_and_free_req:
 		req->private_bio = NULL;
 		req->private_bio = NULL;
 		put_ldev(mdev);
 		put_ldev(mdev);
 	}
 	}
-	bio_endio(bio, err);
+	if (!ret)
+		bio_endio(bio, err);
+
 	drbd_req_free(req);
 	drbd_req_free(req);
 	dec_ap_bio(mdev);
 	dec_ap_bio(mdev);
 	kfree(b);
 	kfree(b);
 
 
-	return 0;
+	return ret;
 }
 }
 
 
 /* helper function for drbd_make_request
 /* helper function for drbd_make_request
@@ -962,11 +975,6 @@ fail_and_free_req:
  */
  */
 static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
 static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
 {
 {
-	/* Unconfigured */
-	if (mdev->state.conn == C_DISCONNECTING &&
-	    mdev->state.disk == D_DISKLESS)
-		return 1;
-
 	if (mdev->state.role != R_PRIMARY &&
 	if (mdev->state.role != R_PRIMARY &&
 		(!allow_oos || is_write)) {
 		(!allow_oos || is_write)) {
 		if (__ratelimit(&drbd_ratelimit_state)) {
 		if (__ratelimit(&drbd_ratelimit_state)) {
@@ -1070,15 +1078,21 @@ int drbd_make_request_26(struct request_queue *q, struct bio *bio)
 
 
 		/* we need to get a "reference count" (ap_bio_cnt)
 		/* we need to get a "reference count" (ap_bio_cnt)
 		 * to avoid races with the disconnect/reconnect/suspend code.
 		 * to avoid races with the disconnect/reconnect/suspend code.
-		 * In case we need to split the bio here, we need to get two references
+		 * In case we need to split the bio here, we need to get three references
 		 * atomically, otherwise we might deadlock when trying to submit the
 		 * atomically, otherwise we might deadlock when trying to submit the
 		 * second one! */
 		 * second one! */
-		inc_ap_bio(mdev, 2);
+		inc_ap_bio(mdev, 3);
 
 
 		D_ASSERT(e_enr == s_enr + 1);
 		D_ASSERT(e_enr == s_enr + 1);
 
 
-		drbd_make_request_common(mdev, &bp->bio1);
-		drbd_make_request_common(mdev, &bp->bio2);
+		while (drbd_make_request_common(mdev, &bp->bio1))
+			inc_ap_bio(mdev, 1);
+
+		while (drbd_make_request_common(mdev, &bp->bio2))
+			inc_ap_bio(mdev, 1);
+
+		dec_ap_bio(mdev);
+
 		bio_pair_release(bp);
 		bio_pair_release(bp);
 	}
 	}
 	return 0;
 	return 0;
@@ -1115,7 +1129,7 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
 	} else if (limit && get_ldev(mdev)) {
 	} else if (limit && get_ldev(mdev)) {
 		struct request_queue * const b =
 		struct request_queue * const b =
 			mdev->ldev->backing_bdev->bd_disk->queue;
 			mdev->ldev->backing_bdev->bd_disk->queue;
-		if (b->merge_bvec_fn && mdev->ldev->dc.use_bmbv) {
+		if (b->merge_bvec_fn) {
 			backing_limit = b->merge_bvec_fn(b, bvm, bvec);
 			backing_limit = b->merge_bvec_fn(b, bvm, bvec);
 			limit = min(limit, backing_limit);
 			limit = min(limit, backing_limit);
 		}
 		}
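
Note: the return-value contract introduced above is easy to miss: drbd_make_request_common() now returns 0 once it owns the bio and 1 when the device is suspended, in which case the caller must resubmit. For the split-bio path this leads to the reference counting shown in drbd_make_request_26(); spelled out as a sketch below (the called names match the hunks above, but the wrapper function itself is hypothetical).

/* Sketch of the split-bio submit path.  Each half needs its own ap_bio
 * reference and one extra is taken so that all of them are grabbed
 * atomically up front; a return value of 1 means "suspended, try again",
 * and the failed attempt has already dropped its reference, so one is
 * re-taken before retrying (per the patch comment, inc_ap_bio() sleeps
 * while the device stays suspended). */
static void drbd_submit_split(struct drbd_conf *mdev, struct bio_pair *bp)
{
	inc_ap_bio(mdev, 3);

	while (drbd_make_request_common(mdev, &bp->bio1))
		inc_ap_bio(mdev, 1);

	while (drbd_make_request_common(mdev, &bp->bio2))
		inc_ap_bio(mdev, 1);

	dec_ap_bio(mdev);		/* drop the extra reference */
	bio_pair_release(bp);
}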

+ 1 - 1
drivers/block/drbd/drbd_strings.c

@@ -70,7 +70,7 @@ static const char *drbd_disk_s_names[] = {

 static const char *drbd_state_sw_errors[] = {
 	[-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
-	[-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk",
+	[-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data",
 	[-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
 	[-SS_NO_REMOTE_DISK] = "Can not resync without remote disk",
 	[-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected",

+ 124 - 82
drivers/block/drbd/drbd_worker.c

@@ -47,8 +47,7 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca
 
 
 /* defined here:
 /* defined here:
    drbd_md_io_complete
    drbd_md_io_complete
-   drbd_endio_write_sec
-   drbd_endio_read_sec
+   drbd_endio_sec
    drbd_endio_pri
    drbd_endio_pri
 
 
  * more endio handlers:
  * more endio handlers:
@@ -85,27 +84,10 @@ void drbd_md_io_complete(struct bio *bio, int error)
 /* reads on behalf of the partner,
 /* reads on behalf of the partner,
  * "submitted" by the receiver
  * "submitted" by the receiver
  */
  */
-void drbd_endio_read_sec(struct bio *bio, int error) __releases(local)
+void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
 {
 {
 	unsigned long flags = 0;
 	unsigned long flags = 0;
-	struct drbd_epoch_entry *e = NULL;
-	struct drbd_conf *mdev;
-	int uptodate = bio_flagged(bio, BIO_UPTODATE);
-
-	e = bio->bi_private;
-	mdev = e->mdev;
-
-	if (error)
-		dev_warn(DEV, "read: error=%d s=%llus\n", error,
-				(unsigned long long)e->sector);
-	if (!error && !uptodate) {
-		dev_warn(DEV, "read: setting error to -EIO s=%llus\n",
-				(unsigned long long)e->sector);
-		/* strange behavior of some lower level drivers...
-		 * fail the request by clearing the uptodate flag,
-		 * but do not return any error?! */
-		error = -EIO;
-	}
+	struct drbd_conf *mdev = e->mdev;
 
 
 	D_ASSERT(e->block_id != ID_VACANT);
 	D_ASSERT(e->block_id != ID_VACANT);
 
 
@@ -114,49 +96,38 @@ void drbd_endio_read_sec(struct bio *bio, int error) __releases(local)
 	list_del(&e->w.list);
 	list_del(&e->w.list);
 	if (list_empty(&mdev->read_ee))
 	if (list_empty(&mdev->read_ee))
 		wake_up(&mdev->ee_wait);
 		wake_up(&mdev->ee_wait);
+	if (test_bit(__EE_WAS_ERROR, &e->flags))
+		__drbd_chk_io_error(mdev, FALSE);
 	spin_unlock_irqrestore(&mdev->req_lock, flags);
 	spin_unlock_irqrestore(&mdev->req_lock, flags);
 
 
-	drbd_chk_io_error(mdev, error, FALSE);
 	drbd_queue_work(&mdev->data.work, &e->w);
 	drbd_queue_work(&mdev->data.work, &e->w);
 	put_ldev(mdev);
 	put_ldev(mdev);
 }
 }
 
 
+static int is_failed_barrier(int ee_flags)
+{
+	return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED))
+			== (EE_IS_BARRIER|EE_WAS_ERROR);
+}
+
 /* writes on behalf of the partner, or resync writes,
 /* writes on behalf of the partner, or resync writes,
- * "submitted" by the receiver.
- */
-void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
+ * "submitted" by the receiver, final stage.  */
+static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
 {
 {
 	unsigned long flags = 0;
 	unsigned long flags = 0;
-	struct drbd_epoch_entry *e = NULL;
-	struct drbd_conf *mdev;
+	struct drbd_conf *mdev = e->mdev;
 	sector_t e_sector;
 	sector_t e_sector;
 	int do_wake;
 	int do_wake;
 	int is_syncer_req;
 	int is_syncer_req;
 	int do_al_complete_io;
 	int do_al_complete_io;
-	int uptodate = bio_flagged(bio, BIO_UPTODATE);
-	int is_barrier = bio_rw_flagged(bio, BIO_RW_BARRIER);
-
-	e = bio->bi_private;
-	mdev = e->mdev;
 
 
-	if (error)
-		dev_warn(DEV, "write: error=%d s=%llus\n", error,
-				(unsigned long long)e->sector);
-	if (!error && !uptodate) {
-		dev_warn(DEV, "write: setting error to -EIO s=%llus\n",
-				(unsigned long long)e->sector);
-		/* strange behavior of some lower level drivers...
-		 * fail the request by clearing the uptodate flag,
-		 * but do not return any error?! */
-		error = -EIO;
-	}
-
-	/* error == -ENOTSUPP would be a better test,
-	 * alas it is not reliable */
-	if (error && is_barrier && e->flags & EE_IS_BARRIER) {
+	/* if this is a failed barrier request, disable use of barriers,
+	 * and schedule for resubmission */
+	if (is_failed_barrier(e->flags)) {
 		drbd_bump_write_ordering(mdev, WO_bdev_flush);
 		drbd_bump_write_ordering(mdev, WO_bdev_flush);
 		spin_lock_irqsave(&mdev->req_lock, flags);
 		spin_lock_irqsave(&mdev->req_lock, flags);
 		list_del(&e->w.list);
 		list_del(&e->w.list);
+		e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED;
 		e->w.cb = w_e_reissue;
 		e->w.cb = w_e_reissue;
 		/* put_ldev actually happens below, once we come here again. */
 		/* put_ldev actually happens below, once we come here again. */
 		__release(local);
 		__release(local);
@@ -167,17 +138,16 @@ void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
 
 
 	D_ASSERT(e->block_id != ID_VACANT);
 	D_ASSERT(e->block_id != ID_VACANT);
 
 
-	spin_lock_irqsave(&mdev->req_lock, flags);
-	mdev->writ_cnt += e->size >> 9;
-	is_syncer_req = is_syncer_block_id(e->block_id);
-
 	/* after we moved e to done_ee,
 	/* after we moved e to done_ee,
 	 * we may no longer access it,
 	 * we may no longer access it,
 	 * it may be freed/reused already!
 	 * it may be freed/reused already!
 	 * (as soon as we release the req_lock) */
 	 * (as soon as we release the req_lock) */
 	e_sector = e->sector;
 	e_sector = e->sector;
 	do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
 	do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
+	is_syncer_req = is_syncer_block_id(e->block_id);
 
 
+	spin_lock_irqsave(&mdev->req_lock, flags);
+	mdev->writ_cnt += e->size >> 9;
 	list_del(&e->w.list); /* has been on active_ee or sync_ee */
 	list_del(&e->w.list); /* has been on active_ee or sync_ee */
 	list_add_tail(&e->w.list, &mdev->done_ee);
 	list_add_tail(&e->w.list, &mdev->done_ee);
 
 
@@ -190,7 +160,7 @@ void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
 		? list_empty(&mdev->sync_ee)
 		? list_empty(&mdev->sync_ee)
 		: list_empty(&mdev->active_ee);
 		: list_empty(&mdev->active_ee);
 
 
-	if (error)
+	if (test_bit(__EE_WAS_ERROR, &e->flags))
 		__drbd_chk_io_error(mdev, FALSE);
 		__drbd_chk_io_error(mdev, FALSE);
 	spin_unlock_irqrestore(&mdev->req_lock, flags);
 	spin_unlock_irqrestore(&mdev->req_lock, flags);
 
 
@@ -205,7 +175,42 @@ void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
 
 
 	wake_asender(mdev);
 	wake_asender(mdev);
 	put_ldev(mdev);
 	put_ldev(mdev);
+}
+
+/* writes on behalf of the partner, or resync writes,
+ * "submitted" by the receiver.
+ */
+void drbd_endio_sec(struct bio *bio, int error)
+{
+	struct drbd_epoch_entry *e = bio->bi_private;
+	struct drbd_conf *mdev = e->mdev;
+	int uptodate = bio_flagged(bio, BIO_UPTODATE);
+	int is_write = bio_data_dir(bio) == WRITE;
+
+	if (error)
+		dev_warn(DEV, "%s: error=%d s=%llus\n",
+				is_write ? "write" : "read", error,
+				(unsigned long long)e->sector);
+	if (!error && !uptodate) {
+		dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
+				is_write ? "write" : "read",
+				(unsigned long long)e->sector);
+		/* strange behavior of some lower level drivers...
+		 * fail the request by clearing the uptodate flag,
+		 * but do not return any error?! */
+		error = -EIO;
+	}
+
+	if (error)
+		set_bit(__EE_WAS_ERROR, &e->flags);
 
 
+	bio_put(bio); /* no need for the bio anymore */
+	if (atomic_dec_and_test(&e->pending_bios)) {
+		if (is_write)
+			drbd_endio_write_sec_final(e);
+		else
+			drbd_endio_read_sec_final(e);
+	}
 }
 }
 
 
 /* read, readA or write requests on R_PRIMARY coming from drbd_make_request
 /* read, readA or write requests on R_PRIMARY coming from drbd_make_request
@@ -295,7 +300,34 @@ int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 	return 1; /* Simply ignore this! */
 	return 1; /* Simply ignore this! */
 }
 }
 
 
-void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
+void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest)
+{
+	struct hash_desc desc;
+	struct scatterlist sg;
+	struct page *page = e->pages;
+	struct page *tmp;
+	unsigned len;
+
+	desc.tfm = tfm;
+	desc.flags = 0;
+
+	sg_init_table(&sg, 1);
+	crypto_hash_init(&desc);
+
+	while ((tmp = page_chain_next(page))) {
+		/* all but the last page will be fully used */
+		sg_set_page(&sg, page, PAGE_SIZE, 0);
+		crypto_hash_update(&desc, &sg, sg.length);
+		page = tmp;
+	}
+	/* and now the last, possibly only partially used page */
+	len = e->size & (PAGE_SIZE - 1);
+	sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
+	crypto_hash_update(&desc, &sg, sg.length);
+	crypto_hash_final(&desc, digest);
+}
+
+void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
 {
 {
 	struct hash_desc desc;
 	struct hash_desc desc;
 	struct scatterlist sg;
 	struct scatterlist sg;
@@ -329,11 +361,11 @@ static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel
 		return 1;
 		return 1;
 	}
 	}
 
 
-	if (likely(drbd_bio_uptodate(e->private_bio))) {
+	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
 		digest_size = crypto_hash_digestsize(mdev->csums_tfm);
 		digest_size = crypto_hash_digestsize(mdev->csums_tfm);
 		digest = kmalloc(digest_size, GFP_NOIO);
 		digest = kmalloc(digest_size, GFP_NOIO);
 		if (digest) {
 		if (digest) {
-			drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest);
+			drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
 
 
 			inc_rs_pending(mdev);
 			inc_rs_pending(mdev);
 			ok = drbd_send_drequest_csum(mdev,
 			ok = drbd_send_drequest_csum(mdev,
@@ -369,23 +401,21 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
 	/* GFP_TRY, because if there is no memory available right now, this may
 	/* GFP_TRY, because if there is no memory available right now, this may
 	 * be rescheduled for later. It is "only" background resync, after all. */
 	 * be rescheduled for later. It is "only" background resync, after all. */
 	e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
 	e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
-	if (!e) {
-		put_ldev(mdev);
-		return 2;
-	}
+	if (!e)
+		goto fail;
 
 
 	spin_lock_irq(&mdev->req_lock);
 	spin_lock_irq(&mdev->req_lock);
 	list_add(&e->w.list, &mdev->read_ee);
 	list_add(&e->w.list, &mdev->read_ee);
 	spin_unlock_irq(&mdev->req_lock);
 	spin_unlock_irq(&mdev->req_lock);
 
 
-	e->private_bio->bi_end_io = drbd_endio_read_sec;
-	e->private_bio->bi_rw = READ;
 	e->w.cb = w_e_send_csum;
 	e->w.cb = w_e_send_csum;
+	if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
+		return 1;
 
 
-	mdev->read_cnt += size >> 9;
-	drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio);
-
-	return 1;
+	drbd_free_ee(mdev, e);
+fail:
+	put_ldev(mdev);
+	return 2;
 }
 }
 
 
 void resync_timer_fn(unsigned long data)
 void resync_timer_fn(unsigned long data)
@@ -414,13 +444,25 @@ void resync_timer_fn(unsigned long data)
 		drbd_queue_work(&mdev->data.work, &mdev->resync_work);
 		drbd_queue_work(&mdev->data.work, &mdev->resync_work);
 }
 }
 
 
+static int calc_resync_rate(struct drbd_conf *mdev)
+{
+	int d = mdev->data_delay / 1000; /* us -> ms */
+	int td = mdev->sync_conf.throttle_th * 100;  /* 0.1s -> ms */
+	int hd = mdev->sync_conf.hold_off_th * 100;  /* 0.1s -> ms */
+	int cr = mdev->sync_conf.rate;
+
+	return d <= td ? cr :
+		d >= hd ? 0 :
+		cr + (cr * (td - d) / (hd - td));
+}
+
 int w_make_resync_request(struct drbd_conf *mdev,
 int w_make_resync_request(struct drbd_conf *mdev,
 		struct drbd_work *w, int cancel)
 		struct drbd_work *w, int cancel)
 {
 {
 	unsigned long bit;
 	unsigned long bit;
 	sector_t sector;
 	sector_t sector;
 	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
 	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
-	int max_segment_size = queue_max_segment_size(mdev->rq_queue);
+	int max_segment_size;
 	int number, i, size, pe, mx;
 	int number, i, size, pe, mx;
 	int align, queued, sndbuf;
 	int align, queued, sndbuf;
 
 
@@ -446,7 +488,13 @@ int w_make_resync_request(struct drbd_conf *mdev,
 		return 1;
 		return 1;
 	}
 	}
 
 
-	number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
+	/* starting with drbd 8.3.8, we can handle multi-bio EEs,
+	 * if it should be necessary */
+	max_segment_size = mdev->agreed_pro_version < 94 ?
+		queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE;
+
+	mdev->c_sync_rate = calc_resync_rate(mdev);
+	number = SLEEP_TIME * mdev->c_sync_rate  / ((BM_BLOCK_SIZE / 1024) * HZ);
 	pe = atomic_read(&mdev->rs_pending_cnt);
 	pe = atomic_read(&mdev->rs_pending_cnt);
 
 
 	mutex_lock(&mdev->data.mutex);
 	mutex_lock(&mdev->data.mutex);
@@ -509,12 +557,6 @@ next_sector:
 		 *
 		 *
 		 * Additionally always align bigger requests, in order to
 		 * Additionally always align bigger requests, in order to
 		 * be prepared for all stripe sizes of software RAIDs.
 		 * be prepared for all stripe sizes of software RAIDs.
-		 *
-		 * we _do_ care about the agreed-upon q->max_segment_size
-		 * here, as splitting up the requests on the other side is more
-		 * difficult.  the consequence is, that on lvm and md and other
-		 * "indirect" devices, this is dead code, since
-		 * q->max_segment_size will be PAGE_SIZE.
 		 */
 		 */
 		align = 1;
 		align = 1;
 		for (;;) {
 		for (;;) {
@@ -806,7 +848,7 @@ out:
 /* helper */
 /* helper */
 static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
 static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
 {
 {
-	if (drbd_bio_has_active_page(e->private_bio)) {
+	if (drbd_ee_has_active_page(e)) {
 		/* This might happen if sendpage() has not finished */
 		/* This might happen if sendpage() has not finished */
 		spin_lock_irq(&mdev->req_lock);
 		spin_lock_irq(&mdev->req_lock);
 		list_add_tail(&e->w.list, &mdev->net_ee);
 		list_add_tail(&e->w.list, &mdev->net_ee);
@@ -832,7 +874,7 @@ int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 		return 1;
 		return 1;
 	}
 	}
 
 
-	if (likely(drbd_bio_uptodate(e->private_bio))) {
+	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
 		ok = drbd_send_block(mdev, P_DATA_REPLY, e);
 		ok = drbd_send_block(mdev, P_DATA_REPLY, e);
 	} else {
 	} else {
 		if (__ratelimit(&drbd_ratelimit_state))
 		if (__ratelimit(&drbd_ratelimit_state))
@@ -873,7 +915,7 @@ int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 		put_ldev(mdev);
 		put_ldev(mdev);
 	}
 	}
 
 
-	if (likely(drbd_bio_uptodate(e->private_bio))) {
+	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
 		if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
 		if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
 			inc_rs_pending(mdev);
 			inc_rs_pending(mdev);
 			ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
 			ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
@@ -921,7 +963,7 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 
 
 	di = (struct digest_info *)(unsigned long)e->block_id;
 	di = (struct digest_info *)(unsigned long)e->block_id;
 
 
-	if (likely(drbd_bio_uptodate(e->private_bio))) {
+	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
 		/* quick hack to try to avoid a race against reconfiguration.
 		/* quick hack to try to avoid a race against reconfiguration.
 		 * a real fix would be much more involved,
 		 * a real fix would be much more involved,
 		 * introducing more locking mechanisms */
 		 * introducing more locking mechanisms */
@@ -931,7 +973,7 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 			digest = kmalloc(digest_size, GFP_NOIO);
 			digest = kmalloc(digest_size, GFP_NOIO);
 		}
 		}
 		if (digest) {
 		if (digest) {
-			drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest);
+			drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
 			eq = !memcmp(digest, di->digest, digest_size);
 			eq = !memcmp(digest, di->digest, digest_size);
 			kfree(digest);
 			kfree(digest);
 		}
 		}
@@ -973,14 +1015,14 @@ int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 	if (unlikely(cancel))
 	if (unlikely(cancel))
 		goto out;
 		goto out;
 
 
-	if (unlikely(!drbd_bio_uptodate(e->private_bio)))
+	if (unlikely((e->flags & EE_WAS_ERROR) != 0))
 		goto out;
 		goto out;
 
 
 	digest_size = crypto_hash_digestsize(mdev->verify_tfm);
 	digest_size = crypto_hash_digestsize(mdev->verify_tfm);
 	/* FIXME if this allocation fails, online verify will not terminate! */
 	/* FIXME if this allocation fails, online verify will not terminate! */
 	digest = kmalloc(digest_size, GFP_NOIO);
 	digest = kmalloc(digest_size, GFP_NOIO);
 	if (digest) {
 	if (digest) {
-		drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest);
+		drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
 		inc_rs_pending(mdev);
 		inc_rs_pending(mdev);
 		ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
 		ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
 					     digest, digest_size, P_OV_REPLY);
 					     digest, digest_size, P_OV_REPLY);
@@ -1029,11 +1071,11 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 
 
 	di = (struct digest_info *)(unsigned long)e->block_id;
 	di = (struct digest_info *)(unsigned long)e->block_id;
 
 
-	if (likely(drbd_bio_uptodate(e->private_bio))) {
+	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
 		digest_size = crypto_hash_digestsize(mdev->verify_tfm);
 		digest_size = crypto_hash_digestsize(mdev->verify_tfm);
 		digest = kmalloc(digest_size, GFP_NOIO);
 		digest = kmalloc(digest_size, GFP_NOIO);
 		if (digest) {
 		if (digest) {
-			drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest);
+			drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
 
 
 			D_ASSERT(digest_size == di->digest_size);
 			D_ASSERT(digest_size == di->digest_size);
 			eq = !memcmp(digest, di->digest, digest_size);
 			eq = !memcmp(digest, di->digest, digest_size);
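
Note: calc_resync_rate() in the w_make_resync_request hunk above is a plain linear ramp between the configured rate and zero, driven by the measured data_delay. A standalone restatement with a worked example (the numbers are made up; the formula and unit conversions are the ones in the patch):

/* Sketch: same formula as calc_resync_rate(), pulled out of the driver.
 * delay_us is mdev->data_delay (microseconds); throttle_th and
 * hold_off_th are in tenths of a second; rate is in KiB/s. */
static int resync_rate(int delay_us, int throttle_th, int hold_off_th, int rate)
{
	int d  = delay_us / 1000;	/* us   -> ms */
	int td = throttle_th * 100;	/* 0.1s -> ms */
	int hd = hold_off_th * 100;	/* 0.1s -> ms */

	if (d <= td)
		return rate;		/* link keeps up: full speed */
	if (d >= hd)
		return 0;		/* link badly delayed: back off */
	/* linear ramp from 'rate' at d == td down to 0 at d == hd */
	return rate + rate * (td - d) / (hd - td);
}

/* Example: rate = 10000 KiB/s, throttle at 0.2 s, hold off at 1.0 s.
 * A measured delay of 600 ms sits halfway between the thresholds, so
 * this returns 10000 + 10000 * (200 - 600) / (1000 - 200) = 5000. */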

+ 1 - 15
drivers/block/drbd/drbd_wrappers.h

@@ -18,23 +18,9 @@ static inline void drbd_set_my_capacity(struct drbd_conf *mdev,

 #define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE)

-static inline int drbd_bio_has_active_page(struct bio *bio)
-{
-	struct bio_vec *bvec;
-	int i;
-
-	__bio_for_each_segment(bvec, bio, i, 0) {
-		if (page_count(bvec->bv_page) > 1)
-			return 1;
-	}
-
-	return 0;
-}
-
 /* bi_end_io handlers */
 extern void drbd_md_io_complete(struct bio *bio, int error);
-extern void drbd_endio_read_sec(struct bio *bio, int error);
-extern void drbd_endio_write_sec(struct bio *bio, int error);
+extern void drbd_endio_sec(struct bio *bio, int error);
 extern void drbd_endio_pri(struct bio *bio, int error);

 /*

+ 16 - 24
drivers/ide/ide-disk.c

@@ -407,32 +407,24 @@ static int ide_disk_get_capacity(ide_drive_t *drive)
 	return 0;
 }

-static u64 ide_disk_set_capacity(ide_drive_t *drive, u64 capacity)
+static void ide_disk_unlock_native_capacity(ide_drive_t *drive)
 {
-	u64 set = min(capacity, drive->probed_capacity);
 	u16 *id = drive->id;
 	int lba48 = ata_id_lba48_enabled(id);

 	if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 ||
 	    ata_id_hpa_enabled(id) == 0)
-		goto out;
+		return;
 
 
 	/*
 	 * according to the spec the SET MAX ADDRESS command shall be
 	 * immediately preceded by a READ NATIVE MAX ADDRESS command
 	 */
-	capacity = ide_disk_hpa_get_native_capacity(drive, lba48);
-	if (capacity == 0)
-		goto out;
-
-	set = ide_disk_hpa_set_capacity(drive, set, lba48);
-	if (set) {
-		/* needed for ->resume to disable HPA */
-		drive->dev_flags |= IDE_DFLAG_NOHPA;
-		return set;
-	}
-out:
-	return drive->capacity64;
+	if (!ide_disk_hpa_get_native_capacity(drive, lba48))
+		return;
+
+	if (ide_disk_hpa_set_capacity(drive, drive->probed_capacity, lba48))
+		drive->dev_flags |= IDE_DFLAG_NOHPA; /* disable HPA on resume */
 }

 static void idedisk_prepare_flush(struct request_queue *q, struct request *rq)
@@ -783,13 +775,13 @@ static int ide_disk_set_doorlock(ide_drive_t *drive, struct gendisk *disk,
 }

 const struct ide_disk_ops ide_ata_disk_ops = {
-	.check		= ide_disk_check,
-	.set_capacity	= ide_disk_set_capacity,
-	.get_capacity	= ide_disk_get_capacity,
-	.setup		= ide_disk_setup,
-	.flush		= ide_disk_flush,
-	.init_media	= ide_disk_init_media,
-	.set_doorlock	= ide_disk_set_doorlock,
-	.do_request	= ide_do_rw_disk,
-	.ioctl		= ide_disk_ioctl,
+	.check			= ide_disk_check,
+	.unlock_native_capacity	= ide_disk_unlock_native_capacity,
+	.get_capacity		= ide_disk_get_capacity,
+	.setup			= ide_disk_setup,
+	.flush			= ide_disk_flush,
+	.init_media		= ide_disk_init_media,
+	.set_doorlock		= ide_disk_set_doorlock,
+	.do_request		= ide_do_rw_disk,
+	.ioctl			= ide_disk_ioctl,
 };

+ 4 - 7
drivers/ide/ide-gd.c

@@ -288,17 +288,14 @@ static int ide_gd_media_changed(struct gendisk *disk)
 	return ret;
 }

-static unsigned long long ide_gd_set_capacity(struct gendisk *disk,
-					      unsigned long long capacity)
+static void ide_gd_unlock_native_capacity(struct gendisk *disk)
 {
 	struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj);
 	ide_drive_t *drive = idkp->drive;
 	const struct ide_disk_ops *disk_ops = drive->disk_ops;

-	if (disk_ops->set_capacity)
-		return disk_ops->set_capacity(drive, capacity);
-
-	return drive->capacity64;
+	if (disk_ops->unlock_native_capacity)
+		disk_ops->unlock_native_capacity(drive);
 }

 static int ide_gd_revalidate_disk(struct gendisk *disk)
@@ -329,7 +326,7 @@ static const struct block_device_operations ide_gd_ops = {
 	.locked_ioctl		= ide_gd_ioctl,
 	.getgeo			= ide_gd_getgeo,
 	.media_changed		= ide_gd_media_changed,
-	.set_capacity		= ide_gd_set_capacity,
+	.unlock_native_capacity	= ide_gd_unlock_native_capacity,
 	.revalidate_disk	= ide_gd_revalidate_disk
 };
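
Note: both IDE hunks above replace ->set_capacity(drive, capacity), which had to compute and return a capacity, with a void ->unlock_native_capacity(drive) hook: the driver only drops the Host Protected Area, and the block layer re-reads the now larger capacity when it rescans the partitions. A sketch of what a driver supplies under the new contract ("mydisk" and its helpers are made-up names, not code from this merge):

/* Sketch of the new, simpler contract: no return value, no capacity
 * arithmetic in the driver. */
static void mydisk_unlock_native_capacity(struct gendisk *disk)
{
	struct mydisk *md = disk->private_data;

	if (!mydisk_hpa_active(md))
		return;			/* nothing is hidden, nothing to do */

	/* Drop the Host Protected Area; on success the next capacity
	 * probe (get_capacity/revalidate) reports the full device. */
	if (mydisk_disable_hpa(md))
		md->flags |= MYDISK_NOHPA;	/* keep it off across resume */
}

static const struct block_device_operations mydisk_ops = {
	.owner			= THIS_MODULE,
	.unlock_native_capacity	= mydisk_unlock_native_capacity,
	.revalidate_disk	= mydisk_revalidate_disk,
};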
 
 

+ 218 - 39
fs/block_dev.c

@@ -417,7 +417,7 @@ int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
 	 */
 	 */
 	mutex_unlock(&bd_inode->i_mutex);
 	mutex_unlock(&bd_inode->i_mutex);
 
 
-	error = blkdev_issue_flush(bdev, NULL);
+	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
 	if (error == -EOPNOTSUPP)
 	if (error == -EOPNOTSUPP)
 		error = 0;
 		error = 0;
 
 
@@ -668,41 +668,209 @@ void bd_forget(struct inode *inode)
 		iput(bdev->bd_inode);
 		iput(bdev->bd_inode);
 }
 }
 
 
-int bd_claim(struct block_device *bdev, void *holder)
+/**
+ * bd_may_claim - test whether a block device can be claimed
+ * @bdev: block device of interest
+ * @whole: whole block device containing @bdev, may equal @bdev
+ * @holder: holder trying to claim @bdev
+ *
+ * Test whether @bdev can be claimed by @holder.
+ *
+ * CONTEXT:
+ * spin_lock(&bdev_lock).
+ *
+ * RETURNS:
+ * %true if @bdev can be claimed, %false otherwise.
+ */
+static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
+			 void *holder)
 {
 {
-	int res;
-	spin_lock(&bdev_lock);
-
-	/* first decide result */
 	if (bdev->bd_holder == holder)
 	if (bdev->bd_holder == holder)
-		res = 0;	 /* already a holder */
+		return true;	 /* already a holder */
 	else if (bdev->bd_holder != NULL)
 	else if (bdev->bd_holder != NULL)
-		res = -EBUSY; 	 /* held by someone else */
+		return false; 	 /* held by someone else */
 	else if (bdev->bd_contains == bdev)
 	else if (bdev->bd_contains == bdev)
-		res = 0;  	 /* is a whole device which isn't held */
+		return true;  	 /* is a whole device which isn't held */
 
 
-	else if (bdev->bd_contains->bd_holder == bd_claim)
-		res = 0; 	 /* is a partition of a device that is being partitioned */
-	else if (bdev->bd_contains->bd_holder != NULL)
-		res = -EBUSY;	 /* is a partition of a held device */
+	else if (whole->bd_holder == bd_claim)
+		return true; 	 /* is a partition of a device that is being partitioned */
+	else if (whole->bd_holder != NULL)
+		return false;	 /* is a partition of a held device */
 	else
 	else
-		res = 0;	 /* is a partition of an un-held device */
+		return true;	 /* is a partition of an un-held device */
+}
+
+/**
+ * bd_prepare_to_claim - prepare to claim a block device
+ * @bdev: block device of interest
+ * @whole: the whole device containing @bdev, may equal @bdev
+ * @holder: holder trying to claim @bdev
+ *
+ * Prepare to claim @bdev.  This function fails if @bdev is already
+ * claimed by another holder and waits if another claiming is in
+ * progress.  This function doesn't actually claim.  On successful
+ * return, the caller has ownership of bd_claiming and bd_holder[s].
+ *
+ * CONTEXT:
+ * spin_lock(&bdev_lock).  Might release bdev_lock, sleep and regrab
+ * it multiple times.
+ *
+ * RETURNS:
+ * 0 if @bdev can be claimed, -EBUSY otherwise.
+ */
+static int bd_prepare_to_claim(struct block_device *bdev,
+			       struct block_device *whole, void *holder)
+{
+retry:
+	/* if someone else claimed, fail */
+	if (!bd_may_claim(bdev, whole, holder))
+		return -EBUSY;
+
+	/* if someone else is claiming, wait for it to finish */
+	if (whole->bd_claiming && whole->bd_claiming != holder) {
+		wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
+		DEFINE_WAIT(wait);
+
+		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock(&bdev_lock);
+		schedule();
+		finish_wait(wq, &wait);
+		spin_lock(&bdev_lock);
+		goto retry;
+	}
+
+	/* yay, all mine */
+	return 0;
+}
+
+/**
+ * bd_start_claiming - start claiming a block device
+ * @bdev: block device of interest
+ * @holder: holder trying to claim @bdev
+ *
+ * @bdev is about to be opened exclusively.  Check @bdev can be opened
+ * exclusively and mark that an exclusive open is in progress.  Each
+ * successful call to this function must be matched with a call to
+ * either bd_claim() or bd_abort_claiming().  If this function
+ * succeeds, the matching bd_claim() is guaranteed to succeed.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Pointer to the block device containing @bdev on success, ERR_PTR()
+ * value on failure.
+ */
+static struct block_device *bd_start_claiming(struct block_device *bdev,
+					      void *holder)
+{
+	struct gendisk *disk;
+	struct block_device *whole;
+	int partno, err;
+
+	might_sleep();
+
+	/*
+	 * @bdev might not have been initialized properly yet, look up
+	 * and grab the outer block device the hard way.
+	 */
+	disk = get_gendisk(bdev->bd_dev, &partno);
+	if (!disk)
+		return ERR_PTR(-ENXIO);
+
+	whole = bdget_disk(disk, 0);
+	put_disk(disk);
+	if (!whole)
+		return ERR_PTR(-ENOMEM);
+
+	/* prepare to claim, if successful, mark claiming in progress */
+	spin_lock(&bdev_lock);
+
+	err = bd_prepare_to_claim(bdev, whole, holder);
+	if (err == 0) {
+		whole->bd_claiming = holder;
+		spin_unlock(&bdev_lock);
+		return whole;
+	} else {
+		spin_unlock(&bdev_lock);
+		bdput(whole);
+		return ERR_PTR(err);
+	}
+}
 
 
-	/* now impose change */
-	if (res==0) {
+/* releases bdev_lock */
+static void __bd_abort_claiming(struct block_device *whole, void *holder)
+{
+	BUG_ON(whole->bd_claiming != holder);
+	whole->bd_claiming = NULL;
+	wake_up_bit(&whole->bd_claiming, 0);
+
+	spin_unlock(&bdev_lock);
+	bdput(whole);
+}
+
+/**
+ * bd_abort_claiming - abort claiming a block device
+ * @whole: whole block device returned by bd_start_claiming()
+ * @holder: holder trying to claim @bdev
+ *
+ * Abort a claiming block started by bd_start_claiming().  Note that
+ * @whole is not the block device to be claimed but the whole device
+ * returned by bd_start_claiming().
+ *
+ * CONTEXT:
+ * Grabs and releases bdev_lock.
+ */
+static void bd_abort_claiming(struct block_device *whole, void *holder)
+{
+	spin_lock(&bdev_lock);
+	__bd_abort_claiming(whole, holder);		/* releases bdev_lock */
+}
+
+/**
+ * bd_claim - claim a block device
+ * @bdev: block device to claim
+ * @holder: holder trying to claim @bdev
+ *
+ * Try to claim @bdev which must have been opened successfully.  This
+ * function may be called with or without preceding
+ * bd_start_claiming().  In the former case, this function is always
+ * successful and terminates the claiming block.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * 0 if successful, -EBUSY if @bdev is already claimed.
+ */
+int bd_claim(struct block_device *bdev, void *holder)
+{
+	struct block_device *whole = bdev->bd_contains;
+	int res;
+
+	might_sleep();
+
+	spin_lock(&bdev_lock);
+
+	res = bd_prepare_to_claim(bdev, whole, holder);
+	if (res == 0) {
 		/* note that for a whole device bd_holders
 		/* note that for a whole device bd_holders
 		 * will be incremented twice, and bd_holder will
 		 * will be incremented twice, and bd_holder will
 		 * be set to bd_claim before being set to holder
 		 * be set to bd_claim before being set to holder
 		 */
 		 */
-		bdev->bd_contains->bd_holders ++;
-		bdev->bd_contains->bd_holder = bd_claim;
+		whole->bd_holders++;
+		whole->bd_holder = bd_claim;
 		bdev->bd_holders++;
 		bdev->bd_holders++;
 		bdev->bd_holder = holder;
 		bdev->bd_holder = holder;
 	}
 	}
-	spin_unlock(&bdev_lock);
+
+	if (whole->bd_claiming)
+		__bd_abort_claiming(whole, holder);	/* releases bdev_lock */
+	else
+		spin_unlock(&bdev_lock);
+
 	return res;
 	return res;
 }
 }
-
 EXPORT_SYMBOL(bd_claim);
 EXPORT_SYMBOL(bd_claim);
 
 
 void bd_release(struct block_device *bdev)
 void bd_release(struct block_device *bdev)
@@ -1316,6 +1484,7 @@ EXPORT_SYMBOL(blkdev_get);
 
 
 static int blkdev_open(struct inode * inode, struct file * filp)
 static int blkdev_open(struct inode * inode, struct file * filp)
 {
 {
+	struct block_device *whole = NULL;
 	struct block_device *bdev;
 	struct block_device *bdev;
 	int res;
 	int res;
 
 
@@ -1338,22 +1507,25 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 	if (bdev == NULL)
 	if (bdev == NULL)
 		return -ENOMEM;
 		return -ENOMEM;
 
 
+	if (filp->f_mode & FMODE_EXCL) {
+		whole = bd_start_claiming(bdev, filp);
+		if (IS_ERR(whole)) {
+			bdput(bdev);
+			return PTR_ERR(whole);
+		}
+	}
+
 	filp->f_mapping = bdev->bd_inode->i_mapping;
 	filp->f_mapping = bdev->bd_inode->i_mapping;
 
 
 	res = blkdev_get(bdev, filp->f_mode);
 	res = blkdev_get(bdev, filp->f_mode);
-	if (res)
-		return res;
 
 
-	if (filp->f_mode & FMODE_EXCL) {
-		res = bd_claim(bdev, filp);
-		if (res)
-			goto out_blkdev_put;
+	if (whole) {
+		if (res == 0)
+			BUG_ON(bd_claim(bdev, filp) != 0);
+		else
+			bd_abort_claiming(whole, filp);
 	}
 	}
 
 
-	return 0;
-
- out_blkdev_put:
-	blkdev_put(bdev, filp->f_mode);
 	return res;
 	return res;
 }
 }
 
 
@@ -1564,27 +1736,34 @@ EXPORT_SYMBOL(lookup_bdev);
  */
  */
 struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
 struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
 {
 {
-	struct block_device *bdev;
-	int error = 0;
+	struct block_device *bdev, *whole;
+	int error;
 
 
 	bdev = lookup_bdev(path);
 	bdev = lookup_bdev(path);
 	if (IS_ERR(bdev))
 	if (IS_ERR(bdev))
 		return bdev;
 		return bdev;
 
 
+	whole = bd_start_claiming(bdev, holder);
+	if (IS_ERR(whole)) {
+		bdput(bdev);
+		return whole;
+	}
+
 	error = blkdev_get(bdev, mode);
 	error = blkdev_get(bdev, mode);
 	if (error)
 	if (error)
-		return ERR_PTR(error);
+		goto out_abort_claiming;
+
 	error = -EACCES;
 	error = -EACCES;
 	if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
 	if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
-		goto blkdev_put;
-	error = bd_claim(bdev, holder);
-	if (error)
-		goto blkdev_put;
+		goto out_blkdev_put;
 
 
+	BUG_ON(bd_claim(bdev, holder) != 0);
 	return bdev;
 	return bdev;
-	
-blkdev_put:
+
+out_blkdev_put:
 	blkdev_put(bdev, mode);
 	blkdev_put(bdev, mode);
+out_abort_claiming:
+	bd_abort_claiming(whole, holder);
 	return ERR_PTR(error);
 	return ERR_PTR(error);
 }
 }
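
Note: the claiming rework above splits exclusive opens into three steps so that a claim in progress stays visible (bd_claiming) while blkdev_get() sleeps. The sequence, restated on its own (these helpers are static to fs/block_dev.c, so this is only the in-file pattern from blkdev_open() written out, not a new exported API):

/* Sketch of the claiming sequence introduced above. */
static int open_exclusive(struct block_device *bdev, fmode_t mode, void *holder)
{
	struct block_device *whole;
	int err;

	whole = bd_start_claiming(bdev, holder);	/* marks bd_claiming */
	if (IS_ERR(whole))
		return PTR_ERR(whole);

	err = blkdev_get(bdev, mode);			/* may sleep */
	if (err) {
		bd_abort_claiming(whole, holder);	/* wake other claimers */
		return err;
	}

	/* bd_start_claiming() succeeded, so this cannot fail */
	BUG_ON(bd_claim(bdev, holder) != 0);
	return 0;
}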
 
 

+ 1 - 1
fs/btrfs/extent-tree.c

@@ -1589,7 +1589,7 @@ static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
 	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
-			     DISCARD_FL_BARRIER);
+			BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 }

 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,

+ 1 - 0
fs/buffer.c

@@ -275,6 +275,7 @@ void invalidate_bdev(struct block_device *bdev)
 		return;

 	invalidate_bh_lrus();
+	lru_add_drain_all();	/* make sure all lru add caches are flushed */
 	invalidate_mapping_pages(mapping, 0, -1);
 }
 EXPORT_SYMBOL(invalidate_bdev);

+ 2 - 1
fs/ext3/fsync.c

@@ -90,6 +90,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
 	 * storage
 	 */
 	if (needs_barrier)
-		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
+				BLKDEV_IFL_WAIT);
 	return ret;
 }

+ 4 - 2
fs/ext4/fsync.c

@@ -100,9 +100,11 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 		if (ext4_should_writeback_data(inode) &&
 		    (journal->j_fs_dev != journal->j_dev) &&
 		    (journal->j_flags & JBD2_BARRIER))
-			blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
+					NULL, BLKDEV_IFL_WAIT);
 		jbd2_log_wait_commit(journal, commit_tid);
 	} else if (journal->j_flags & JBD2_BARRIER)
-		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
+			BLKDEV_IFL_WAIT);
 	return ret;
 }
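
Note: the filesystem hunks from here on are one mechanical conversion: blkdev_issue_flush() and blkdev_issue_discard() now take a gfp mask and a flags word, and BLKDEV_IFL_WAIT makes the previously implicit "wait for completion" explicit. A small sketch of the resulting call shape ("fs_flush_dev" is a made-up helper name; the -EOPNOTSUPP handling mirrors the blkdev_fsync() hunk earlier in this diff):

/* Sketch: a helper a filesystem might use at the tail of its fsync path. */
static int fs_flush_dev(struct block_device *bdev)
{
	int error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);

	/* a device without a writeback cache is fine, not a failure */
	return error == -EOPNOTSUPP ? 0 : error;
}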

+ 5 - 0
fs/fcntl.c

@@ -14,6 +14,7 @@
 #include <linux/dnotify.h>
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/pipe_fs_i.h>
 #include <linux/security.h>
 #include <linux/ptrace.h>
 #include <linux/signal.h>
@@ -412,6 +413,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 	case F_NOTIFY:
 		err = fcntl_dirnotify(fd, filp, arg);
 		break;
+	case F_SETPIPE_SZ:
+	case F_GETPIPE_SZ:
+		err = pipe_fcntl(filp, cmd, arg);
+		break;
 	default:
 		break;
 	}
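
Note: pipe_fcntl() above is the kernel half of the new pipe-resizing interface from this merge. A userspace sketch follows; in the snapshot merged here the fcntl argument appears to still have been a page count, while the interface that shipped with 2.6.35 (and every later kernel) takes a byte count rounded up to a power-of-two number of pages, which is what the sketch assumes. The fallback constants are the standard F_LINUX_SPECIFIC_BASE+7/+8 values for when the libc headers do not yet expose them.

/* Userspace sketch: grow a pipe and read back the size the kernel chose. */
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>

#ifndef F_SETPIPE_SZ			/* not yet in older libc headers */
#define F_SETPIPE_SZ	1031		/* F_LINUX_SPECIFIC_BASE + 7 */
#define F_GETPIPE_SZ	1032		/* F_LINUX_SPECIFIC_BASE + 8 */
#endif

int main(void)
{
	int fds[2];

	if (pipe(fds) < 0) {
		perror("pipe");
		return 1;
	}
	if (fcntl(fds[1], F_SETPIPE_SZ, 1 << 20) < 0)	/* ask for 1 MiB */
		perror("F_SETPIPE_SZ");
	printf("pipe buffer is now %d bytes\n", fcntl(fds[1], F_GETPIPE_SZ));
	return 0;
}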

+ 76 - 22
fs/fs-writeback.c

@@ -45,6 +45,7 @@ struct wb_writeback_args {
 	int for_kupdate:1;
 	int for_kupdate:1;
 	int range_cyclic:1;
 	int range_cyclic:1;
 	int for_background:1;
 	int for_background:1;
+	int sb_pinned:1;
 };
 };
 
 
 /*
 /*
@@ -192,7 +193,8 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
 }
 }
 
 
 static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
 static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
-				 struct wb_writeback_args *args)
+				 struct wb_writeback_args *args,
+				 int wait)
 {
 {
 	struct bdi_work *work;
 	struct bdi_work *work;
 
 
@@ -204,6 +206,8 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
 	if (work) {
 	if (work) {
 		bdi_work_init(work, args);
 		bdi_work_init(work, args);
 		bdi_queue_work(bdi, work);
 		bdi_queue_work(bdi, work);
+		if (wait)
+			bdi_wait_on_work_clear(work);
 	} else {
 	} else {
 		struct bdi_writeback *wb = &bdi->wb;
 		struct bdi_writeback *wb = &bdi->wb;
 
 
@@ -230,6 +234,11 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
 		.sync_mode	= WB_SYNC_ALL,
 		.sync_mode	= WB_SYNC_ALL,
 		.nr_pages	= LONG_MAX,
 		.nr_pages	= LONG_MAX,
 		.range_cyclic	= 0,
 		.range_cyclic	= 0,
+		/*
+		 * Setting sb_pinned is not necessary for WB_SYNC_ALL, but
+		 * lets make it explicitly clear.
+		 */
+		.sb_pinned	= 1,
 	};
 	};
 	struct bdi_work work;
 	struct bdi_work work;
 
 
@@ -245,21 +254,23 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
  * @bdi: the backing device to write from
  * @bdi: the backing device to write from
  * @sb: write inodes from this super_block
  * @sb: write inodes from this super_block
  * @nr_pages: the number of pages to write
  * @nr_pages: the number of pages to write
+ * @sb_locked: caller already holds sb umount sem.
  *
  *
  * Description:
  * Description:
  *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
  *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
  *   started when this function returns, we make no guarentees on
  *   started when this function returns, we make no guarentees on
- *   completion. Caller need not hold sb s_umount semaphore.
+ *   completion. Caller specifies whether sb umount sem is held already or not.
  *
  *
  */
  */
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-			 long nr_pages)
+			 long nr_pages, int sb_locked)
 {
 {
 	struct wb_writeback_args args = {
 	struct wb_writeback_args args = {
 		.sb		= sb,
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_NONE,
 		.sync_mode	= WB_SYNC_NONE,
 		.nr_pages	= nr_pages,
 		.nr_pages	= nr_pages,
 		.range_cyclic	= 1,
 		.range_cyclic	= 1,
+		.sb_pinned	= sb_locked,
 	};
 	};
 
 
 	/*
 	/*
@@ -271,7 +282,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
 		args.for_background = 1;
 		args.for_background = 1;
 	}
 	}
 
 
-	bdi_alloc_queue_work(bdi, &args);
+	bdi_alloc_queue_work(bdi, &args, sb_locked);
 }
 }
 
 
 /*
 /*
@@ -452,11 +463,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 
 
 	BUG_ON(inode->i_state & I_SYNC);
 	BUG_ON(inode->i_state & I_SYNC);
 
 
-	/* Set I_SYNC, reset I_DIRTY */
-	dirty = inode->i_state & I_DIRTY;
+	/* Set I_SYNC, reset I_DIRTY_PAGES */
 	inode->i_state |= I_SYNC;
 	inode->i_state |= I_SYNC;
-	inode->i_state &= ~I_DIRTY;
-
+	inode->i_state &= ~I_DIRTY_PAGES;
 	spin_unlock(&inode_lock);
 	spin_unlock(&inode_lock);
 
 
 	ret = do_writepages(mapping, wbc);
 	ret = do_writepages(mapping, wbc);
@@ -472,6 +481,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			ret = err;
 			ret = err;
 	}
 	}
 
 
+	/*
+	 * Some filesystems may redirty the inode during the writeback
+	 * due to delalloc, clear dirty metadata flags right before
+	 * write_inode()
+	 */
+	spin_lock(&inode_lock);
+	dirty = inode->i_state & I_DIRTY;
+	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+	spin_unlock(&inode_lock);
 	/* Don't write the inode if only I_DIRTY_PAGES was set */
 	/* Don't write the inode if only I_DIRTY_PAGES was set */
 	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
 	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
 		int err = write_inode(inode, wbc);
 		int err = write_inode(inode, wbc);
@@ -577,7 +595,7 @@ static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
 	/*
 	/*
 	 * Caller must already hold the ref for this
 	 * Caller must already hold the ref for this
 	 */
 	 */
-	if (wbc->sync_mode == WB_SYNC_ALL) {
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) {
 		WARN_ON(!rwsem_is_locked(&sb->s_umount));
 		WARN_ON(!rwsem_is_locked(&sb->s_umount));
 		return SB_NOT_PINNED;
 		return SB_NOT_PINNED;
 	}
 	}
@@ -751,6 +769,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		.for_kupdate		= args->for_kupdate,
 		.for_kupdate		= args->for_kupdate,
 		.for_background		= args->for_background,
 		.for_background		= args->for_background,
 		.range_cyclic		= args->range_cyclic,
 		.range_cyclic		= args->range_cyclic,
+		.sb_pinned		= args->sb_pinned,
 	};
 	};
 	unsigned long oldest_jif;
 	unsigned long oldest_jif;
 	long wrote = 0;
 	long wrote = 0;
@@ -852,6 +871,12 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 	unsigned long expired;
 	unsigned long expired;
 	long nr_pages;
 	long nr_pages;
 
 
+	/*
+	 * When set to zero, disable periodic writeback
+	 */
+	if (!dirty_writeback_interval)
+		return 0;
+
 	expired = wb->last_old_flush +
 	expired = wb->last_old_flush +
 			msecs_to_jiffies(dirty_writeback_interval * 10);
 			msecs_to_jiffies(dirty_writeback_interval * 10);
 	if (time_before(jiffies, expired))
 	if (time_before(jiffies, expired))
@@ -887,6 +912,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 
 
 	while ((work = get_next_work_item(bdi, wb)) != NULL) {
 	while ((work = get_next_work_item(bdi, wb)) != NULL) {
 		struct wb_writeback_args args = work->args;
 		struct wb_writeback_args args = work->args;
+		int post_clear;
 
 
 		/*
 		/*
 		 * Override sync mode, in case we must wait for completion
 		 * Override sync mode, in case we must wait for completion
@@ -894,11 +920,13 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 		if (force_wait)
 		if (force_wait)
 			work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
 			work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
 
 
+		post_clear = WB_SYNC_ALL || args.sb_pinned;
+
 		/*
 		/*
 		 * If this isn't a data integrity operation, just notify
 		 * If this isn't a data integrity operation, just notify
 		 * that we have seen this work and we are now starting it.
 		 * that we have seen this work and we are now starting it.
 		 */
 		 */
-		if (args.sync_mode == WB_SYNC_NONE)
+		if (!post_clear)
 			wb_clear_pending(wb, work);
 			wb_clear_pending(wb, work);
 
 
 		wrote += wb_writeback(wb, &args);
 		wrote += wb_writeback(wb, &args);
@@ -907,7 +935,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 		 * This is a data integrity writeback, so only do the
 		 * This is a data integrity writeback, so only do the
 		 * notification when we have completed the work.
 		 * notification when we have completed the work.
 		 */
 		 */
-		if (args.sync_mode == WB_SYNC_ALL)
+		if (post_clear)
 			wb_clear_pending(wb, work);
 			wb_clear_pending(wb, work);
 	}
 	}
 
 
@@ -947,8 +975,17 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 				break;
 				break;
 		}
 		}
 
 
-		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
-		schedule_timeout_interruptible(wait_jiffies);
+		if (dirty_writeback_interval) {
+			wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
+			schedule_timeout_interruptible(wait_jiffies);
+		} else {
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (list_empty_careful(&wb->bdi->work_list) &&
+			    !kthread_should_stop())
+				schedule();
+			__set_current_state(TASK_RUNNING);
+		}
+
 		try_to_freeze();
 		try_to_freeze();
 	}
 	}
 
 
@@ -974,7 +1011,7 @@ static void bdi_writeback_all(struct super_block *sb, long nr_pages)
 		if (!bdi_has_dirty_io(bdi))
 		if (!bdi_has_dirty_io(bdi))
 			continue;
 			continue;
 
 
-		bdi_alloc_queue_work(bdi, &args);
+		bdi_alloc_queue_work(bdi, &args, 0);
 	}
 	}
 
 
 	rcu_read_unlock();
 	rcu_read_unlock();
@@ -1183,6 +1220,18 @@ static void wait_sb_inodes(struct super_block *sb)
 	iput(old_inode);
 	iput(old_inode);
 }
 }
 
 
+static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
+{
+	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+	long nr_to_write;
+
+	nr_to_write = nr_dirty + nr_unstable +
+			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+	bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked);
+}
+
 /**
 /**
  * writeback_inodes_sb	-	writeback dirty inodes from given super_block
  * writeback_inodes_sb	-	writeback dirty inodes from given super_block
  * @sb: the superblock
  * @sb: the superblock
@@ -1194,17 +1243,22 @@ static void wait_sb_inodes(struct super_block *sb)
  */
  */
 void writeback_inodes_sb(struct super_block *sb)
 void writeback_inodes_sb(struct super_block *sb)
 {
 {
-	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-	long nr_to_write;
-
-	nr_to_write = nr_dirty + nr_unstable +
-			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
-
-	bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
+	__writeback_inodes_sb(sb, 0);
 }
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
 EXPORT_SYMBOL(writeback_inodes_sb);
 
 
+/**
+ * writeback_inodes_sb_locked	- writeback dirty inodes from given super_block
+ * @sb: the superblock
+ *
+ * Like writeback_inodes_sb(), except the caller already holds the
+ * sb umount sem.
+ */
+void writeback_inodes_sb_locked(struct super_block *sb)
+{
+	__writeback_inodes_sb(sb, 1);
+}
+
 /**
 /**
  * writeback_inodes_sb_if_idle	-	start writeback if none underway
  * writeback_inodes_sb_if_idle	-	start writeback if none underway
  * @sb: the superblock
  * @sb: the superblock

+ 3 - 2
fs/gfs2/rgrp.c

@@ -854,7 +854,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 				if ((start + nr_sects) != blk) {
 					rv = blkdev_issue_discard(bdev, start,
 							    nr_sects, GFP_NOFS,
-							    DISCARD_FL_BARRIER);
+							    BLKDEV_IFL_WAIT |
+							    BLKDEV_IFL_BARRIER);
 					if (rv)
 						goto fail;
 					nr_sects = 0;
@@ -869,7 +870,7 @@ start_new_extent:
 	}
 	if (nr_sects) {
 		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
-					 DISCARD_FL_BARRIER);
+					 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 		if (rv)
 			goto fail;
 	}

+ 2 - 1
fs/jbd2/checkpoint.c

@@ -530,7 +530,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	 */
 	if ((journal->j_fs_dev != journal->j_dev) &&
 	    (journal->j_flags & JBD2_BARRIER))
-		blkdev_issue_flush(journal->j_fs_dev, NULL);
+		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
+			BLKDEV_IFL_WAIT);
 	if (!(journal->j_flags & JBD2_ABORT))
 		jbd2_journal_update_superblock(journal, 1);
 	return 0;

+ 4 - 2
fs/jbd2/commit.c

@@ -717,7 +717,8 @@ start_journal_io:
 	if (commit_transaction->t_flushed_data_blocks &&
 	    (journal->j_fs_dev != journal->j_dev) &&
 	    (journal->j_flags & JBD2_BARRIER))
-		blkdev_issue_flush(journal->j_fs_dev, NULL);
+		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
+			BLKDEV_IFL_WAIT);
 
 
 	/* Done it all: now write the commit record asynchronously. */
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -727,7 +728,8 @@ start_journal_io:
 		if (err)
 			__jbd2_journal_abort_hard(journal);
 		if (journal->j_flags & JBD2_BARRIER)
-			blkdev_issue_flush(journal->j_dev, NULL);
+			blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
+				BLKDEV_IFL_WAIT);
 	}

 	err = journal_finish_inode_data_buffers(journal, commit_transaction);

+ 2 - 2
fs/nilfs2/the_nilfs.c

@@ -674,7 +674,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
 						   start * sects_per_block,
 						   nblocks * sects_per_block,
 						   GFP_NOFS,
-						   DISCARD_FL_BARRIER);
+						   BLKDEV_IFL_BARRIER);
 			if (ret < 0)
 				return ret;
 			nblocks = 0;
@@ -684,7 +684,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
 		ret = blkdev_issue_discard(nilfs->ns_bdev,
 					   start * sects_per_block,
 					   nblocks * sects_per_block,
-					   GFP_NOFS, DISCARD_FL_BARRIER);
+					   GFP_NOFS, BLKDEV_IFL_BARRIER);
 	return ret;
 }
 
 

+ 32 - 36
fs/partitions/acorn.c
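
Note: the fs/partitions/ hunks below (acorn.c here, plus the other parsers in the diffstat) are one refactor: each parser now receives only struct parsed_partitions *state, which carries the block device, and reads sectors through a small wrapper instead of calling read_dev_sector(bdev, ...) directly. The wrapper added to fs/partitions/check.h is not shown in this excerpt; judging from the converted call sites below it presumably reduces to:

/* Sketch of the helper the converted call sites rely on; the real
 * declaration lives in fs/partitions/check.h in this series. */
static inline unsigned char *read_part_sector(struct parsed_partitions *state,
					      sector_t n, Sector *p)
{
	return read_dev_sector(state->bdev, n, p);
}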

@@ -70,14 +70,14 @@ struct riscix_record {
 
 
 #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
 #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
 	defined(CONFIG_ACORN_PARTITION_ADFS)
 	defined(CONFIG_ACORN_PARTITION_ADFS)
-static int
-riscix_partition(struct parsed_partitions *state, struct block_device *bdev,
-		unsigned long first_sect, int slot, unsigned long nr_sects)
+static int riscix_partition(struct parsed_partitions *state,
+			    unsigned long first_sect, int slot,
+			    unsigned long nr_sects)
 {
 {
 	Sector sect;
 	Sector sect;
 	struct riscix_record *rr;
 	struct riscix_record *rr;
 	
 	
-	rr = (struct riscix_record *)read_dev_sector(bdev, first_sect, &sect);
+	rr = read_part_sector(state, first_sect, &sect);
 	if (!rr)
 	if (!rr)
 		return -1;
 		return -1;
 
 
@@ -123,9 +123,9 @@ struct linux_part {
 
 
 #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
 #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
 	defined(CONFIG_ACORN_PARTITION_ADFS)
 	defined(CONFIG_ACORN_PARTITION_ADFS)
-static int
-linux_partition(struct parsed_partitions *state, struct block_device *bdev,
-		unsigned long first_sect, int slot, unsigned long nr_sects)
+static int linux_partition(struct parsed_partitions *state,
+			   unsigned long first_sect, int slot,
+			   unsigned long nr_sects)
 {
 {
 	Sector sect;
 	Sector sect;
 	struct linux_part *linuxp;
 	struct linux_part *linuxp;
@@ -135,7 +135,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
 
 
 	put_partition(state, slot++, first_sect, size);
 	put_partition(state, slot++, first_sect, size);
 
 
-	linuxp = (struct linux_part *)read_dev_sector(bdev, first_sect, &sect);
+	linuxp = read_part_sector(state, first_sect, &sect);
 	if (!linuxp)
 		return -1;
 
@@ -157,8 +157,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
 #endif
 
 #ifdef CONFIG_ACORN_PARTITION_CUMANA
-int
-adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_CUMANA(struct parsed_partitions *state)
 {
 	unsigned long first_sector = 0;
 	unsigned int start_blk = 0;
@@ -185,7 +184,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
 		struct adfs_discrecord *dr;
 		unsigned int nr_sects;
 
-		data = read_dev_sector(bdev, start_blk * 2 + 6, &sect);
+		data = read_part_sector(state, start_blk * 2 + 6, &sect);
 		if (!data)
 			return -1;
 
@@ -217,14 +216,14 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
 #ifdef CONFIG_ACORN_PARTITION_RISCIX
 		case PARTITION_RISCIX_SCSI:
 			/* RISCiX - we don't know how to find the next one. */
-			slot = riscix_partition(state, bdev, first_sector,
-						 slot, nr_sects);
+			slot = riscix_partition(state, first_sector, slot,
+						nr_sects);
 			break;
 #endif
 
 		case PARTITION_LINUX:
-			slot = linux_partition(state, bdev, first_sector,
-						slot, nr_sects);
+			slot = linux_partition(state, first_sector, slot,
+					       nr_sects);
 			break;
 		}
 		put_dev_sector(sect);
@@ -249,8 +248,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
  *	    hda1 = ADFS partition on first drive.
  *	    hda2 = non-ADFS partition.
  */
-int
-adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_ADFS(struct parsed_partitions *state)
 {
 	unsigned long start_sect, nr_sects, sectscyl, heads;
 	Sector sect;
@@ -259,7 +257,7 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
 	unsigned char id;
 	int slot = 1;
 
-	data = read_dev_sector(bdev, 6, &sect);
+	data = read_part_sector(state, 6, &sect);
 	if (!data)
 		return -1;
 
@@ -278,21 +276,21 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
 	/*
 	 * Work out start of non-adfs partition.
 	 */
-	nr_sects = (bdev->bd_inode->i_size >> 9) - start_sect;
+	nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect;
 
 	if (start_sect) {
 		switch (id) {
 #ifdef CONFIG_ACORN_PARTITION_RISCIX
 		case PARTITION_RISCIX_SCSI:
 		case PARTITION_RISCIX_MFM:
-			slot = riscix_partition(state, bdev, start_sect,
-						 slot, nr_sects);
+			slot = riscix_partition(state, start_sect, slot,
+						nr_sects);
 			break;
 #endif
 
 		case PARTITION_LINUX:
-			slot = linux_partition(state, bdev, start_sect,
-						slot, nr_sects);
+			slot = linux_partition(state, start_sect, slot,
+					       nr_sects);
 			break;
 		}
 	}
@@ -308,10 +306,11 @@ struct ics_part {
 	__le32 size;
 };
 
-static int adfspart_check_ICSLinux(struct block_device *bdev, unsigned long block)
+static int adfspart_check_ICSLinux(struct parsed_partitions *state,
+				   unsigned long block)
 {
 	Sector sect;
-	unsigned char *data = read_dev_sector(bdev, block, &sect);
+	unsigned char *data = read_part_sector(state, block, &sect);
 	int result = 0;
 
 	if (data) {
@@ -349,8 +348,7 @@ static inline int valid_ics_sector(const unsigned char *data)
  *	    hda2 = ADFS partition 1 on first drive.
  *		..etc..
  */
-int
-adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_ICS(struct parsed_partitions *state)
 {
 	const unsigned char *data;
 	const struct ics_part *p;
@@ -360,7 +358,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
 	/*
 	 * Try ICS style partitions - sector 0 contains partition info.
 	 */
-	data = read_dev_sector(bdev, 0, &sect);
+	data = read_part_sector(state, 0, &sect);
 	if (!data)
 	    	return -1;
 
@@ -392,7 +390,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
 			 * partition is.  We must not make this visible
 			 * to the filesystem.
 			 */
-			if (size > 1 && adfspart_check_ICSLinux(bdev, start)) {
+			if (size > 1 && adfspart_check_ICSLinux(state, start)) {
 				start += 1;
 				size -= 1;
 			}
@@ -446,8 +444,7 @@ static inline int valid_ptec_sector(const unsigned char *data)
  *	    hda2 = ADFS partition 1 on first drive.
  *		..etc..
  */
-int
-adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_POWERTEC(struct parsed_partitions *state)
 {
 	Sector sect;
 	const unsigned char *data;
@@ -455,7 +452,7 @@ adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bd
 	int slot = 1;
 	int i;
 
-	data = read_dev_sector(bdev, 0, &sect);
+	data = read_part_sector(state, 0, &sect);
 	if (!data)
 		return -1;
 
@@ -508,8 +505,7 @@ static const char eesox_name[] = {
  *  1. The individual ADFS boot block entries that are placed on the disk.
  *  2. The start address of the next entry.
  */
-int
-adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_EESOX(struct parsed_partitions *state)
 {
 	Sector sect;
 	const unsigned char *data;
@@ -518,7 +514,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
 	sector_t start = 0;
 	int i, slot = 1;
 
-	data = read_dev_sector(bdev, 7, &sect);
+	data = read_part_sector(state, 7, &sect);
 	if (!data)
 		return -1;
 
@@ -545,7 +541,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
 	if (i != 0) {
 		sector_t size;
 
-		size = get_capacity(bdev->bd_disk);
+		size = get_capacity(state->bdev->bd_disk);
 		put_partition(state, slot++, start, size - start);
 		printk("\n");
 	}

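The acorn conversion above is the pattern repeated for every parser in this series: the separate struct block_device argument goes away, and anything that still needs the device (its size, name or capacity) reaches it through state->bdev. A hypothetical helper (sketch only) computing the device size the same way the converted code does:

    #include "check.h"

    /* Sketch: device size in 512-byte sectors via the new state->bdev
     * back-pointer, as the converted acorn code computes nr_sects. */
    static inline sector_t example_device_sectors(struct parsed_partitions *state)
    {
            return state->bdev->bd_inode->i_size >> 9;
    }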
+ 5 - 5
fs/partitions/acorn.h

@@ -7,8 +7,8 @@
  *  format, and everyone stick to it?
  */
 
-int adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev);
+int adfspart_check_CUMANA(struct parsed_partitions *state);
+int adfspart_check_ADFS(struct parsed_partitions *state);
+int adfspart_check_ICS(struct parsed_partitions *state);
+int adfspart_check_POWERTEC(struct parsed_partitions *state);
+int adfspart_check_EESOX(struct parsed_partitions *state);

+ 6 - 7
fs/partitions/amiga.c

@@ -23,8 +23,7 @@ checksum_block(__be32 *m, int size)
 	return sum;
 }
 
-int
-amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
+int amiga_partition(struct parsed_partitions *state)
 {
 	Sector sect;
 	unsigned char *data;
@@ -38,11 +37,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
 	for (blk = 0; ; blk++, put_dev_sector(sect)) {
 		if (blk == RDB_ALLOCATION_LIMIT)
 			goto rdb_done;
-		data = read_dev_sector(bdev, blk, &sect);
+		data = read_part_sector(state, blk, &sect);
 		if (!data) {
 			if (warn_no_part)
 				printk("Dev %s: unable to read RDB block %d\n",
-				       bdevname(bdev, b), blk);
+				       bdevname(state->bdev, b), blk);
 			res = -1;
 			goto rdb_done;
 		}
@@ -64,7 +63,7 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
 		}
 
 		printk("Dev %s: RDB in block %d has bad checksum\n",
-			       bdevname(bdev, b), blk);
+		       bdevname(state->bdev, b), blk);
 	}
 
 	/* blksize is blocks per 512 byte standard block */
@@ -75,11 +74,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
 	put_dev_sector(sect);
 	for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
 		blk *= blksize;	/* Read in terms partition table understands */
-		data = read_dev_sector(bdev, blk, &sect);
+		data = read_part_sector(state, blk, &sect);
 		if (!data) {
 			if (warn_no_part)
 				printk("Dev %s: unable to read partition block %d\n",
-				       bdevname(bdev, b), blk);
+				       bdevname(state->bdev, b), blk);
 			res = -1;
 			goto rdb_done;
 		}

+ 1 - 1
fs/partitions/amiga.h

@@ -2,5 +2,5 @@
  *  fs/partitions/amiga.h
  */
 
-int amiga_partition(struct parsed_partitions *state, struct block_device *bdev);
+int amiga_partition(struct parsed_partitions *state);
 

+ 4 - 4
fs/partitions/atari.c

@@ -30,7 +30,7 @@ static inline int OK_id(char *s)
 		memcmp (s, "RAW", 3) == 0 ;
 }
 
-int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
+int atari_partition(struct parsed_partitions *state)
 {
 	Sector sect;
 	struct rootsector *rs;
@@ -42,12 +42,12 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
 	int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */
 #endif
 
-	rs = (struct rootsector *) read_dev_sector(bdev, 0, &sect);
+	rs = read_part_sector(state, 0, &sect);
 	if (!rs)
 		return -1;
 
 	/* Verify this is an Atari rootsector: */
-	hd_size = bdev->bd_inode->i_size >> 9;
+	hd_size = state->bdev->bd_inode->i_size >> 9;
 	if (!VALID_PARTITION(&rs->part[0], hd_size) &&
 	    !VALID_PARTITION(&rs->part[1], hd_size) &&
 	    !VALID_PARTITION(&rs->part[2], hd_size) &&
@@ -84,7 +84,7 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
 		printk(" XGM<");
 		partsect = extensect = be32_to_cpu(pi->st);
 		while (1) {
-			xrs = (struct rootsector *)read_dev_sector(bdev, partsect, &sect2);
+			xrs = read_part_sector(state, partsect, &sect2);
 			if (!xrs) {
 				printk (" block %ld read failed\n", partsect);
 				put_dev_sector(sect);

+ 1 - 1
fs/partitions/atari.h

@@ -31,4 +31,4 @@ struct rootsector
   u16 checksum;			/* checksum for bootable disks */
 } __attribute__((__packed__));
 
-int atari_partition(struct parsed_partitions *state, struct block_device *bdev);
+int atari_partition(struct parsed_partitions *state);

+ 61 - 23
fs/partitions/check.c

@@ -45,7 +45,7 @@ extern void md_autodetect_dev(dev_t dev);
 
 int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/
 
-static int (*check_part[])(struct parsed_partitions *, struct block_device *) = {
+static int (*check_part[])(struct parsed_partitions *) = {
 	/*
 	 * Probe partition formats with tables at disk address 0
 	 * that also have an ADFS boot block at 0xdc0.
@@ -161,10 +161,11 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	struct parsed_partitions *state;
 	int i, res, err;
 
-	state = kmalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
+	state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
 	if (!state)
 		return NULL;
 
+	state->bdev = bdev;
 	disk_name(hd, 0, state->name);
 	printk(KERN_INFO " %s:", state->name);
 	if (isdigit(state->name[strlen(state->name)-1]))
@@ -174,7 +175,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	i = res = err = 0;
 	while (!res && check_part[i]) {
 		memset(&state->parts, 0, sizeof(state->parts));
-		res = check_part[i++](state, bdev);
+		res = check_part[i++](state);
 		if (res < 0) {
 			/* We have hit an I/O error which we don't report now.
 		 	* But record it, and let the others do their job.
@@ -186,6 +187,8 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	}
 	if (res > 0)
 		return state;
+	if (state->access_beyond_eod)
+		err = -ENOSPC;
 	if (err)
 	/* The partition is unrecognized. So report I/O errors if there were any */
 		res = err;
@@ -538,12 +541,33 @@ exit:
 	disk_part_iter_exit(&piter);
 }
 
+static bool disk_unlock_native_capacity(struct gendisk *disk)
+{
+	const struct block_device_operations *bdops = disk->fops;
+
+	if (bdops->unlock_native_capacity &&
+	    !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
+		printk(KERN_CONT "enabling native capacity\n");
+		bdops->unlock_native_capacity(disk);
+		disk->flags |= GENHD_FL_NATIVE_CAPACITY;
+		return true;
+	} else {
+		printk(KERN_CONT "truncated\n");
+		return false;
+	}
+}
+
 int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 {
+	struct parsed_partitions *state = NULL;
 	struct disk_part_iter piter;
 	struct hd_struct *part;
-	struct parsed_partitions *state;
 	int p, highest, res;
+rescan:
+	if (state && !IS_ERR(state)) {
+		kfree(state);
+		state = NULL;
+	}
 
 	if (bdev->bd_part_count)
 		return -EBUSY;
@@ -562,8 +586,32 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	bdev->bd_invalidated = 0;
 	if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
 		return 0;
-	if (IS_ERR(state))	/* I/O error reading the partition table */
+	if (IS_ERR(state)) {
+		/*
+		 * I/O error reading the partition table.  If any
+		 * partition code tried to read beyond EOD, retry
+		 * after unlocking native capacity.
+		 */
+		if (PTR_ERR(state) == -ENOSPC) {
+			printk(KERN_WARNING "%s: partition table beyond EOD, ",
+			       disk->disk_name);
+			if (disk_unlock_native_capacity(disk))
+				goto rescan;
+		}
 		return -EIO;
+	}
+	/*
+	 * If any partition code tried to read beyond EOD, try
+	 * unlocking native capacity even if partition table is
+	 * sucessfully read as we could be missing some partitions.
+	 */
+	if (state->access_beyond_eod) {
+		printk(KERN_WARNING
+		       "%s: partition table partially beyond EOD, ",
+		       disk->disk_name);
+		if (disk_unlock_native_capacity(disk))
+			goto rescan;
+	}
 
 	/* tell userspace that the media / partition table may have changed */
 	kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
@@ -581,7 +629,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	/* add partitions */
 	for (p = 1; p < state->limit; p++) {
 		sector_t size, from;
-try_scan:
+
 		size = state->parts[p].size;
 		if (!size)
 			continue;
@@ -589,30 +637,21 @@ try_scan:
 		from = state->parts[p].from;
 		if (from >= get_capacity(disk)) {
 			printk(KERN_WARNING
-			       "%s: p%d ignored, start %llu is behind the end of the disk\n",
+			       "%s: p%d start %llu is beyond EOD, ",
 			       disk->disk_name, p, (unsigned long long) from);
+			if (disk_unlock_native_capacity(disk))
+				goto rescan;
 			continue;
 		}
 
 		if (from + size > get_capacity(disk)) {
-			const struct block_device_operations *bdops = disk->fops;
-			unsigned long long capacity;
-
 			printk(KERN_WARNING
-			       "%s: p%d size %llu exceeds device capacity, ",
+			       "%s: p%d size %llu extends beyond EOD, ",
 			       disk->disk_name, p, (unsigned long long) size);
 
-			if (bdops->set_capacity &&
-			    (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) {
-				printk(KERN_CONT "enabling native capacity\n");
-				capacity = bdops->set_capacity(disk, ~0ULL);
-				disk->flags |= GENHD_FL_NATIVE_CAPACITY;
-				if (capacity > get_capacity(disk)) {
-					set_capacity(disk, capacity);
-					check_disk_size_change(disk, bdev);
-					bdev->bd_invalidated = 0;
-				}
-				goto try_scan;
+			if (disk_unlock_native_capacity(disk)) {
+				/* free state and restart */
+				goto rescan;
 			} else {
 				/*
 				 * we can not ignore partitions of broken tables
@@ -620,7 +659,6 @@ try_scan:
 				 * we limit them to the end of the disk to avoid
 				 * creating invalid block devices
 				 */
-				printk(KERN_CONT "limited to end of disk\n");
 				size = get_capacity(disk) - from;
 			}
 		}

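With the check.c changes above, every over-the-end condition is funnelled through disk_unlock_native_capacity() and the whole scan is restarted, so a driver only has to provide one hook. A driver-side stub (purely illustrative; the real conversions live in the ide patches elsewhere in this merge) would be wired up roughly like this:

    #include <linux/module.h>
    #include <linux/blkdev.h>
    #include <linux/genhd.h>

    /* Sketch: ask the hardware to stop hiding sectors; rescan_partitions()
     * above restarts its scan after calling this. */
    static void example_unlock_native_capacity(struct gendisk *disk)
    {
            /* e.g. disable a host protected area on the device */
    }

    static const struct block_device_operations example_fops = {
            .owner                  = THIS_MODULE,
            .unlock_native_capacity = example_unlock_native_capacity,
    };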
+ 12 - 0
fs/partitions/check.h

@@ -6,6 +6,7 @@
  * description.
  */
 struct parsed_partitions {
+	struct block_device *bdev;
 	char name[BDEVNAME_SIZE];
 	struct {
 		sector_t from;
@@ -14,8 +15,19 @@ struct parsed_partitions {
 	} parts[DISK_MAX_PARTS];
 	int next;
 	int limit;
+	bool access_beyond_eod;
 };
 
+static inline void *read_part_sector(struct parsed_partitions *state,
+				     sector_t n, Sector *p)
+{
+	if (n >= get_capacity(state->bdev->bd_disk)) {
+		state->access_beyond_eod = true;
+		return NULL;
+	}
+	return read_dev_sector(state->bdev, n, p);
+}
+
 static inline void
 put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
 {

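read_part_sector() above is the single entry point all of the parsers below are converted to; a read past the end of the device is recorded in state->access_beyond_eod rather than failing silently, which is what drives the unlock-and-rescan logic in check.c. A minimal detector written against it (sketch only; the name and return values just follow the check_part[] convention, this is not a real parser):

    #include "check.h"

    /* Sketch: read sector 0 through the partition-check state; return
     * -1 on I/O error, 0 for "not this format", 1 if partitions were added. */
    static int example_partition(struct parsed_partitions *state)
    {
            Sector sect;
            unsigned char *data = read_part_sector(state, 0, &sect);

            if (!data)
                    return -1;
            /* ... inspect the on-disk label here ... */
            put_dev_sector(sect);
            return 0;
    }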
+ 43 - 48
fs/partitions/efi.c

@@ -140,8 +140,7 @@ efi_crc32(const void *buf, unsigned long len)
  *  the part[0] entry for this disk, and is the number of
  *  physical sectors available on the disk.
  */
-static u64
-last_lba(struct block_device *bdev)
+static u64 last_lba(struct block_device *bdev)
 {
 	if (!bdev || !bdev->bd_inode)
 		return 0;
@@ -181,27 +180,28 @@ is_pmbr_valid(legacy_mbr *mbr)
 
 /**
  * read_lba(): Read bytes from disk, starting at given LBA
- * @bdev
+ * @state
  * @lba
  * @buffer
  * @size_t
  *
- * Description:  Reads @count bytes from @bdev into @buffer.
+ * Description: Reads @count bytes from @state->bdev into @buffer.
  * Returns number of bytes read on success, 0 on error.
  */
-static size_t
-read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
+static size_t read_lba(struct parsed_partitions *state,
+		       u64 lba, u8 *buffer, size_t count)
 {
 	size_t totalreadcount = 0;
+	struct block_device *bdev = state->bdev;
 	sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
 
-	if (!bdev || !buffer || lba > last_lba(bdev))
+	if (!buffer || lba > last_lba(bdev))
                 return 0;
 
 	while (count) {
 		int copied = 512;
 		Sector sect;
-		unsigned char *data = read_dev_sector(bdev, n++, &sect);
+		unsigned char *data = read_part_sector(state, n++, &sect);
 		if (!data)
 			break;
 		if (copied > count)
@@ -217,19 +217,20 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
 
 /**
  * alloc_read_gpt_entries(): reads partition entries from disk
- * @bdev
+ * @state
  * @gpt - GPT header
  * 
  * Description: Returns ptes on success,  NULL on error.
  * Allocates space for PTEs based on information found in @gpt.
  * Notes: remember to free pte when you're done!
  */
-static gpt_entry *
-alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
+static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
+					 gpt_header *gpt)
 {
 	size_t count;
 	gpt_entry *pte;
-	if (!bdev || !gpt)
+
+	if (!gpt)
 		return NULL;
 
 	count = le32_to_cpu(gpt->num_partition_entries) *
@@ -240,7 +241,7 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
 	if (!pte)
 		return NULL;
 
-	if (read_lba(bdev, le64_to_cpu(gpt->partition_entry_lba),
+	if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba),
                      (u8 *) pte,
 		     count) < count) {
 		kfree(pte);
@@ -252,27 +253,24 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
 
 /**
  * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
- * @bdev
+ * @state
  * @lba is the Logical Block Address of the partition table
  * 
  * Description: returns GPT header on success, NULL on error.   Allocates
- * and fills a GPT header starting at @ from @bdev.
+ * and fills a GPT header starting at @ from @state->bdev.
  * Note: remember to free gpt when finished with it.
  */
-static gpt_header *
-alloc_read_gpt_header(struct block_device *bdev, u64 lba)
+static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
+					 u64 lba)
 {
 	gpt_header *gpt;
-	unsigned ssz = bdev_logical_block_size(bdev);
-
-	if (!bdev)
-		return NULL;
+	unsigned ssz = bdev_logical_block_size(state->bdev);
 
 	gpt = kzalloc(ssz, GFP_KERNEL);
 	if (!gpt)
 		return NULL;
 
-	if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) {
+	if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) {
 		kfree(gpt);
                 gpt=NULL;
 		return NULL;
@@ -283,7 +281,7 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
 
 /**
  * is_gpt_valid() - tests one GPT header and PTEs for validity
- * @bdev
+ * @state
  * @lba is the logical block address of the GPT header to test
  * @gpt is a GPT header ptr, filled on return.
  * @ptes is a PTEs ptr, filled on return.
@@ -291,16 +289,15 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
  * Description: returns 1 if valid,  0 on error.
  * If valid, returns pointers to newly allocated GPT header and PTEs.
  */
-static int
-is_gpt_valid(struct block_device *bdev, u64 lba,
-	     gpt_header **gpt, gpt_entry **ptes)
+static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
+			gpt_header **gpt, gpt_entry **ptes)
 {
 	u32 crc, origcrc;
 	u64 lastlba;
 
-	if (!bdev || !gpt || !ptes)
+	if (!ptes)
 		return 0;
-	if (!(*gpt = alloc_read_gpt_header(bdev, lba)))
+	if (!(*gpt = alloc_read_gpt_header(state, lba)))
 		return 0;
 
 	/* Check the GUID Partition Table signature */
@@ -336,7 +333,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
 	/* Check the first_usable_lba and last_usable_lba are
 	 * within the disk.
 	 */
-	lastlba = last_lba(bdev);
+	lastlba = last_lba(state->bdev);
 	if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
 		pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
 			 (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
@@ -350,7 +347,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
 		goto fail;
 	}
 
-	if (!(*ptes = alloc_read_gpt_entries(bdev, *gpt)))
+	if (!(*ptes = alloc_read_gpt_entries(state, *gpt)))
 		goto fail;
 
 	/* Check the GUID Partition Entry Array CRC */
@@ -495,7 +492,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
 
 /**
  * find_valid_gpt() - Search disk for valid GPT headers and PTEs
- * @bdev
+ * @state
  * @gpt is a GPT header ptr, filled on return.
  * @ptes is a PTEs ptr, filled on return.
  * Description: Returns 1 if valid, 0 on error.
@@ -508,24 +505,25 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
  * This protects against devices which misreport their size, and forces
  * the user to decide to use the Alternate GPT.
  */
-static int
-find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
+static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
+			  gpt_entry **ptes)
 {
 	int good_pgpt = 0, good_agpt = 0, good_pmbr = 0;
 	gpt_header *pgpt = NULL, *agpt = NULL;
 	gpt_entry *pptes = NULL, *aptes = NULL;
 	legacy_mbr *legacymbr;
 	u64 lastlba;
-	if (!bdev || !gpt || !ptes)
+
+	if (!ptes)
 		return 0;
 
-	lastlba = last_lba(bdev);
+	lastlba = last_lba(state->bdev);
         if (!force_gpt) {
                 /* This will be added to the EFI Spec. per Intel after v1.02. */
                 legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL);
                 if (legacymbr) {
-                        read_lba(bdev, 0, (u8 *) legacymbr,
-                                 sizeof (*legacymbr));
+                        read_lba(state, 0, (u8 *) legacymbr,
+				 sizeof (*legacymbr));
                         good_pmbr = is_pmbr_valid(legacymbr);
                         kfree(legacymbr);
                 }
@@ -533,15 +531,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
                         goto fail;
         }
 
-	good_pgpt = is_gpt_valid(bdev, GPT_PRIMARY_PARTITION_TABLE_LBA,
+	good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA,
 				 &pgpt, &pptes);
         if (good_pgpt)
-		good_agpt = is_gpt_valid(bdev,
+		good_agpt = is_gpt_valid(state,
 					 le64_to_cpu(pgpt->alternate_lba),
 					 &agpt, &aptes);
         if (!good_agpt && force_gpt)
-                good_agpt = is_gpt_valid(bdev, lastlba,
-                                         &agpt, &aptes);
+                good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes);
 
         /* The obviously unsuccessful case */
         if (!good_pgpt && !good_agpt)
@@ -583,9 +580,8 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
 }
 
 /**
- * efi_partition(struct parsed_partitions *state, struct block_device *bdev)
+ * efi_partition(struct parsed_partitions *state)
  * @state
- * @bdev
  *
  * Description: called from check.c, if the disk contains GPT
  * partitions, sets up partition entries in the kernel.
@@ -602,15 +598,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
  *  1 if successful
  *
  */
-int
-efi_partition(struct parsed_partitions *state, struct block_device *bdev)
+int efi_partition(struct parsed_partitions *state)
 {
 	gpt_header *gpt = NULL;
 	gpt_entry *ptes = NULL;
 	u32 i;
-	unsigned ssz = bdev_logical_block_size(bdev) / 512;
+	unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
 
-	if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) {
+	if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
 		kfree(gpt);
 		kfree(ptes);
 		return 0;
@@ -623,7 +618,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
 		u64 size = le64_to_cpu(ptes[i].ending_lba) -
 			   le64_to_cpu(ptes[i].starting_lba) + 1ULL;
 
-		if (!is_pte_valid(&ptes[i], last_lba(bdev)))
+		if (!is_pte_valid(&ptes[i], last_lba(state->bdev)))
 			continue;
 
 		put_partition(state, i+1, start * ssz, size * ssz);

+ 1 - 1
fs/partitions/efi.h

@@ -110,7 +110,7 @@ typedef struct _legacy_mbr {
 } __attribute__ ((packed)) legacy_mbr;
 
 /* Functions */
-extern int efi_partition(struct parsed_partitions *state, struct block_device *bdev);
+extern int efi_partition(struct parsed_partitions *state);
 
 #endif
 

+ 10 - 11
fs/partitions/ibm.c

@@ -58,9 +58,9 @@ cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) {
 
 /*
  */
-int
-ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
+int ibm_partition(struct parsed_partitions *state)
 {
+	struct block_device *bdev = state->bdev;
 	int blocksize, res;
 	loff_t i_size, offset, size, fmt_size;
 	dasd_information2_t *info;
@@ -100,7 +100,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	/*
 	 * Get volume label, extract name and type.
 	 */
-	data = read_dev_sector(bdev, info->label_block*(blocksize/512), &sect);
+	data = read_part_sector(state, info->label_block*(blocksize/512),
+				&sect);
 	if (data == NULL)
 		goto out_readerr;
 
@@ -193,8 +194,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 			 */
 			blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
 			counter = 0;
-			data = read_dev_sector(bdev, blk * (blocksize/512),
-					       &sect);
+			data = read_part_sector(state, blk * (blocksize/512),
+						&sect);
 			while (data != NULL) {
 				struct vtoc_format1_label f1;
 
@@ -208,9 +209,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 				    || f1.DS1FMTID == _ascebc['7']
 				    || f1.DS1FMTID == _ascebc['9']) {
 					blk++;
-					data = read_dev_sector(bdev, blk *
-							       (blocksize/512),
-								&sect);
+					data = read_part_sector(state,
+						blk * (blocksize/512), &sect);
 					continue;
 				}
 
@@ -230,9 +230,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 					      size * (blocksize >> 9));
 				counter++;
 				blk++;
-				data = read_dev_sector(bdev,
-						       blk * (blocksize/512),
-						       &sect);
+				data = read_part_sector(state,
+						blk * (blocksize/512), &sect);
 			}
 
 			if (!data)

+ 1 - 1
fs/partitions/ibm.h

@@ -1 +1 @@
-int ibm_partition(struct parsed_partitions *, struct block_device *);
+int ibm_partition(struct parsed_partitions *);

+ 2 - 2
fs/partitions/karma.c

@@ -9,7 +9,7 @@
 #include "check.h"
 #include "karma.h"
 
-int karma_partition(struct parsed_partitions *state, struct block_device *bdev)
+int karma_partition(struct parsed_partitions *state)
 {
 	int i;
 	int slot = 1;
@@ -29,7 +29,7 @@ int karma_partition(struct parsed_partitions *state, struct block_device *bdev)
 	} __attribute__((packed)) *label;
 	struct d_partition *p;
 
-	data = read_dev_sector(bdev, 0, &sect);
+	data = read_part_sector(state, 0, &sect);
 	if (!data)
 		return -1;
 

+ 1 - 1
fs/partitions/karma.h

@@ -4,5 +4,5 @@
 
 #define KARMA_LABEL_MAGIC		0xAB56
 
-int karma_partition(struct parsed_partitions *state, struct block_device *bdev);
+int karma_partition(struct parsed_partitions *state);
 

+ 44 - 45
fs/partitions/ldm.c

@@ -309,7 +309,7 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
 
 /**
  * ldm_validate_privheads - Compare the primary privhead with its backups
- * @bdev:  Device holding the LDM Database
+ * @state: Partition check state including device holding the LDM Database
  * @ph1:   Memory struct to fill with ph contents
  *
  * Read and compare all three privheads from disk.
@@ -321,8 +321,8 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
  * Return:  'true'   Success
  *          'false'  Error
  */
-static bool ldm_validate_privheads (struct block_device *bdev,
-				    struct privhead *ph1)
+static bool ldm_validate_privheads(struct parsed_partitions *state,
+				   struct privhead *ph1)
 {
 	static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 };
 	struct privhead *ph[3] = { ph1 };
@@ -332,7 +332,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
 	long num_sects;
 	int i;
 
-	BUG_ON (!bdev || !ph1);
+	BUG_ON (!state || !ph1);
 
 	ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL);
 	ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL);
@@ -346,8 +346,8 @@ static bool ldm_validate_privheads (struct block_device *bdev,
 
 	/* Read and parse privheads */
 	for (i = 0; i < 3; i++) {
-		data = read_dev_sector (bdev,
-			ph[0]->config_start + off[i], &sect);
+		data = read_part_sector(state, ph[0]->config_start + off[i],
+					&sect);
 		if (!data) {
 			ldm_crit ("Disk read failed.");
 			goto out;
@@ -363,7 +363,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
 		}
 	}
 
-	num_sects = bdev->bd_inode->i_size >> 9;
+	num_sects = state->bdev->bd_inode->i_size >> 9;
 
 	if ((ph[0]->config_start > num_sects) ||
 	   ((ph[0]->config_start + ph[0]->config_size) > num_sects)) {
@@ -397,20 +397,20 @@ out:
 
 /**
  * ldm_validate_tocblocks - Validate the table of contents and its backups
- * @bdev:  Device holding the LDM Database
- * @base:  Offset, into @bdev, of the database
+ * @state: Partition check state including device holding the LDM Database
+ * @base:  Offset, into @state->bdev, of the database
  * @ldb:   Cache of the database structures
  *
  * Find and compare the four tables of contents of the LDM Database stored on
- * @bdev and return the parsed information into @toc1.
+ * @state->bdev and return the parsed information into @toc1.
  *
  * The offsets and sizes of the configs are range-checked against a privhead.
  *
  * Return:  'true'   @toc1 contains validated TOCBLOCK info
  *          'false'  @toc1 contents are undefined
  */
-static bool ldm_validate_tocblocks(struct block_device *bdev,
-	unsigned long base, struct ldmdb *ldb)
+static bool ldm_validate_tocblocks(struct parsed_partitions *state,
+				   unsigned long base, struct ldmdb *ldb)
 {
 	static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4};
 	struct tocblock *tb[4];
@@ -420,7 +420,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
 	int i, nr_tbs;
 	bool result = false;
 
-	BUG_ON(!bdev || !ldb);
+	BUG_ON(!state || !ldb);
 	ph = &ldb->ph;
 	tb[0] = &ldb->toc;
 	tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL);
@@ -437,7 +437,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
 	 * skip any that fail as long as we get at least one valid TOCBLOCK.
 	 */
 	for (nr_tbs = i = 0; i < 4; i++) {
-		data = read_dev_sector(bdev, base + off[i], &sect);
+		data = read_part_sector(state, base + off[i], &sect);
 		if (!data) {
 			ldm_error("Disk read failed for TOCBLOCK %d.", i);
 			continue;
@@ -473,7 +473,7 @@ err:
 
 /**
  * ldm_validate_vmdb - Read the VMDB and validate it
- * @bdev:  Device holding the LDM Database
+ * @state: Partition check state including device holding the LDM Database
  * @base:  Offset, into @bdev, of the database
  * @ldb:   Cache of the database structures
  *
@@ -483,8 +483,8 @@ err:
  * Return:  'true'   @ldb contains validated VBDB info
  *          'false'  @ldb contents are undefined
  */
-static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
-			       struct ldmdb *ldb)
+static bool ldm_validate_vmdb(struct parsed_partitions *state,
+			      unsigned long base, struct ldmdb *ldb)
 {
 	Sector sect;
 	u8 *data;
@@ -492,12 +492,12 @@ static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
 	struct vmdb *vm;
 	struct tocblock *toc;
 
-	BUG_ON (!bdev || !ldb);
+	BUG_ON (!state || !ldb);
 
 	vm  = &ldb->vm;
 	toc = &ldb->toc;
 
-	data = read_dev_sector (bdev, base + OFF_VMDB, &sect);
+	data = read_part_sector(state, base + OFF_VMDB, &sect);
 	if (!data) {
 		ldm_crit ("Disk read failed.");
 		return false;
@@ -534,21 +534,21 @@ out:
 
 /**
  * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk
- * @bdev:  Device holding the LDM Database
+ * @state: Partition check state including device holding the LDM Database
  *
  * This function provides a weak test to decide whether the device is a dynamic
  * disk or not.  It looks for an MS-DOS-style partition table containing at
  * least one partition of type 0x42 (formerly SFS, now used by Windows for
  * dynamic disks).
  *
- * N.B.  The only possible error can come from the read_dev_sector and that is
+ * N.B.  The only possible error can come from the read_part_sector and that is
  *       only likely to happen if the underlying device is strange.  If that IS
  *       the case we should return zero to let someone else try.
  *
- * Return:  'true'   @bdev is a dynamic disk
- *          'false'  @bdev is not a dynamic disk, or an error occurred
+ * Return:  'true'   @state->bdev is a dynamic disk
+ *          'false'  @state->bdev is not a dynamic disk, or an error occurred
  */
-static bool ldm_validate_partition_table (struct block_device *bdev)
+static bool ldm_validate_partition_table(struct parsed_partitions *state)
 {
 	Sector sect;
 	u8 *data;
@@ -556,9 +556,9 @@ static bool ldm_validate_partition_table (struct block_device *bdev)
 	int i;
 	bool result = false;
 
-	BUG_ON (!bdev);
+	BUG_ON(!state);
 
-	data = read_dev_sector (bdev, 0, &sect);
+	data = read_part_sector(state, 0, &sect);
 	if (!data) {
 		ldm_crit ("Disk read failed.");
 		return false;
@@ -1391,8 +1391,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
 
 /**
  * ldm_get_vblks - Read the on-disk database of VBLKs into memory
- * @bdev:  Device holding the LDM Database
- * @base:  Offset, into @bdev, of the database
+ * @state: Partition check state including device holding the LDM Database
+ * @base:  Offset, into @state->bdev, of the database
  * @ldb:   Cache of the database structures
  *
  * To use the information from the VBLKs, they need to be read from the disk,
@@ -1401,8 +1401,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
  * Return:  'true'   All the VBLKs were read successfully
  *          'false'  An error occurred
  */
-static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
-			   struct ldmdb *ldb)
+static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base,
+			  struct ldmdb *ldb)
 {
 	int size, perbuf, skip, finish, s, v, recs;
 	u8 *data = NULL;
@@ -1410,7 +1410,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
 	bool result = false;
 	LIST_HEAD (frags);
 
-	BUG_ON (!bdev || !ldb);
+	BUG_ON(!state || !ldb);
 
 	size   = ldb->vm.vblk_size;
 	perbuf = 512 / size;
@@ -1418,7 +1418,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
 	finish = (size * ldb->vm.last_vblk_seq) >> 9;
 
 	for (s = skip; s < finish; s++) {		/* For each sector */
-		data = read_dev_sector (bdev, base + OFF_VMDB + s, &sect);
+		data = read_part_sector(state, base + OFF_VMDB + s, &sect);
 		if (!data) {
 			ldm_crit ("Disk read failed.");
 			goto out;
@@ -1474,8 +1474,7 @@ static void ldm_free_vblks (struct list_head *lh)
 
 /**
  * ldm_partition - Find out whether a device is a dynamic disk and handle it
- * @pp:    List of the partitions parsed so far
- * @bdev:  Device holding the LDM Database
+ * @state: Partition check state including device holding the LDM Database
  *
  * This determines whether the device @bdev is a dynamic disk and if so creates
  * the partitions necessary in the gendisk structure pointed to by @hd.
@@ -1485,21 +1484,21 @@ static void ldm_free_vblks (struct list_head *lh)
  * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3,
  * and so on: the actual data containing partitions.
  *
- * Return:  1 Success, @bdev is a dynamic disk and we handled it
- *          0 Success, @bdev is not a dynamic disk
+ * Return:  1 Success, @state->bdev is a dynamic disk and we handled it
+ *          0 Success, @state->bdev is not a dynamic disk
  *         -1 An error occurred before enough information had been read
- *            Or @bdev is a dynamic disk, but it may be corrupted
+ *            Or @state->bdev is a dynamic disk, but it may be corrupted
  */
-int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
+int ldm_partition(struct parsed_partitions *state)
 {
 	struct ldmdb  *ldb;
 	unsigned long base;
 	int result = -1;
 
-	BUG_ON (!pp || !bdev);
+	BUG_ON(!state);
 
 	/* Look for signs of a Dynamic Disk */
-	if (!ldm_validate_partition_table (bdev))
+	if (!ldm_validate_partition_table(state))
 		return 0;
 
 	ldb = kmalloc (sizeof (*ldb), GFP_KERNEL);
@@ -1509,15 +1508,15 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
 	}
 
 	/* Parse and check privheads. */
-	if (!ldm_validate_privheads (bdev, &ldb->ph))
+	if (!ldm_validate_privheads(state, &ldb->ph))
 		goto out;		/* Already logged */
 
 	/* All further references are relative to base (database start). */
 	base = ldb->ph.config_start;
 
 	/* Parse and check tocs and vmdb. */
-	if (!ldm_validate_tocblocks (bdev, base, ldb) ||
-	    !ldm_validate_vmdb      (bdev, base, ldb))
+	if (!ldm_validate_tocblocks(state, base, ldb) ||
+	    !ldm_validate_vmdb(state, base, ldb))
 	    	goto out;		/* Already logged */
 
 	/* Initialize vblk lists in ldmdb struct */
@@ -1527,13 +1526,13 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
 	INIT_LIST_HEAD (&ldb->v_comp);
 	INIT_LIST_HEAD (&ldb->v_part);
 
-	if (!ldm_get_vblks (bdev, base, ldb)) {
+	if (!ldm_get_vblks(state, base, ldb)) {
 		ldm_crit ("Failed to read the VBLKs from the database.");
 		goto cleanup;
 	}
 
 	/* Finally, create the data partition devices. */
-	if (ldm_create_data_partitions (pp, ldb)) {
+	if (ldm_create_data_partitions(state, ldb)) {
 		ldm_debug ("Parsed LDM database successfully.");
 		result = 1;
 	}

+ 1 - 1
fs/partitions/ldm.h

@@ -209,7 +209,7 @@ struct ldmdb {				/* Cache of the database */
 	struct list_head v_part;
 };
 
-int ldm_partition (struct parsed_partitions *state, struct block_device *bdev);
+int ldm_partition(struct parsed_partitions *state);
 
 #endif /* _FS_PT_LDM_H_ */
 

+ 6 - 5
fs/partitions/mac.c

@@ -27,7 +27,7 @@ static inline void mac_fix_string(char *stg, int len)
 		stg[i] = 0;
 }
 
-int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
+int mac_partition(struct parsed_partitions *state)
 {
 	int slot = 1;
 	Sector sect;
@@ -42,7 +42,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
 	struct mac_driver_desc *md;
 
 	/* Get 0th block and look at the first partition map entry. */
-	md = (struct mac_driver_desc *) read_dev_sector(bdev, 0, &sect);
+	md = read_part_sector(state, 0, &sect);
 	if (!md)
 		return -1;
 	if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) {
@@ -51,7 +51,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
 	}
 	secsize = be16_to_cpu(md->block_size);
 	put_dev_sector(sect);
-	data = read_dev_sector(bdev, secsize/512, &sect);
+	data = read_part_sector(state, secsize/512, &sect);
 	if (!data)
 		return -1;
 	part = (struct mac_partition *) (data + secsize%512);
@@ -64,7 +64,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
 	for (blk = 1; blk <= blocks_in_map; ++blk) {
 		int pos = blk * secsize;
 		put_dev_sector(sect);
-		data = read_dev_sector(bdev, pos/512, &sect);
+		data = read_part_sector(state, pos/512, &sect);
 		if (!data)
 			return -1;
 		part = (struct mac_partition *) (data + pos%512);
@@ -123,7 +123,8 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
 	}
 #ifdef CONFIG_PPC_PMAC
 	if (found_root_goodness)
-		note_bootable_part(bdev->bd_dev, found_root, found_root_goodness);
+		note_bootable_part(state->bdev->bd_dev, found_root,
+				   found_root_goodness);
 #endif
 
 	put_dev_sector(sect);

+ 1 - 1
fs/partitions/mac.h

@@ -41,4 +41,4 @@ struct mac_driver_desc {
     /* ... more stuff */
 };
 
-int mac_partition(struct parsed_partitions *state, struct block_device *bdev);
+int mac_partition(struct parsed_partitions *state);

+ 37 - 48
fs/partitions/msdos.c

@@ -64,7 +64,7 @@ msdos_magic_present(unsigned char *p)
 #define AIX_LABEL_MAGIC2	0xC2
 #define AIX_LABEL_MAGIC3	0xD4
 #define AIX_LABEL_MAGIC4	0xC1
-static int aix_magic_present(unsigned char *p, struct block_device *bdev)
+static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
 {
 {
 	struct partition *pt = (struct partition *) (p + 0x1be);
 	struct partition *pt = (struct partition *) (p + 0x1be);
 	Sector sect;
 	Sector sect;
@@ -85,7 +85,7 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
 			is_extended_partition(pt))
 			is_extended_partition(pt))
 			return 0;
 			return 0;
 	}
 	}
-	d = read_dev_sector(bdev, 7, &sect);
+	d = read_part_sector(state, 7, &sect);
 	if (d) {
 	if (d) {
 		if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
 		if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
 			ret = 1;
 			ret = 1;
@@ -105,15 +105,14 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
  * only for the actual data partitions.
  * only for the actual data partitions.
  */
  */
 
 
-static void
-parse_extended(struct parsed_partitions *state, struct block_device *bdev,
-			sector_t first_sector, sector_t first_size)
+static void parse_extended(struct parsed_partitions *state,
+			   sector_t first_sector, sector_t first_size)
 {
 {
 	struct partition *p;
 	struct partition *p;
 	Sector sect;
 	Sector sect;
 	unsigned char *data;
 	unsigned char *data;
 	sector_t this_sector, this_size;
 	sector_t this_sector, this_size;
-	sector_t sector_size = bdev_logical_block_size(bdev) / 512;
+	sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
 	int loopct = 0;		/* number of links followed
 	int loopct = 0;		/* number of links followed
 				   without finding a data partition */
 				   without finding a data partition */
 	int i;
 	int i;
@@ -126,7 +125,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
 			return;
 			return;
 		if (state->next == state->limit)
 		if (state->next == state->limit)
 			return;
 			return;
-		data = read_dev_sector(bdev, this_sector, &sect);
+		data = read_part_sector(state, this_sector, &sect);
 		if (!data)
 		if (!data)
 			return;
 			return;
 
 
@@ -198,9 +197,8 @@ done:
 /* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also
 /* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also
    indicates linux swap.  Be careful before believing this is Solaris. */
    indicates linux swap.  Be careful before believing this is Solaris. */
 
 
-static void
-parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
-			sector_t offset, sector_t size, int origin)
+static void parse_solaris_x86(struct parsed_partitions *state,
+			      sector_t offset, sector_t size, int origin)
 {
 {
 #ifdef CONFIG_SOLARIS_X86_PARTITION
 #ifdef CONFIG_SOLARIS_X86_PARTITION
 	Sector sect;
 	Sector sect;
@@ -208,7 +206,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
 	int i;
 	int i;
 	short max_nparts;
 	short max_nparts;
 
 
-	v = (struct solaris_x86_vtoc *)read_dev_sector(bdev, offset+1, &sect);
+	v = read_part_sector(state, offset + 1, &sect);
 	if (!v)
 	if (!v)
 		return;
 		return;
 	if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) {
 	if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) {
@@ -245,16 +243,15 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
  * Create devices for BSD partitions listed in a disklabel, under a
  * Create devices for BSD partitions listed in a disklabel, under a
  * dos-like partition. See parse_extended() for more information.
  * dos-like partition. See parse_extended() for more information.
  */
  */
-static void
-parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
-		sector_t offset, sector_t size, int origin, char *flavour,
-		int max_partitions)
+static void parse_bsd(struct parsed_partitions *state,
+		      sector_t offset, sector_t size, int origin, char *flavour,
+		      int max_partitions)
 {
 {
 	Sector sect;
 	Sector sect;
 	struct bsd_disklabel *l;
 	struct bsd_disklabel *l;
 	struct bsd_partition *p;
 	struct bsd_partition *p;
 
 
-	l = (struct bsd_disklabel *)read_dev_sector(bdev, offset+1, &sect);
+	l = read_part_sector(state, offset + 1, &sect);
 	if (!l)
 	if (!l)
 		return;
 		return;
 	if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) {
 	if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) {
@@ -291,33 +288,28 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
 }
 }
 #endif
 #endif
 
 
-static void
-parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
-		sector_t offset, sector_t size, int origin)
+static void parse_freebsd(struct parsed_partitions *state,
+			  sector_t offset, sector_t size, int origin)
 {
 {
 #ifdef CONFIG_BSD_DISKLABEL
 #ifdef CONFIG_BSD_DISKLABEL
-	parse_bsd(state, bdev, offset, size, origin,
-			"bsd", BSD_MAXPARTITIONS);
+	parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS);
 #endif
 #endif
 }
 }
 
 
-static void
-parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
-		sector_t offset, sector_t size, int origin)
+static void parse_netbsd(struct parsed_partitions *state,
+			 sector_t offset, sector_t size, int origin)
 {
 {
 #ifdef CONFIG_BSD_DISKLABEL
 #ifdef CONFIG_BSD_DISKLABEL
-	parse_bsd(state, bdev, offset, size, origin,
-			"netbsd", BSD_MAXPARTITIONS);
+	parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS);
 #endif
 #endif
 }
 }
 
 
-static void
-parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
-		sector_t offset, sector_t size, int origin)
+static void parse_openbsd(struct parsed_partitions *state,
+			  sector_t offset, sector_t size, int origin)
 {
 {
 #ifdef CONFIG_BSD_DISKLABEL
 #ifdef CONFIG_BSD_DISKLABEL
-	parse_bsd(state, bdev, offset, size, origin,
-			"openbsd", OPENBSD_MAXPARTITIONS);
+	parse_bsd(state, offset, size, origin, "openbsd",
+		  OPENBSD_MAXPARTITIONS);
 #endif
 #endif
 }
 }
 
 
@@ -325,16 +317,15 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
  * Create devices for Unixware partitions listed in a disklabel, under a
  * Create devices for Unixware partitions listed in a disklabel, under a
  * dos-like partition. See parse_extended() for more information.
  * dos-like partition. See parse_extended() for more information.
  */
  */
-static void
-parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
-		sector_t offset, sector_t size, int origin)
+static void parse_unixware(struct parsed_partitions *state,
+			   sector_t offset, sector_t size, int origin)
 {
 {
 #ifdef CONFIG_UNIXWARE_DISKLABEL
 #ifdef CONFIG_UNIXWARE_DISKLABEL
 	Sector sect;
 	Sector sect;
 	struct unixware_disklabel *l;
 	struct unixware_disklabel *l;
 	struct unixware_slice *p;
 	struct unixware_slice *p;
 
 
-	l = (struct unixware_disklabel *)read_dev_sector(bdev, offset+29, &sect);
+	l = read_part_sector(state, offset + 29, &sect);
 	if (!l)
 	if (!l)
 		return;
 		return;
 	if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC ||
 	if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC ||
@@ -365,9 +356,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
  * Anand Krishnamurthy <anandk@wiproge.med.ge.com>
  * Anand Krishnamurthy <anandk@wiproge.med.ge.com>
  * Rajeev V. Pillai    <rajeevvp@yahoo.com>
  * Rajeev V. Pillai    <rajeevvp@yahoo.com>
  */
  */
-static void
-parse_minix(struct parsed_partitions *state, struct block_device *bdev,
-		sector_t offset, sector_t size, int origin)
+static void parse_minix(struct parsed_partitions *state,
+			sector_t offset, sector_t size, int origin)
 {
 {
 #ifdef CONFIG_MINIX_SUBPARTITION
 #ifdef CONFIG_MINIX_SUBPARTITION
 	Sector sect;
 	Sector sect;
@@ -375,7 +365,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
 	struct partition *p;
 	struct partition *p;
 	int i;
 	int i;
 
 
-	data = read_dev_sector(bdev, offset, &sect);
+	data = read_part_sector(state, offset, &sect);
 	if (!data)
 	if (!data)
 		return;
 		return;
 
 
@@ -404,8 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
 
 
 static struct {
 static struct {
 	unsigned char id;
 	unsigned char id;
-	void (*parse)(struct parsed_partitions *, struct block_device *,
-			sector_t, sector_t, int);
+	void (*parse)(struct parsed_partitions *, sector_t, sector_t, int);
 } subtypes[] = {
 } subtypes[] = {
 	{FREEBSD_PARTITION, parse_freebsd},
 	{FREEBSD_PARTITION, parse_freebsd},
 	{NETBSD_PARTITION, parse_netbsd},
 	{NETBSD_PARTITION, parse_netbsd},
@@ -417,16 +406,16 @@ static struct {
 	{0, NULL},
 	{0, NULL},
 };
 };
  
  
-int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
+int msdos_partition(struct parsed_partitions *state)
 {
 {
-	sector_t sector_size = bdev_logical_block_size(bdev) / 512;
+	sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
 	Sector sect;
 	Sector sect;
 	unsigned char *data;
 	unsigned char *data;
 	struct partition *p;
 	struct partition *p;
 	struct fat_boot_sector *fb;
 	struct fat_boot_sector *fb;
 	int slot;
 	int slot;
 
 
-	data = read_dev_sector(bdev, 0, &sect);
+	data = read_part_sector(state, 0, &sect);
 	if (!data)
 	if (!data)
 		return -1;
 		return -1;
 	if (!msdos_magic_present(data + 510)) {
 	if (!msdos_magic_present(data + 510)) {
@@ -434,7 +423,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 		return 0;
 		return 0;
 	}
 	}
 
 
-	if (aix_magic_present(data, bdev)) {
+	if (aix_magic_present(state, data)) {
 		put_dev_sector(sect);
 		put_dev_sector(sect);
 		printk( " [AIX]");
 		printk( " [AIX]");
 		return 0;
 		return 0;
@@ -503,7 +492,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 			put_partition(state, slot, start, n);
 			put_partition(state, slot, start, n);
 
 
 			printk(" <");
 			printk(" <");
-			parse_extended(state, bdev, start, size);
+			parse_extended(state, start, size);
 			printk(" >");
 			printk(" >");
 			continue;
 			continue;
 		}
 		}
@@ -532,8 +521,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 
 
 		if (!subtypes[n].parse)
 		if (!subtypes[n].parse)
 			continue;
 			continue;
-		subtypes[n].parse(state, bdev, start_sect(p)*sector_size,
-						nr_sects(p)*sector_size, slot);
+		subtypes[n].parse(state, start_sect(p) * sector_size,
+				  nr_sects(p) * sector_size, slot);
 	}
 	}
 	put_dev_sector(sect);
 	put_dev_sector(sect);
 	return 1;
 	return 1;
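
One detail worth keeping in mind when reading the msdos changes above: the on-disk table stores logical blocks, while put_partition() works in 512-byte units, hence the sector_size scaling that is preserved through the refactor. A quick worked example, assuming a hypothetical drive that reports 4096-byte logical blocks:

	sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;	/* 4096 / 512 = 8 */
	/* a table entry with start_sect(p) == 256 and nr_sects(p) == 2048 is registered as */
	put_partition(state, slot, 256 * sector_size, 2048 * sector_size);	/* start 2048, length 16384 in 512B sectors */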

+ 1 - 1
fs/partitions/msdos.h

@@ -4,5 +4,5 @@
 
 #define MSDOS_LABEL_MAGIC		0xAA55
 
-int msdos_partition(struct parsed_partitions *state, struct block_device *bdev);
+int msdos_partition(struct parsed_partitions *state);
 

+ 2 - 2
fs/partitions/osf.c

@@ -10,7 +10,7 @@
 #include "check.h"
 #include "check.h"
 #include "osf.h"
 #include "osf.h"
 
 
-int osf_partition(struct parsed_partitions *state, struct block_device *bdev)
+int osf_partition(struct parsed_partitions *state)
 {
 {
 	int i;
 	int i;
 	int slot = 1;
 	int slot = 1;
@@ -49,7 +49,7 @@ int osf_partition(struct parsed_partitions *state, struct block_device *bdev)
 	} * label;
 	} * label;
 	struct d_partition * partition;
 	struct d_partition * partition;
 
 
-	data = read_dev_sector(bdev, 0, &sect);
+	data = read_part_sector(state, 0, &sect);
 	if (!data)
 	if (!data)
 		return -1;
 		return -1;
 
 

+ 1 - 1
fs/partitions/osf.h

@@ -4,4 +4,4 @@
 
 #define DISKLABELMAGIC (0x82564557UL)
 
-int osf_partition(struct parsed_partitions *state, struct block_device *bdev);
+int osf_partition(struct parsed_partitions *state);

+ 3 - 3
fs/partitions/sgi.c

@@ -27,7 +27,7 @@ struct sgi_disklabel {
 	__be32 _unused1;			/* Padding */
 };
 
-int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
+int sgi_partition(struct parsed_partitions *state)
 {
 	int i, csum;
 	__be32 magic;
@@ -39,7 +39,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
 	struct sgi_partition *p;
 	char b[BDEVNAME_SIZE];
 
-	label = (struct sgi_disklabel *) read_dev_sector(bdev, 0, &sect);
+	label = read_part_sector(state, 0, &sect);
 	if (!label)
 		return -1;
 	p = &label->partitions[0];
@@ -57,7 +57,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
 	}
 	if(csum) {
 		printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n",
-		       bdevname(bdev, b));
+		       bdevname(state->bdev, b));
 		put_dev_sector(sect);
 		return 0;
 	}

+ 1 - 1
fs/partitions/sgi.h

@@ -2,7 +2,7 @@
  *  fs/partitions/sgi.h
  */
 
-extern int sgi_partition(struct parsed_partitions *state, struct block_device *bdev);
+extern int sgi_partition(struct parsed_partitions *state);
 
 #define SGI_LABEL_MAGIC 0x0be5a941
 

+ 3 - 3
fs/partitions/sun.c

@@ -10,7 +10,7 @@
 #include "check.h"
 #include "check.h"
 #include "sun.h"
 #include "sun.h"
 
 
-int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
+int sun_partition(struct parsed_partitions *state)
 {
 {
 	int i;
 	int i;
 	__be16 csum;
 	__be16 csum;
@@ -61,7 +61,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
 	int use_vtoc;
 	int use_vtoc;
 	int nparts;
 	int nparts;
 
 
-	label = (struct sun_disklabel *)read_dev_sector(bdev, 0, &sect);
+	label = read_part_sector(state, 0, &sect);
 	if (!label)
 	if (!label)
 		return -1;
 		return -1;
 
 
@@ -78,7 +78,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
 		csum ^= *ush--;
 		csum ^= *ush--;
 	if (csum) {
 	if (csum) {
 		printk("Dev %s Sun disklabel: Csum bad, label corrupted\n",
 		printk("Dev %s Sun disklabel: Csum bad, label corrupted\n",
-		       bdevname(bdev, b));
+		       bdevname(state->bdev, b));
 		put_dev_sector(sect);
 		put_dev_sector(sect);
 		return 0;
 		return 0;
 	}
 	}

+ 1 - 1
fs/partitions/sun.h

@@ -5,4 +5,4 @@
 #define SUN_LABEL_MAGIC          0xDABE
 #define SUN_VTOC_SANITY          0x600DDEEE
 
-int sun_partition(struct parsed_partitions *state, struct block_device *bdev);
+int sun_partition(struct parsed_partitions *state);

+ 3 - 3
fs/partitions/sysv68.c

@@ -46,7 +46,7 @@ struct slice {
 };
 
 
-int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
+int sysv68_partition(struct parsed_partitions *state)
 {
 	int i, slices;
 	int slot = 1;
@@ -55,7 +55,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
 	struct dkblk0 *b;
 	struct slice *slice;
 
-	data = read_dev_sector(bdev, 0, &sect);
+	data = read_part_sector(state, 0, &sect);
 	if (!data)
 		return -1;
 
@@ -68,7 +68,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
 	i = be32_to_cpu(b->dk_ios.ios_slcblk);
 	put_dev_sector(sect);
 
-	data = read_dev_sector(bdev, i, &sect);
+	data = read_part_sector(state, i, &sect);
 	if (!data)
 		return -1;
 

+ 1 - 1
fs/partitions/sysv68.h

@@ -1 +1 @@
-extern int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev);
+extern int sysv68_partition(struct parsed_partitions *state);

+ 2 - 2
fs/partitions/ultrix.c

@@ -9,7 +9,7 @@
 #include "check.h"
 #include "check.h"
 #include "ultrix.h"
 #include "ultrix.h"
 
 
-int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev)
+int ultrix_partition(struct parsed_partitions *state)
 {
 {
 	int i;
 	int i;
 	Sector sect;
 	Sector sect;
@@ -26,7 +26,7 @@ int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev)
 #define PT_MAGIC	0x032957	/* Partition magic number */
 #define PT_MAGIC	0x032957	/* Partition magic number */
 #define PT_VALID	1		/* Indicates if struct is valid */
 #define PT_VALID	1		/* Indicates if struct is valid */
 
 
-	data = read_dev_sector(bdev, (16384 - sizeof(*label))/512, &sect);
+	data = read_part_sector(state, (16384 - sizeof(*label))/512, &sect);
 	if (!data)
 	if (!data)
 		return -1;
 		return -1;
 	
 	

+ 1 - 1
fs/partitions/ultrix.h

@@ -2,4 +2,4 @@
  *  fs/partitions/ultrix.h
  */
 
-int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev);
+int ultrix_partition(struct parsed_partitions *state);

+ 110 - 12
fs/pipe.c

@@ -11,6 +11,7 @@
 #include <linux/module.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/fs.h>
+#include <linux/log2.h>
 #include <linux/mount.h>
 #include <linux/mount.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/uio.h>
 #include <linux/uio.h>
@@ -18,10 +19,17 @@
 #include <linux/pagemap.h>
 #include <linux/pagemap.h>
 #include <linux/audit.h>
 #include <linux/audit.h>
 #include <linux/syscalls.h>
 #include <linux/syscalls.h>
+#include <linux/fcntl.h>
 
 
 #include <asm/uaccess.h>
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
 #include <asm/ioctls.h>
 
 
+/*
+ * The max size that a non-root user is allowed to grow the pipe. Can
+ * be set by root in /proc/sys/fs/pipe-max-pages
+ */
+unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16;
+
 /*
 /*
  * We use a start+len construction, which provides full use of the 
  * We use a start+len construction, which provides full use of the 
  * allocated memory.
  * allocated memory.
@@ -390,7 +398,7 @@ redo:
 			if (!buf->len) {
 			if (!buf->len) {
 				buf->ops = NULL;
 				buf->ops = NULL;
 				ops->release(pipe, buf);
 				ops->release(pipe, buf);
-				curbuf = (curbuf + 1) & (PIPE_BUFFERS-1);
+				curbuf = (curbuf + 1) & (pipe->buffers - 1);
 				pipe->curbuf = curbuf;
 				pipe->curbuf = curbuf;
 				pipe->nrbufs = --bufs;
 				pipe->nrbufs = --bufs;
 				do_wakeup = 1;
 				do_wakeup = 1;
@@ -472,7 +480,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
 	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
 	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
 	if (pipe->nrbufs && chars != 0) {
 	if (pipe->nrbufs && chars != 0) {
 		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
 		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
-							(PIPE_BUFFERS-1);
+							(pipe->buffers - 1);
 		struct pipe_buffer *buf = pipe->bufs + lastbuf;
 		struct pipe_buffer *buf = pipe->bufs + lastbuf;
 		const struct pipe_buf_operations *ops = buf->ops;
 		const struct pipe_buf_operations *ops = buf->ops;
 		int offset = buf->offset + buf->len;
 		int offset = buf->offset + buf->len;
@@ -518,8 +526,8 @@ redo1:
 			break;
 			break;
 		}
 		}
 		bufs = pipe->nrbufs;
 		bufs = pipe->nrbufs;
-		if (bufs < PIPE_BUFFERS) {
-			int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1);
+		if (bufs < pipe->buffers) {
+			int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
 			struct pipe_buffer *buf = pipe->bufs + newbuf;
 			struct pipe_buffer *buf = pipe->bufs + newbuf;
 			struct page *page = pipe->tmp_page;
 			struct page *page = pipe->tmp_page;
 			char *src;
 			char *src;
@@ -580,7 +588,7 @@ redo2:
 			if (!total_len)
 			if (!total_len)
 				break;
 				break;
 		}
 		}
-		if (bufs < PIPE_BUFFERS)
+		if (bufs < pipe->buffers)
 			continue;
 			continue;
 		if (filp->f_flags & O_NONBLOCK) {
 		if (filp->f_flags & O_NONBLOCK) {
 			if (!ret)
 			if (!ret)
@@ -640,7 +648,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			nrbufs = pipe->nrbufs;
 			nrbufs = pipe->nrbufs;
 			while (--nrbufs >= 0) {
 			while (--nrbufs >= 0) {
 				count += pipe->bufs[buf].len;
 				count += pipe->bufs[buf].len;
-				buf = (buf+1) & (PIPE_BUFFERS-1);
+				buf = (buf+1) & (pipe->buffers - 1);
 			}
 			}
 			mutex_unlock(&inode->i_mutex);
 			mutex_unlock(&inode->i_mutex);
 
 
@@ -671,7 +679,7 @@ pipe_poll(struct file *filp, poll_table *wait)
 	}
 	}
 
 
 	if (filp->f_mode & FMODE_WRITE) {
 	if (filp->f_mode & FMODE_WRITE) {
-		mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0;
+		mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
 		/*
 		/*
 		 * Most Unices do not set POLLERR for FIFOs but on Linux they
 		 * Most Unices do not set POLLERR for FIFOs but on Linux they
 		 * behave exactly like pipes for poll().
 		 * behave exactly like pipes for poll().
@@ -877,25 +885,32 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
 
 
 	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
 	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
 	if (pipe) {
 	if (pipe) {
-		init_waitqueue_head(&pipe->wait);
-		pipe->r_counter = pipe->w_counter = 1;
-		pipe->inode = inode;
+		pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
+		if (pipe->bufs) {
+			init_waitqueue_head(&pipe->wait);
+			pipe->r_counter = pipe->w_counter = 1;
+			pipe->inode = inode;
+			pipe->buffers = PIPE_DEF_BUFFERS;
+			return pipe;
+		}
+		kfree(pipe);
 	}
 	}
 
 
-	return pipe;
+	return NULL;
 }
 }
 
 
 void __free_pipe_info(struct pipe_inode_info *pipe)
 void __free_pipe_info(struct pipe_inode_info *pipe)
 {
 {
 	int i;
 	int i;
 
 
-	for (i = 0; i < PIPE_BUFFERS; i++) {
+	for (i = 0; i < pipe->buffers; i++) {
 		struct pipe_buffer *buf = pipe->bufs + i;
 		struct pipe_buffer *buf = pipe->bufs + i;
 		if (buf->ops)
 		if (buf->ops)
 			buf->ops->release(pipe, buf);
 			buf->ops->release(pipe, buf);
 	}
 	}
 	if (pipe->tmp_page)
 	if (pipe->tmp_page)
 		__free_page(pipe->tmp_page);
 		__free_page(pipe->tmp_page);
+	kfree(pipe->bufs);
 	kfree(pipe);
 	kfree(pipe);
 }
 }
 
 
@@ -1093,6 +1108,89 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
 	return sys_pipe2(fildes, 0);
 	return sys_pipe2(fildes, 0);
 }
 }
 
 
+/*
+ * Allocate a new array of pipe buffers and copy the info over. Returns the
+ * pipe size if successful, or return -ERROR on error.
+ */
+static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
+{
+	struct pipe_buffer *bufs;
+
+	/*
+	 * Must be a power-of-2 currently
+	 */
+	if (!is_power_of_2(arg))
+		return -EINVAL;
+
+	/*
+	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
+	 * expect a lot of shrink+grow operations, just free and allocate
+	 * again like we would do for growing. If the pipe currently
+	 * contains more buffers than arg, then return busy.
+	 */
+	if (arg < pipe->nrbufs)
+		return -EBUSY;
+
+	bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL);
+	if (unlikely(!bufs))
+		return -ENOMEM;
+
+	/*
+	 * The pipe array wraps around, so just start the new one at zero
+	 * and adjust the indexes.
+	 */
+	if (pipe->nrbufs) {
+		const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1);
+		const unsigned int head = pipe->nrbufs - tail;
+
+		if (head)
+			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
+		if (tail)
+			memcpy(bufs + head, pipe->bufs + pipe->curbuf, tail * sizeof(struct pipe_buffer));
+	}
+
+	pipe->curbuf = 0;
+	kfree(pipe->bufs);
+	pipe->bufs = bufs;
+	pipe->buffers = arg;
+	return arg;
+}
+
+long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct pipe_inode_info *pipe;
+	long ret;
+
+	pipe = file->f_path.dentry->d_inode->i_pipe;
+	if (!pipe)
+		return -EBADF;
+
+	mutex_lock(&pipe->inode->i_mutex);
+
+	switch (cmd) {
+	case F_SETPIPE_SZ:
+		if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages)
+			return -EINVAL;
+		/*
+		 * The pipe needs to be at least 2 pages large to
+		 * guarantee POSIX behaviour.
+		 */
+		if (arg < 2)
+			return -EINVAL;
+		ret = pipe_set_size(pipe, arg);
+		break;
+	case F_GETPIPE_SZ:
+		ret = pipe->buffers;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	mutex_unlock(&pipe->inode->i_mutex);
+	return ret;
+}
+
 /*
 /*
  * pipefs should _never_ be mounted by userland - too much of security hassle,
  * pipefs should _never_ be mounted by userland - too much of security hassle,
  * no real gain from having the whole whorehouse mounted. So we don't need
  * no real gain from having the whole whorehouse mounted. So we don't need
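
Taken together with the fcntl.h and pipe_fs_i.h hunks further down, the pipe_fcntl() code above gives userspace a way to resize a pipe. A hedged usage sketch; note that at this point in the series the argument is a power-of-two number of pipe buffers (pages), not bytes, it must be at least 2, and non-root callers are capped by /proc/sys/fs/pipe-max-pages. The fallback defines simply mirror F_LINUX_SPECIFIC_BASE (1024) + 7/8 from the fcntl.h change for libc headers that predate it:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	#ifndef F_SETPIPE_SZ
	#define F_SETPIPE_SZ	(1024 + 7)
	#define F_GETPIPE_SZ	(1024 + 8)
	#endif

	int main(void)
	{
		int fds[2];

		if (pipe(fds))
			return 1;
		if (fcntl(fds[1], F_SETPIPE_SZ, 64) < 0)	/* grow to 64 buffers */
			perror("F_SETPIPE_SZ");
		printf("pipe has %ld buffers\n", (long)fcntl(fds[1], F_GETPIPE_SZ));
		return 0;
	}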

+ 2 - 1
fs/reiserfs/file.c

@@ -147,7 +147,8 @@ static int reiserfs_sync_file(struct file *filp,
 	barrier_done = reiserfs_commit_for_inode(inode);
 	barrier_done = reiserfs_commit_for_inode(inode);
 	reiserfs_write_unlock(inode->i_sb);
 	reiserfs_write_unlock(inode->i_sb);
 	if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
 	if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
-		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 
+			BLKDEV_IFL_WAIT);
 	if (barrier_done < 0)
 	if (barrier_done < 0)
 		return barrier_done;
 		return barrier_done;
 	return (err < 0) ? -EIO : 0;
 	return (err < 0) ? -EIO : 0;

+ 105 - 46
fs/splice.c

@@ -193,8 +193,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 			break;
 			break;
 		}
 		}
 
 
-		if (pipe->nrbufs < PIPE_BUFFERS) {
-			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
+		if (pipe->nrbufs < pipe->buffers) {
+			int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
 			struct pipe_buffer *buf = pipe->bufs + newbuf;
 			struct pipe_buffer *buf = pipe->bufs + newbuf;
 
 
 			buf->page = spd->pages[page_nr];
 			buf->page = spd->pages[page_nr];
@@ -214,7 +214,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 
 
 			if (!--spd->nr_pages)
 			if (!--spd->nr_pages)
 				break;
 				break;
-			if (pipe->nrbufs < PIPE_BUFFERS)
+			if (pipe->nrbufs < pipe->buffers)
 				continue;
 				continue;
 
 
 			break;
 			break;
@@ -265,6 +265,36 @@ static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
 	page_cache_release(spd->pages[i]);
 	page_cache_release(spd->pages[i]);
 }
 }
 
 
+/*
+ * Check if we need to grow the arrays holding pages and partial page
+ * descriptions.
+ */
+int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
+{
+	if (pipe->buffers <= PIPE_DEF_BUFFERS)
+		return 0;
+
+	spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL);
+	spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL);
+
+	if (spd->pages && spd->partial)
+		return 0;
+
+	kfree(spd->pages);
+	kfree(spd->partial);
+	return -ENOMEM;
+}
+
+void splice_shrink_spd(struct pipe_inode_info *pipe,
+		       struct splice_pipe_desc *spd)
+{
+	if (pipe->buffers <= PIPE_DEF_BUFFERS)
+		return;
+
+	kfree(spd->pages);
+	kfree(spd->partial);
+}
+
 static int
 static int
 __generic_file_splice_read(struct file *in, loff_t *ppos,
 __generic_file_splice_read(struct file *in, loff_t *ppos,
 			   struct pipe_inode_info *pipe, size_t len,
 			   struct pipe_inode_info *pipe, size_t len,
@@ -272,8 +302,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 {
 {
 	struct address_space *mapping = in->f_mapping;
 	struct address_space *mapping = in->f_mapping;
 	unsigned int loff, nr_pages, req_pages;
 	unsigned int loff, nr_pages, req_pages;
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
 	struct page *page;
 	struct page *page;
 	pgoff_t index, end_index;
 	pgoff_t index, end_index;
 	loff_t isize;
 	loff_t isize;
@@ -286,15 +316,18 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		.spd_release = spd_release_page,
 		.spd_release = spd_release_page,
 	};
 	};
 
 
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	loff = *ppos & ~PAGE_CACHE_MASK;
 	loff = *ppos & ~PAGE_CACHE_MASK;
 	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS);
+	nr_pages = min(req_pages, pipe->buffers);
 
 
 	/*
 	/*
 	 * Lookup the (hopefully) full range of pages we need.
 	 * Lookup the (hopefully) full range of pages we need.
 	 */
 	 */
-	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
+	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
 	index += spd.nr_pages;
 	index += spd.nr_pages;
 
 
 	/*
 	/*
@@ -335,7 +368,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 			unlock_page(page);
 			unlock_page(page);
 		}
 		}
 
 
-		pages[spd.nr_pages++] = page;
+		spd.pages[spd.nr_pages++] = page;
 		index++;
 		index++;
 	}
 	}
 
 
@@ -356,7 +389,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		 * this_len is the max we'll use from this page
 		 * this_len is the max we'll use from this page
 		 */
 		 */
 		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
 		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
-		page = pages[page_nr];
+		page = spd.pages[page_nr];
 
 
 		if (PageReadahead(page))
 		if (PageReadahead(page))
 			page_cache_async_readahead(mapping, &in->f_ra, in,
 			page_cache_async_readahead(mapping, &in->f_ra, in,
@@ -393,8 +426,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 					error = -ENOMEM;
 					error = -ENOMEM;
 					break;
 					break;
 				}
 				}
-				page_cache_release(pages[page_nr]);
-				pages[page_nr] = page;
+				page_cache_release(spd.pages[page_nr]);
+				spd.pages[page_nr] = page;
 			}
 			}
 			/*
 			/*
 			 * page was already under io and is now done, great
 			 * page was already under io and is now done, great
@@ -451,8 +484,8 @@ fill_it:
 			len = this_len;
 			len = this_len;
 		}
 		}
 
 
-		partial[page_nr].offset = loff;
-		partial[page_nr].len = this_len;
+		spd.partial[page_nr].offset = loff;
+		spd.partial[page_nr].len = this_len;
 		len -= this_len;
 		len -= this_len;
 		loff = 0;
 		loff = 0;
 		spd.nr_pages++;
 		spd.nr_pages++;
@@ -464,12 +497,13 @@ fill_it:
 	 * we got, 'nr_pages' is how many pages are in the map.
 	 * we got, 'nr_pages' is how many pages are in the map.
 	 */
 	 */
 	while (page_nr < nr_pages)
 	while (page_nr < nr_pages)
-		page_cache_release(pages[page_nr++]);
+		page_cache_release(spd.pages[page_nr++]);
 	in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
 	in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
 
 
 	if (spd.nr_pages)
 	if (spd.nr_pages)
-		return splice_to_pipe(pipe, &spd);
+		error = splice_to_pipe(pipe, &spd);
 
 
+	splice_shrink_spd(pipe, &spd);
 	return error;
 	return error;
 }
 }
 
 
@@ -560,9 +594,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	unsigned int nr_pages;
 	unsigned int nr_pages;
 	unsigned int nr_freed;
 	unsigned int nr_freed;
 	size_t offset;
 	size_t offset;
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
-	struct iovec vec[PIPE_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
+	struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
 	pgoff_t index;
 	pgoff_t index;
 	ssize_t res;
 	ssize_t res;
 	size_t this_len;
 	size_t this_len;
@@ -576,11 +610,22 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 		.spd_release = spd_release_page,
 		.spd_release = spd_release_page,
 	};
 	};
 
 
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
+	res = -ENOMEM;
+	vec = __vec;
+	if (pipe->buffers > PIPE_DEF_BUFFERS) {
+		vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL);
+		if (!vec)
+			goto shrink_ret;
+	}
+
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	offset = *ppos & ~PAGE_CACHE_MASK;
 	offset = *ppos & ~PAGE_CACHE_MASK;
 	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
 
-	for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) {
+	for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) {
 		struct page *page;
 		struct page *page;
 
 
 		page = alloc_page(GFP_USER);
 		page = alloc_page(GFP_USER);
@@ -591,7 +636,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 		this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
 		this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
 		vec[i].iov_base = (void __user *) page_address(page);
 		vec[i].iov_base = (void __user *) page_address(page);
 		vec[i].iov_len = this_len;
 		vec[i].iov_len = this_len;
-		pages[i] = page;
+		spd.pages[i] = page;
 		spd.nr_pages++;
 		spd.nr_pages++;
 		len -= this_len;
 		len -= this_len;
 		offset = 0;
 		offset = 0;
@@ -610,11 +655,11 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	nr_freed = 0;
 	nr_freed = 0;
 	for (i = 0; i < spd.nr_pages; i++) {
 	for (i = 0; i < spd.nr_pages; i++) {
 		this_len = min_t(size_t, vec[i].iov_len, res);
 		this_len = min_t(size_t, vec[i].iov_len, res);
-		partial[i].offset = 0;
-		partial[i].len = this_len;
+		spd.partial[i].offset = 0;
+		spd.partial[i].len = this_len;
 		if (!this_len) {
 		if (!this_len) {
-			__free_page(pages[i]);
-			pages[i] = NULL;
+			__free_page(spd.pages[i]);
+			spd.pages[i] = NULL;
 			nr_freed++;
 			nr_freed++;
 		}
 		}
 		res -= this_len;
 		res -= this_len;
@@ -625,13 +670,18 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	if (res > 0)
 	if (res > 0)
 		*ppos += res;
 		*ppos += res;
 
 
+shrink_ret:
+	if (vec != __vec)
+		kfree(vec);
+	splice_shrink_spd(pipe, &spd);
 	return res;
 	return res;
 
 
 err:
 err:
 	for (i = 0; i < spd.nr_pages; i++)
 	for (i = 0; i < spd.nr_pages; i++)
-		__free_page(pages[i]);
+		__free_page(spd.pages[i]);
 
 
-	return error;
+	res = error;
+	goto shrink_ret;
 }
 }
 EXPORT_SYMBOL(default_file_splice_read);
 EXPORT_SYMBOL(default_file_splice_read);
 
 
@@ -784,7 +834,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
 		if (!buf->len) {
 		if (!buf->len) {
 			buf->ops = NULL;
 			buf->ops = NULL;
 			ops->release(pipe, buf);
 			ops->release(pipe, buf);
-			pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
+			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
 			pipe->nrbufs--;
 			pipe->nrbufs--;
 			if (pipe->inode)
 			if (pipe->inode)
 				sd->need_wakeup = true;
 				sd->need_wakeup = true;
@@ -1211,7 +1261,7 @@ out_release:
 	 * If we did an incomplete transfer we must release
 	 * If we did an incomplete transfer we must release
 	 * the pipe buffers in question:
 	 * the pipe buffers in question:
 	 */
 	 */
-	for (i = 0; i < PIPE_BUFFERS; i++) {
+	for (i = 0; i < pipe->buffers; i++) {
 		struct pipe_buffer *buf = pipe->bufs + i;
 		struct pipe_buffer *buf = pipe->bufs + i;
 
 
 		if (buf->ops) {
 		if (buf->ops) {
@@ -1371,7 +1421,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
  */
  */
 static int get_iovec_page_array(const struct iovec __user *iov,
 static int get_iovec_page_array(const struct iovec __user *iov,
 				unsigned int nr_vecs, struct page **pages,
 				unsigned int nr_vecs, struct page **pages,
-				struct partial_page *partial, int aligned)
+				struct partial_page *partial, int aligned,
+				unsigned int pipe_buffers)
 {
 {
 	int buffers = 0, error = 0;
 	int buffers = 0, error = 0;
 
 
@@ -1414,8 +1465,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 			break;
 			break;
 
 
 		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		if (npages > PIPE_BUFFERS - buffers)
-			npages = PIPE_BUFFERS - buffers;
+		if (npages > pipe_buffers - buffers)
+			npages = pipe_buffers - buffers;
 
 
 		error = get_user_pages_fast((unsigned long)base, npages,
 		error = get_user_pages_fast((unsigned long)base, npages,
 					0, &pages[buffers]);
 					0, &pages[buffers]);
@@ -1450,7 +1501,7 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 		 * or if we mapped the max number of pages that we have
 		 * or if we mapped the max number of pages that we have
 		 * room for.
 		 * room for.
 		 */
 		 */
-		if (error < npages || buffers == PIPE_BUFFERS)
+		if (error < npages || buffers == pipe_buffers)
 			break;
 			break;
 
 
 		nr_vecs--;
 		nr_vecs--;
@@ -1593,8 +1644,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
 			     unsigned long nr_segs, unsigned int flags)
 			     unsigned long nr_segs, unsigned int flags)
 {
 {
 	struct pipe_inode_info *pipe;
 	struct pipe_inode_info *pipe;
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
 	struct splice_pipe_desc spd = {
 	struct splice_pipe_desc spd = {
 		.pages = pages,
 		.pages = pages,
 		.partial = partial,
 		.partial = partial,
@@ -1602,17 +1653,25 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
 		.ops = &user_page_pipe_buf_ops,
 		.ops = &user_page_pipe_buf_ops,
 		.spd_release = spd_release_page,
 		.spd_release = spd_release_page,
 	};
 	};
+	long ret;
 
 
 	pipe = pipe_info(file->f_path.dentry->d_inode);
 	pipe = pipe_info(file->f_path.dentry->d_inode);
 	if (!pipe)
 	if (!pipe)
 		return -EBADF;
 		return -EBADF;
 
 
-	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
-					    flags & SPLICE_F_GIFT);
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
+	spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
+					    spd.partial, flags & SPLICE_F_GIFT,
+					    pipe->buffers);
 	if (spd.nr_pages <= 0)
 	if (spd.nr_pages <= 0)
-		return spd.nr_pages;
+		ret = spd.nr_pages;
+	else
+		ret = splice_to_pipe(pipe, &spd);
 
 
-	return splice_to_pipe(pipe, &spd);
+	splice_shrink_spd(pipe, &spd);
+	return ret;
 }
 }
 
 
 /*
 /*
@@ -1738,13 +1797,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 	 * Check ->nrbufs without the inode lock first. This function
 	 * Check ->nrbufs without the inode lock first. This function
 	 * is speculative anyways, so missing one is ok.
 	 * is speculative anyways, so missing one is ok.
 	 */
 	 */
-	if (pipe->nrbufs < PIPE_BUFFERS)
+	if (pipe->nrbufs < pipe->buffers)
 		return 0;
 		return 0;
 
 
 	ret = 0;
 	ret = 0;
 	pipe_lock(pipe);
 	pipe_lock(pipe);
 
 
-	while (pipe->nrbufs >= PIPE_BUFFERS) {
+	while (pipe->nrbufs >= pipe->buffers) {
 		if (!pipe->readers) {
 		if (!pipe->readers) {
 			send_sig(SIGPIPE, current, 0);
 			send_sig(SIGPIPE, current, 0);
 			ret = -EPIPE;
 			ret = -EPIPE;
@@ -1810,7 +1869,7 @@ retry:
 		 * Cannot make any progress, because either the input
 		 * Cannot make any progress, because either the input
 		 * pipe is empty or the output pipe is full.
 		 * pipe is empty or the output pipe is full.
 		 */
 		 */
-		if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) {
+		if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
 			/* Already processed some buffers, break */
 			/* Already processed some buffers, break */
 			if (ret)
 			if (ret)
 				break;
 				break;
@@ -1831,7 +1890,7 @@ retry:
 		}
 		}
 
 
 		ibuf = ipipe->bufs + ipipe->curbuf;
 		ibuf = ipipe->bufs + ipipe->curbuf;
-		nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS;
+		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
 		obuf = opipe->bufs + nbuf;
 		obuf = opipe->bufs + nbuf;
 
 
 		if (len >= ibuf->len) {
 		if (len >= ibuf->len) {
@@ -1841,7 +1900,7 @@ retry:
 			*obuf = *ibuf;
 			*obuf = *ibuf;
 			ibuf->ops = NULL;
 			ibuf->ops = NULL;
 			opipe->nrbufs++;
 			opipe->nrbufs++;
-			ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS;
+			ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
 			ipipe->nrbufs--;
 			ipipe->nrbufs--;
 			input_wakeup = true;
 			input_wakeup = true;
 		} else {
 		} else {
@@ -1914,11 +1973,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 		 * If we have iterated all input buffers or ran out of
 		 * If we have iterated all input buffers or ran out of
 		 * output room, break.
 		 * output room, break.
 		 */
 		 */
-		if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
+		if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
 			break;
 			break;
 
 
-		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
-		nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
+		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
+		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
 
 
 		/*
 		/*
 		 * Get a reference to this pipe buffer,
 		 * Get a reference to this pipe buffer,
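
A pattern repeated throughout the splice changes above: callers keep small PIPE_DEF_BUFFERS stack arrays and only switch to heap-allocated ones when the pipe has been grown beyond the default. A condensed, non-literal sketch of that idiom from the point of view of a read-side helper inside fs/splice.c:

	struct page *pages[PIPE_DEF_BUFFERS];
	struct partial_page partial[PIPE_DEF_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages	 = pages,
		.partial = partial,
		.ops	 = &page_cache_pipe_buf_ops,	/* whichever pipe_buf_operations the caller uses */
		.spd_release = spd_release_page,
	};
	ssize_t ret;

	if (splice_grow_spd(pipe, &spd))	/* swaps in kmalloc'ed arrays when pipe->buffers > PIPE_DEF_BUFFERS */
		return -ENOMEM;

	/* ... fill spd.pages[] / spd.partial[], at most pipe->buffers entries ... */

	ret = splice_to_pipe(pipe, &spd);
	splice_shrink_spd(pipe, &spd);		/* frees the heap arrays again, if any were allocated */
	return ret;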

+ 1 - 1
fs/sync.c

@@ -42,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 	if (wait)
 	if (wait)
 		sync_inodes_sb(sb);
 		sync_inodes_sb(sb);
 	else
 	else
-		writeback_inodes_sb(sb);
+		writeback_inodes_sb_locked(sb);
 
 
 	if (sb->s_op->sync_fs)
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, wait);
 		sb->s_op->sync_fs(sb, wait);

+ 2 - 1
fs/xfs/linux-2.6/xfs_super.c

@@ -725,7 +725,8 @@ void
 xfs_blkdev_issue_flush(
 xfs_blkdev_issue_flush(
 	xfs_buftarg_t		*buftarg)
 	xfs_buftarg_t		*buftarg)
 {
 {
-	blkdev_issue_flush(buftarg->bt_bdev, NULL);
+	blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL,
+			BLKDEV_IFL_WAIT);
 }
 }
 
 
 STATIC void
 STATIC void
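
The reiserfs and XFS hunks above are representative of every caller conversion in this merge: blkdev_issue_flush() and blkdev_issue_discard() now take an explicit gfp mask plus a BLKDEV_IFL_* flags word (declared in the blkdev.h hunk below). A hedged sketch of the new calling convention, with sb, first_sector and nr_sectors standing in for a caller's own values:

	/* Synchronous cache flush of the filesystem's device. */
	blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);

	/* Discard a range and wait, ordered behind a barrier. */
	blkdev_issue_discard(sb->s_bdev, first_sector, nr_sectors, GFP_NOFS,
			     BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);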

+ 5 - 1
include/linux/backing-dev.h

@@ -14,6 +14,7 @@
 #include <linux/kernel.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/sched.h>
+#include <linux/timer.h>
 #include <linux/writeback.h>
 #include <linux/writeback.h>
 #include <asm/atomic.h>
 #include <asm/atomic.h>
 
 
@@ -88,6 +89,8 @@ struct backing_dev_info {
 
 
 	struct device *dev;
 	struct device *dev;
 
 
+	struct timer_list laptop_mode_wb_timer;
+
 #ifdef CONFIG_DEBUG_FS
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *debug_dir;
 	struct dentry *debug_dir;
 	struct dentry *debug_stats;
 	struct dentry *debug_stats;
@@ -103,9 +106,10 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 void bdi_unregister(struct backing_dev_info *bdi);
 void bdi_unregister(struct backing_dev_info *bdi);
 int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
 int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-				long nr_pages);
+				long nr_pages, int sb_locked);
 int bdi_writeback_task(struct bdi_writeback *wb);
 int bdi_writeback_task(struct bdi_writeback *wb);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
+void bdi_arm_supers_timer(void);
 
 
 extern spinlock_t bdi_lock;
 extern spinlock_t bdi_lock;
 extern struct list_head bdi_list;
 extern struct list_head bdi_list;

+ 58 - 12
include/linux/blkdev.h

@@ -186,15 +186,19 @@ struct request {
 	};
 	};
 
 
 	/*
 	/*
-	 * two pointers are available for the IO schedulers, if they need
+	 * Three pointers are available for the IO schedulers, if they need
 	 * more they have to dynamically allocate it.
 	 * more they have to dynamically allocate it.
 	 */
 	 */
 	void *elevator_private;
 	void *elevator_private;
 	void *elevator_private2;
 	void *elevator_private2;
+	void *elevator_private3;
 
 
 	struct gendisk *rq_disk;
 	struct gendisk *rq_disk;
 	unsigned long start_time;
 	unsigned long start_time;
-
+#ifdef CONFIG_BLK_CGROUP
+	unsigned long long start_time_ns;
+	unsigned long long io_start_time_ns;    /* when passed to hardware */
+#endif
 	/* Number of scatter-gather DMA addr+len pairs after
 	/* Number of scatter-gather DMA addr+len pairs after
 	 * physical address coalescing is performed.
 	 * physical address coalescing is performed.
 	 */
 	 */
@@ -917,7 +921,12 @@ extern void blk_abort_queue(struct request_queue *);
  */
  */
 extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn,
 extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn,
 					spinlock_t *lock, int node_id);
 					spinlock_t *lock, int node_id);
+extern struct request_queue *blk_init_allocated_queue_node(struct request_queue *,
+							   request_fn_proc *,
+							   spinlock_t *, int node_id);
 extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
 extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
+extern struct request_queue *blk_init_allocated_queue(struct request_queue *,
+						      request_fn_proc *, spinlock_t *);
 extern void blk_cleanup_queue(struct request_queue *);
 extern void blk_cleanup_queue(struct request_queue *);
 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
 extern void blk_queue_bounce_limit(struct request_queue *, u64);
 extern void blk_queue_bounce_limit(struct request_queue *, u64);
@@ -994,20 +1003,25 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 		return NULL;
 		return NULL;
 	return bqt->tag_index[tag];
 	return bqt->tag_index[tag];
 }
 }
-
-extern int blkdev_issue_flush(struct block_device *, sector_t *);
-#define DISCARD_FL_WAIT		0x01	/* wait for completion */
-#define DISCARD_FL_BARRIER	0x02	/* issue DISCARD_BARRIER request */
-extern int blkdev_issue_discard(struct block_device *, sector_t sector,
-		sector_t nr_sects, gfp_t, int flags);
-
+enum{
+	BLKDEV_WAIT,	/* wait for completion */
+	BLKDEV_BARRIER,	/*issue request with barrier */
+};
+#define BLKDEV_IFL_WAIT		(1 << BLKDEV_WAIT)
+#define BLKDEV_IFL_BARRIER	(1 << BLKDEV_BARRIER)
+extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *,
+			unsigned long);
+extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
+extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
+			sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
 static inline int sb_issue_discard(struct super_block *sb,
 static inline int sb_issue_discard(struct super_block *sb,
 				   sector_t block, sector_t nr_blocks)
 				   sector_t block, sector_t nr_blocks)
 {
 {
 	block <<= (sb->s_blocksize_bits - 9);
 	block <<= (sb->s_blocksize_bits - 9);
 	nr_blocks <<= (sb->s_blocksize_bits - 9);
 	nr_blocks <<= (sb->s_blocksize_bits - 9);
 	return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL,
 	return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL,
-				    DISCARD_FL_BARRIER);
+				   BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 }
 }
 
 
 extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
 extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
@@ -1196,6 +1210,39 @@ static inline void put_dev_sector(Sector p)
 struct work_struct;
 struct work_struct;
 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
 
 
+#ifdef CONFIG_BLK_CGROUP
+static inline void set_start_time_ns(struct request *req)
+{
+	req->start_time_ns = sched_clock();
+}
+
+static inline void set_io_start_time_ns(struct request *req)
+{
+	req->io_start_time_ns = sched_clock();
+}
+
+static inline uint64_t rq_start_time_ns(struct request *req)
+{
+        return req->start_time_ns;
+}
+
+static inline uint64_t rq_io_start_time_ns(struct request *req)
+{
+        return req->io_start_time_ns;
+}
+#else
+static inline void set_start_time_ns(struct request *req) {}
+static inline void set_io_start_time_ns(struct request *req) {}
+static inline uint64_t rq_start_time_ns(struct request *req)
+{
+	return 0;
+}
+static inline uint64_t rq_io_start_time_ns(struct request *req)
+{
+	return 0;
+}
+#endif
+
 #define MODULE_ALIAS_BLOCKDEV(major,minor) \
 #define MODULE_ALIAS_BLOCKDEV(major,minor) \
 	MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
 	MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
 #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
 #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
@@ -1283,8 +1330,7 @@ struct block_device_operations {
 	int (*direct_access) (struct block_device *, sector_t,
 	int (*direct_access) (struct block_device *, sector_t,
 						void **, unsigned long *);
 						void **, unsigned long *);
 	int (*media_changed) (struct gendisk *);
 	int (*media_changed) (struct gendisk *);
-	unsigned long long (*set_capacity) (struct gendisk *,
-						unsigned long long);
+	void (*unlock_native_capacity) (struct gendisk *);
 	int (*revalidate_disk) (struct gendisk *);
 	int (*revalidate_disk) (struct gendisk *);
 	int (*getgeo)(struct block_device *, struct hd_geometry *);
 	int (*getgeo)(struct block_device *, struct hd_geometry *);
 	struct module *owner;
 	struct module *owner;
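
The start_time_ns/io_start_time_ns helpers above exist for the group-accounting work in this series (blk-cgroup and CFQ) and compile away when CONFIG_BLK_CGROUP is off. A hedged sketch of the intended life cycle of the two timestamps, with now_ns, queue_ns and service_ns as hypothetical locals:

	set_start_time_ns(rq);				/* request enters the elevator */
	/* ... request sits in the queue ... */
	set_io_start_time_ns(rq);			/* request is passed to the hardware */
	/* ... completion path: split total latency into queueing and device time ... */
	queue_ns   = rq_io_start_time_ns(rq) - rq_start_time_ns(rq);
	service_ns = now_ns - rq_io_start_time_ns(rq);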

+ 3 - 2
include/linux/drbd.h

@@ -53,10 +53,10 @@
 
 
 
 
 extern const char *drbd_buildtag(void);
 extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.3.7"
+#define REL_VERSION "8.3.8rc1"
 #define API_VERSION 88
 #define API_VERSION 88
 #define PRO_VERSION_MIN 86
 #define PRO_VERSION_MIN 86
-#define PRO_VERSION_MAX 92
+#define PRO_VERSION_MAX 94
 
 
 
 
 enum drbd_io_error_p {
 enum drbd_io_error_p {
@@ -139,6 +139,7 @@ enum drbd_ret_codes {
 	ERR_DATA_NOT_CURRENT	= 150,
 	ERR_DATA_NOT_CURRENT	= 150,
 	ERR_CONNECTED		= 151, /* DRBD 8.3 only */
 	ERR_CONNECTED		= 151, /* DRBD 8.3 only */
 	ERR_PERM		= 152,
 	ERR_PERM		= 152,
+	ERR_NEED_APV_93		= 153,
 
 
 	/* insert new ones above this line */
 	/* insert new ones above this line */
 	AFTER_LAST_ERR_CODE
 	AFTER_LAST_ERR_CODE

+ 16 - 0
include/linux/drbd_limits.h

@@ -133,5 +133,21 @@
 #define DRBD_MAX_BIO_BVECS_MAX 128
 #define DRBD_MAX_BIO_BVECS_MAX 128
 #define DRBD_MAX_BIO_BVECS_DEF 0
 #define DRBD_MAX_BIO_BVECS_DEF 0
 
 
+#define DRBD_DP_VOLUME_MIN 4
+#define DRBD_DP_VOLUME_MAX 1048576
+#define DRBD_DP_VOLUME_DEF 16384
+
+#define DRBD_DP_INTERVAL_MIN 1
+#define DRBD_DP_INTERVAL_MAX 600
+#define DRBD_DP_INTERVAL_DEF 5
+
+#define DRBD_RS_THROTTLE_TH_MIN 1
+#define DRBD_RS_THROTTLE_TH_MAX 600
+#define DRBD_RS_THROTTLE_TH_DEF 20
+
+#define DRBD_RS_HOLD_OFF_TH_MIN 1
+#define DRBD_RS_HOLD_OFF_TH_MAX 6000
+#define DRBD_RS_HOLD_OFF_TH_DEF 100
+
 #undef RANGE
 #undef RANGE
 #endif
 #endif

+ 5 - 0
include/linux/drbd_nl.h

@@ -71,12 +71,17 @@ NL_PACKET(disconnect, 6, )
 NL_PACKET(resize, 7,
 NL_PACKET(resize, 7,
 	NL_INT64(		29,	T_MAY_IGNORE,	resize_size)
 	NL_INT64(		29,	T_MAY_IGNORE,	resize_size)
 	NL_BIT(			68,	T_MAY_IGNORE,	resize_force)
 	NL_BIT(			68,	T_MAY_IGNORE,	resize_force)
+	NL_BIT(			69,	T_MANDATORY,	no_resync)
 )
 )
 
 
 NL_PACKET(syncer_conf, 8,
 NL_PACKET(syncer_conf, 8,
 	NL_INTEGER(	30,	T_MAY_IGNORE,	rate)
 	NL_INTEGER(	30,	T_MAY_IGNORE,	rate)
 	NL_INTEGER(	31,	T_MAY_IGNORE,	after)
 	NL_INTEGER(	31,	T_MAY_IGNORE,	after)
 	NL_INTEGER(	32,	T_MAY_IGNORE,	al_extents)
 	NL_INTEGER(	32,	T_MAY_IGNORE,	al_extents)
+	NL_INTEGER(     71,	T_MAY_IGNORE,	dp_volume)
+	NL_INTEGER(     72,	T_MAY_IGNORE,	dp_interval)
+	NL_INTEGER(     73,	T_MAY_IGNORE,	throttle_th)
+	NL_INTEGER(     74,	T_MAY_IGNORE,	hold_off_th)
 	NL_STRING(      52,     T_MAY_IGNORE,   verify_alg,     SHARED_SECRET_MAX)
 	NL_STRING(      52,     T_MAY_IGNORE,   verify_alg,     SHARED_SECRET_MAX)
 	NL_STRING(      51,     T_MAY_IGNORE,   cpu_mask,       32)
 	NL_STRING(      51,     T_MAY_IGNORE,   cpu_mask,       32)
 	NL_STRING(	64,	T_MAY_IGNORE,	csums_alg,	SHARED_SECRET_MAX)
 	NL_STRING(	64,	T_MAY_IGNORE,	csums_alg,	SHARED_SECRET_MAX)

+ 6 - 0
include/linux/elevator.h

@@ -14,6 +14,9 @@ typedef void (elevator_merged_fn) (struct request_queue *, struct request *, int
 
 
 typedef int (elevator_allow_merge_fn) (struct request_queue *, struct request *, struct bio *);
 typedef int (elevator_allow_merge_fn) (struct request_queue *, struct request *, struct bio *);
 
 
+typedef void (elevator_bio_merged_fn) (struct request_queue *,
+						struct request *, struct bio *);
+
 typedef int (elevator_dispatch_fn) (struct request_queue *, int);
 typedef int (elevator_dispatch_fn) (struct request_queue *, int);
 
 
 typedef void (elevator_add_req_fn) (struct request_queue *, struct request *);
 typedef void (elevator_add_req_fn) (struct request_queue *, struct request *);
@@ -36,6 +39,7 @@ struct elevator_ops
 	elevator_merged_fn *elevator_merged_fn;
 	elevator_merged_fn *elevator_merged_fn;
 	elevator_merge_req_fn *elevator_merge_req_fn;
 	elevator_merge_req_fn *elevator_merge_req_fn;
 	elevator_allow_merge_fn *elevator_allow_merge_fn;
 	elevator_allow_merge_fn *elevator_allow_merge_fn;
+	elevator_bio_merged_fn *elevator_bio_merged_fn;
 
 
 	elevator_dispatch_fn *elevator_dispatch_fn;
 	elevator_dispatch_fn *elevator_dispatch_fn;
 	elevator_add_req_fn *elevator_add_req_fn;
 	elevator_add_req_fn *elevator_add_req_fn;
@@ -103,6 +107,8 @@ extern int elv_merge(struct request_queue *, struct request **, struct bio *);
 extern void elv_merge_requests(struct request_queue *, struct request *,
 extern void elv_merge_requests(struct request_queue *, struct request *,
 			       struct request *);
 			       struct request *);
 extern void elv_merged_request(struct request_queue *, struct request *, int);
 extern void elv_merged_request(struct request_queue *, struct request *, int);
+extern void elv_bio_merged(struct request_queue *q, struct request *,
+				struct bio *);
 extern void elv_requeue_request(struct request_queue *, struct request *);
 extern void elv_requeue_request(struct request_queue *, struct request *);
 extern int elv_queue_empty(struct request_queue *);
 extern int elv_queue_empty(struct request_queue *);
 extern struct request *elv_former_request(struct request_queue *, struct request *);
 extern struct request *elv_former_request(struct request_queue *, struct request *);
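
The new elevator_bio_merged_fn hook above gives an I/O scheduler a callback whenever a bio is merged into an existing request, with elv_bio_merged() as the block-core entry point; the CFQ changes in this merge appear to be the first user, for per-group merge statistics. A hedged sketch of how a hypothetical scheduler "foo" would wire it up (only the new hook is of interest here):

	static void foo_bio_merged(struct request_queue *q, struct request *rq,
				   struct bio *bio)
	{
		/* e.g. account the merged bytes to rq's owning group */
	}

	static struct elevator_type iosched_foo = {
		.ops = {
			.elevator_bio_merged_fn	= foo_bio_merged,
			/* ... the usual merge/dispatch/add hooks ... */
		},
		.elevator_name	= "foo",
		.elevator_owner	= THIS_MODULE,
	};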

+ 6 - 0
include/linux/fcntl.h

@@ -21,6 +21,12 @@
  */
  */
 #define F_NOTIFY	(F_LINUX_SPECIFIC_BASE+2)
 #define F_NOTIFY	(F_LINUX_SPECIFIC_BASE+2)
 
 
+/*
+ * Set and get of pipe page size array
+ */
+#define F_SETPIPE_SZ	(F_LINUX_SPECIFIC_BASE + 7)
+#define F_GETPIPE_SZ	(F_LINUX_SPECIFIC_BASE + 8)
+
 /*
 /*
  * Types of directory notifications that may be requested.
  * Types of directory notifications that may be requested.
  */
  */

+ 1 - 0
include/linux/fs.h

@@ -651,6 +651,7 @@ struct block_device {
 	int			bd_openers;
 	int			bd_openers;
 	struct mutex		bd_mutex;	/* open/close mutex */
 	struct mutex		bd_mutex;	/* open/close mutex */
 	struct list_head	bd_inodes;
 	struct list_head	bd_inodes;
+	void *			bd_claiming;
 	void *			bd_holder;
 	void *			bd_holder;
 	int			bd_holders;
 	int			bd_holders;
 #ifdef CONFIG_SYSFS
 #ifdef CONFIG_SYSFS

+ 1 - 1
include/linux/ide.h

@@ -362,7 +362,7 @@ struct ide_drive_s;
 struct ide_disk_ops {
 struct ide_disk_ops {
 	int		(*check)(struct ide_drive_s *, const char *);
 	int		(*check)(struct ide_drive_s *, const char *);
 	int		(*get_capacity)(struct ide_drive_s *);
 	int		(*get_capacity)(struct ide_drive_s *);
-	u64		(*set_capacity)(struct ide_drive_s *, u64);
+	void		(*unlock_native_capacity)(struct ide_drive_s *);
 	void		(*setup)(struct ide_drive_s *);
 	void		(*setup)(struct ide_drive_s *);
 	void		(*flush)(struct ide_drive_s *);
 	void		(*flush)(struct ide_drive_s *);
 	int		(*init_media)(struct ide_drive_s *, struct gendisk *);
 	int		(*init_media)(struct ide_drive_s *, struct gendisk *);

+ 9 - 4
include/linux/pipe_fs_i.h

@@ -3,7 +3,7 @@
 
 #define PIPEFS_MAGIC 0x50495045
 
-#define PIPE_BUFFERS (16)
+#define PIPE_DEF_BUFFERS	16
 
 #define PIPE_BUF_FLAG_LRU	0x01	/* page is on the LRU */
 #define PIPE_BUF_FLAG_ATOMIC	0x02	/* was atomically mapped */
@@ -44,17 +44,17 @@ struct pipe_buffer {
  **/
 struct pipe_inode_info {
 	wait_queue_head_t wait;
-	unsigned int nrbufs, curbuf;
-	struct page *tmp_page;
+	unsigned int nrbufs, curbuf, buffers;
 	unsigned int readers;
 	unsigned int writers;
 	unsigned int waiting_writers;
 	unsigned int r_counter;
 	unsigned int w_counter;
+	struct page *tmp_page;
 	struct fasync_struct *fasync_readers;
 	struct fasync_struct *fasync_writers;
 	struct inode *inode;
-	struct pipe_buffer bufs[PIPE_BUFFERS];
+	struct pipe_buffer *bufs;
 };
 
 /*
@@ -139,6 +139,8 @@ void pipe_lock(struct pipe_inode_info *);
 void pipe_unlock(struct pipe_inode_info *);
 void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *);
 
+extern unsigned int pipe_max_pages;
+
 /* Drop the inode semaphore and wait for a pipe event, atomically */
 void pipe_wait(struct pipe_inode_info *pipe);
 
@@ -154,4 +156,7 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *);
 int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
 void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);
 
+/* for F_SETPIPE_SZ and F_GETPIPE_SZ */
+long pipe_fcntl(struct file *, unsigned int, unsigned long arg);
+
 #endif

+ 7 - 0
include/linux/splice.h

@@ -82,4 +82,11 @@ extern ssize_t splice_to_pipe(struct pipe_inode_info *,
 extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *,
 				      splice_direct_actor *);
 
+/*
+ * for dynamic pipe sizing
+ */
+extern int splice_grow_spd(struct pipe_inode_info *, struct splice_pipe_desc *);
+extern void splice_shrink_spd(struct pipe_inode_info *,
+				struct splice_pipe_desc *);
+
 #endif
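splice_grow_spd() and splice_shrink_spd() are the helpers every splice producer below (kernel/relay.c, kernel/trace/trace.c, net/core/skbuff.c) is converted to use. A condensed sketch of that pattern, for a hypothetical caller; my_fill_pages() and my_pipe_buf_ops are placeholders, not symbols from this series:

static ssize_t my_splice_read(struct pipe_inode_info *pipe, size_t len,
			      unsigned int flags)
{
	/* Small on-stack arrays cover the default pipe size ... */
	struct page *pages[PIPE_DEF_BUFFERS];
	struct partial_page partial[PIPE_DEF_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages		= pages,
		.partial	= partial,
		.flags		= flags,
		.ops		= &my_pipe_buf_ops,	/* placeholder ops */
	};
	ssize_t ret;

	/* ... and splice_grow_spd() swaps in allocated arrays when the
	 * pipe has been grown beyond PIPE_DEF_BUFFERS. */
	if (splice_grow_spd(pipe, &spd))
		return -ENOMEM;

	/* Producers now loop up to pipe->buffers, not a fixed PIPE_BUFFERS. */
	spd.nr_pages = my_fill_pages(spd.pages, spd.partial, pipe->buffers, len);

	ret = spd.nr_pages ? splice_to_pipe(pipe, &spd) : 0;

	/* Always undo the grow, including on error paths. */
	splice_shrink_spd(pipe, &spd);
	return ret;
}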

+ 17 - 1
include/linux/writeback.h

@@ -65,6 +65,15 @@ struct writeback_control {
 	 * so we use a single control to update them
 	 */
 	unsigned no_nrwrite_index_update:1;
+
+	/*
+	 * For WB_SYNC_ALL, the sb must always be pinned. For WB_SYNC_NONE,
+	 * the writeback code will pin the sb for the caller. However,
+	 * for eg umount, the caller does WB_SYNC_NONE but already has
+	 * the sb pinned. If the below is set, caller already has the
+	 * sb pinned.
+	 */
+	unsigned sb_pinned:1;
 };
 
 /*
@@ -73,6 +82,7 @@ struct writeback_control {
 struct bdi_writeback;
 int inode_wait(void *);
 void writeback_inodes_sb(struct super_block *);
+void writeback_inodes_sb_locked(struct super_block *);
 int writeback_inodes_sb_if_idle(struct super_block *);
 void sync_inodes_sb(struct super_block *);
 void writeback_inodes_wbc(struct writeback_control *wbc);
@@ -96,8 +106,14 @@ static inline void inode_sync_wait(struct inode *inode)
 /*
  * mm/page-writeback.c
  */
-void laptop_io_completion(void);
+#ifdef CONFIG_BLOCK
+void laptop_io_completion(struct backing_dev_info *info);
 void laptop_sync_completion(void);
+void laptop_mode_sync(struct work_struct *work);
+void laptop_mode_timer_fn(unsigned long data);
+#else
+static inline void laptop_sync_completion(void) { }
+#endif
 void throttle_vm_writeout(gfp_t gfp_mask);
 
 /* These are exported to sysctl. */

+ 27 - 0
init/Kconfig

@@ -611,6 +611,33 @@ config RT_GROUP_SCHED
 
 endif #CGROUP_SCHED
 
+config BLK_CGROUP
+	tristate "Block IO controller"
+	depends on CGROUPS && BLOCK
+	default n
+	---help---
+	Generic block IO controller cgroup interface. This is the common
+	cgroup interface which should be used by various IO controlling
+	policies.
+
+	Currently, CFQ IO scheduler uses it to recognize task groups and
+	control disk bandwidth allocation (proportional time slice allocation)
+	to such task groups.
+
+	This option only enables generic Block IO controller infrastructure.
+	One needs to also enable actual IO controlling logic in CFQ for it
+	to take effect. (CONFIG_CFQ_GROUP_IOSCHED=y).
+
+	See Documentation/cgroups/blkio-controller.txt for more information.
+
+config DEBUG_BLK_CGROUP
+	bool "Enable Block IO controller debugging"
+	depends on BLK_CGROUP
+	default n
+	---help---
+	Enable some debugging help. Currently it exports additional stat
+	files in a cgroup which can be useful for debugging.
+
 endif # CGROUPS
 
 config MM_OWNER

+ 10 - 5
kernel/relay.c

@@ -1231,8 +1231,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
 	size_t read_subbuf = read_start / subbuf_size;
 	size_t padding = rbuf->padding[read_subbuf];
 	size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
 	struct splice_pipe_desc spd = {
 		.pages = pages,
 		.nr_pages = 0,
@@ -1245,6 +1245,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
 
 	if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
 		return 0;
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
 
 	/*
 	 * Adjust read len, if longer than what is available
@@ -1255,7 +1257,7 @@ static ssize_t subbuf_splice_actor(struct file *in,
 	subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
 	pidx = (read_start / PAGE_SIZE) % subbuf_pages;
 	poff = read_start & ~PAGE_MASK;
-	nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS);
+	nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers);
 
 	for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
 		unsigned int this_len, this_end, private;
@@ -1289,16 +1291,19 @@ static ssize_t subbuf_splice_actor(struct file *in,
 		}
 	}
 
+	ret = 0;
 	if (!spd.nr_pages)
-		return 0;
+		goto out;
 
 	ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
 	if (ret < 0 || ret < total_len)
-		return ret;
+		goto out;
 
         if (read_start + ret == nonpad_end)
                 ret += padding;
 
+out:
+	splice_shrink_spd(pipe, &spd);
         return ret;
 }
 

+ 1 - 0
kernel/sched_clock.c

@@ -41,6 +41,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
 	return (unsigned long long)(jiffies - INITIAL_JIFFIES)
 					* (NSEC_PER_SEC / HZ);
 }
+EXPORT_SYMBOL_GPL(sched_clock);
 
 static __read_mostly int sched_clock_running;
 

+ 9 - 0
kernel/sysctl.c

@@ -52,6 +52,7 @@
 #include <linux/slow-work.h>
 #include <linux/perf_event.h>
 #include <linux/kprobes.h>
+#include <linux/pipe_fs_i.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -1444,6 +1445,14 @@ static struct ctl_table fs_table[] = {
 		.child		= binfmt_misc_table,
 	},
 #endif
+	{
+		.procname	= "pipe-max-pages",
+		.data		= &pipe_max_pages,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &two,
+	},
 /*
 * NOTE: do not add new entries to this table unless you have read
 * Documentation/sysctl/ctl_unnumbered.txt
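The fs_table entry above exposes pipe_max_pages as /proc/sys/fs/pipe-max-pages, the cap this series applies to unprivileged F_SETPIPE_SZ callers. A trivial userspace check of the tunable, offered only as an assumed illustration of that path:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/fs/pipe-max-pages", "r");
	int max;

	/* Print the current limit if the file exists on this kernel. */
	if (f && fscanf(f, "%d", &max) == 1)
		printf("pipe-max-pages = %d\n", max);
	if (f)
		fclose(f);
	return 0;
}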

+ 36 - 24
kernel/trace/trace.c

@@ -3309,12 +3309,12 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 					size_t len,
 					unsigned int flags)
 {
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
+	struct page *pages_def[PIPE_DEF_BUFFERS];
+	struct partial_page partial_def[PIPE_DEF_BUFFERS];
 	struct trace_iterator *iter = filp->private_data;
 	struct splice_pipe_desc spd = {
-		.pages		= pages,
-		.partial	= partial,
+		.pages		= pages_def,
+		.partial	= partial_def,
 		.nr_pages	= 0, /* This gets updated below. */
 		.flags		= flags,
 		.ops		= &tracing_pipe_buf_ops,
@@ -3325,6 +3325,9 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 	size_t rem;
 	unsigned int i;
 
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
 	/* copy the tracer to avoid using a global lock all around */
 	mutex_lock(&trace_types_lock);
 	if (unlikely(old_tracer != current_trace && current_trace)) {
@@ -3355,23 +3358,23 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 	trace_access_lock(iter->cpu_file);
 
 	/* Fill as many pages as possible. */
-	for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
-		pages[i] = alloc_page(GFP_KERNEL);
-		if (!pages[i])
+	for (i = 0, rem = len; i < pipe->buffers && rem; i++) {
+		spd.pages[i] = alloc_page(GFP_KERNEL);
+		if (!spd.pages[i])
 			break;
 
 		rem = tracing_fill_pipe_page(rem, iter);
 
 		/* Copy the data into the page, so we can start over. */
 		ret = trace_seq_to_buffer(&iter->seq,
-					  page_address(pages[i]),
+					  page_address(spd.pages[i]),
 					  iter->seq.len);
 		if (ret < 0) {
-			__free_page(pages[i]);
+			__free_page(spd.pages[i]);
 			break;
 		}
-		partial[i].offset = 0;
-		partial[i].len = iter->seq.len;
+		spd.partial[i].offset = 0;
+		spd.partial[i].len = iter->seq.len;
 
 		trace_seq_init(&iter->seq);
 	}
@@ -3382,12 +3385,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 
 	spd.nr_pages = i;
 
-	return splice_to_pipe(pipe, &spd);
+	ret = splice_to_pipe(pipe, &spd);
+out:
+	splice_shrink_spd(pipe, &spd);
+	return ret;
 
 out_err:
 	mutex_unlock(&iter->mutex);
-
-	return ret;
+	goto out;
 }
 
 static ssize_t
@@ -3786,11 +3791,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 			    unsigned int flags)
 {
 	struct ftrace_buffer_info *info = file->private_data;
-	struct partial_page partial[PIPE_BUFFERS];
-	struct page *pages[PIPE_BUFFERS];
+	struct partial_page partial_def[PIPE_DEF_BUFFERS];
+	struct page *pages_def[PIPE_DEF_BUFFERS];
 	struct splice_pipe_desc spd = {
-		.pages		= pages,
-		.partial	= partial,
+		.pages		= pages_def,
+		.partial	= partial_def,
 		.flags		= flags,
 		.ops		= &buffer_pipe_buf_ops,
 		.spd_release	= buffer_spd_release,
@@ -3799,22 +3804,28 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 	int entries, size, i;
 	size_t ret;
 
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
 	if (*ppos & (PAGE_SIZE - 1)) {
 		WARN_ONCE(1, "Ftrace: previous read must page-align\n");
-		return -EINVAL;
+		ret = -EINVAL;
+		goto out;
 	}
 
 	if (len & (PAGE_SIZE - 1)) {
 		WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
-		if (len < PAGE_SIZE)
-			return -EINVAL;
+		if (len < PAGE_SIZE) {
+			ret = -EINVAL;
+			goto out;
+		}
 		len &= PAGE_MASK;
 	}
 
 	trace_access_lock(info->cpu);
 	entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
 
-	for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
+	for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
 		struct page *page;
 		int r;
 
@@ -3869,11 +3880,12 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		else
 			ret = 0;
 		/* TODO: block */
-		return ret;
+		goto out;
 	}
 
 	ret = splice_to_pipe(pipe, &spd);
-
+	splice_shrink_spd(pipe, &spd);
+out:
 	return ret;
 }
 

+ 10 - 5
mm/backing-dev.c

@@ -48,7 +48,6 @@ static struct timer_list sync_supers_timer;
 
 static int bdi_sync_supers(void *);
 static void sync_supers_timer_fn(unsigned long);
-static void arm_supers_timer(void);
 
 static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
 
@@ -252,7 +251,7 @@ static int __init default_bdi_init(void)
 
 	init_timer(&sync_supers_timer);
 	setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
-	arm_supers_timer();
+	bdi_arm_supers_timer();
 
 	err = bdi_init(&default_backing_dev_info);
 	if (!err)
@@ -374,10 +373,13 @@ static int bdi_sync_supers(void *unused)
 	return 0;
 }
 
-static void arm_supers_timer(void)
+void bdi_arm_supers_timer(void)
 {
 	unsigned long next;
 
+	if (!dirty_writeback_interval)
+		return;
+
 	next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
 	mod_timer(&sync_supers_timer, round_jiffies_up(next));
 }
@@ -385,7 +387,7 @@ static void arm_supers_timer(void)
 static void sync_supers_timer_fn(unsigned long unused)
 {
 	wake_up_process(sync_supers_tsk);
-	arm_supers_timer();
+	bdi_arm_supers_timer();
 }
 
 static int bdi_forker_task(void *ptr)
@@ -428,7 +430,10 @@ static int bdi_forker_task(void *ptr)
 
 			spin_unlock_bh(&bdi_lock);
 			wait = msecs_to_jiffies(dirty_writeback_interval * 10);
-			schedule_timeout(wait);
+			if (wait)
+				schedule_timeout(wait);
+			else
+				schedule();
 			try_to_freeze();
 			continue;
 		}

+ 24 - 20
mm/page-writeback.c

@@ -597,7 +597,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 	    (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
 			       + global_page_state(NR_UNSTABLE_NFS))
 					  > background_thresh)))
-		bdi_start_writeback(bdi, NULL, 0);
+		bdi_start_writeback(bdi, NULL, 0, 0);
 }
 
 void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -683,10 +683,6 @@ void throttle_vm_writeout(gfp_t gfp_mask)
         }
 }
 
-static void laptop_timer_fn(unsigned long unused);
-
-static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
-
 /*
  * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
  */
@@ -694,24 +690,24 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
 	proc_dointvec(table, write, buffer, length, ppos);
+	bdi_arm_supers_timer();
 	return 0;
 }
 
-static void do_laptop_sync(struct work_struct *work)
+#ifdef CONFIG_BLOCK
+void laptop_mode_timer_fn(unsigned long data)
 {
-	wakeup_flusher_threads(0);
-	kfree(work);
-}
+	struct request_queue *q = (struct request_queue *)data;
+	int nr_pages = global_page_state(NR_FILE_DIRTY) +
+		global_page_state(NR_UNSTABLE_NFS);
 
-static void laptop_timer_fn(unsigned long unused)
-{
-	struct work_struct *work;
+	/*
+	 * We want to write everything out, not just down to the dirty
+	 * threshold
+	 */
 
-	work = kmalloc(sizeof(*work), GFP_ATOMIC);
-	if (work) {
-		INIT_WORK(work, do_laptop_sync);
-		schedule_work(work);
-	}
+	if (bdi_has_dirty_io(&q->backing_dev_info))
+		bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages, 0);
 }
 
 /*
@@ -719,9 +715,9 @@ static void laptop_timer_fn(unsigned long unused)
  * of all dirty data a few seconds from now.  If the flush is already scheduled
  * then push it back - the user is still using the disk.
  */
-void laptop_io_completion(void)
+void laptop_io_completion(struct backing_dev_info *info)
 {
-	mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode);
+	mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
 }
 
 /*
@@ -731,8 +727,16 @@ void laptop_io_completion(void)
 */
 void laptop_sync_completion(void)
 {
-	del_timer(&laptop_mode_wb_timer);
+	struct backing_dev_info *bdi;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
+		del_timer(&bdi->laptop_mode_wb_timer);
+
+	rcu_read_unlock();
 }
+#endif
 
 /*
 * If ratelimit_pages is too high then we can get into dirty-data overload

+ 6 - 3
mm/swapfile.c

@@ -139,7 +139,8 @@ static int discard_swap(struct swap_info_struct *si)
 	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
 	if (nr_blocks) {
 		err = blkdev_issue_discard(si->bdev, start_block,
-				nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
+				nr_blocks, GFP_KERNEL,
+				BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 		if (err)
 			return err;
 		cond_resched();
@@ -150,7 +151,8 @@ static int discard_swap(struct swap_info_struct *si)
 		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
 
 		err = blkdev_issue_discard(si->bdev, start_block,
-				nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
+				nr_blocks, GFP_KERNEL,
+				BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 		if (err)
 			break;
 
@@ -189,7 +191,8 @@ static void discard_swap_cluster(struct swap_info_struct *si,
 			start_block <<= PAGE_SHIFT - 9;
 			nr_blocks <<= PAGE_SHIFT - 9;
 			if (blkdev_issue_discard(si->bdev, start_block,
-				    nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER))
+				    nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT |
+							BLKDEV_IFL_BARRIER))
 				break;
 		}
 

+ 21 - 17
net/core/skbuff.c

@@ -1406,12 +1406,13 @@ new_page:
 /*
  * Fill page/offset/length into spd, if it can hold more pages.
  */
-static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page,
+static inline int spd_fill_page(struct splice_pipe_desc *spd,
+				struct pipe_inode_info *pipe, struct page *page,
 				unsigned int *len, unsigned int offset,
 				struct sk_buff *skb, int linear,
 				struct sock *sk)
 {
-	if (unlikely(spd->nr_pages == PIPE_BUFFERS))
+	if (unlikely(spd->nr_pages == pipe->buffers))
 		return 1;
 
 	if (linear) {
@@ -1447,7 +1448,8 @@ static inline int __splice_segment(struct page *page, unsigned int poff,
 				   unsigned int plen, unsigned int *off,
 				   unsigned int *len, struct sk_buff *skb,
 				   struct splice_pipe_desc *spd, int linear,
-				   struct sock *sk)
+				   struct sock *sk,
+				   struct pipe_inode_info *pipe)
 {
 	if (!*len)
 		return 1;
@@ -1470,7 +1472,7 @@ static inline int __splice_segment(struct page *page, unsigned int poff,
 		/* the linear region may spread across several pages  */
 		flen = min_t(unsigned int, flen, PAGE_SIZE - poff);
 
-		if (spd_fill_page(spd, page, &flen, poff, skb, linear, sk))
+		if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk))
 			return 1;
 
 		__segment_seek(&page, &poff, &plen, flen);
@@ -1485,9 +1487,9 @@ static inline int __splice_segment(struct page *page, unsigned int poff,
  * Map linear and fragment data from the skb to spd. It reports failure if the
  * pipe is full or if we already spliced the requested length.
  */
-static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
-			     unsigned int *len, struct splice_pipe_desc *spd,
-			     struct sock *sk)
+static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
+			     unsigned int *offset, unsigned int *len,
+			     struct splice_pipe_desc *spd, struct sock *sk)
 {
 	int seg;
 
@@ -1497,7 +1499,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
 	if (__splice_segment(virt_to_page(skb->data),
 			     (unsigned long) skb->data & (PAGE_SIZE - 1),
 			     skb_headlen(skb),
-			     offset, len, skb, spd, 1, sk))
+			     offset, len, skb, spd, 1, sk, pipe))
 		return 1;
 
 	/*
@@ -1507,7 +1509,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
 		const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
 
 		if (__splice_segment(f->page, f->page_offset, f->size,
-				     offset, len, skb, spd, 0, sk))
+				     offset, len, skb, spd, 0, sk, pipe))
 			return 1;
 	}
 
@@ -1524,8 +1526,8 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
 		    struct pipe_inode_info *pipe, unsigned int tlen,
 		    unsigned int flags)
 {
-	struct partial_page partial[PIPE_BUFFERS];
-	struct page *pages[PIPE_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
 	struct splice_pipe_desc spd = {
 		.pages = pages,
 		.partial = partial,
@@ -1535,12 +1537,16 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
 	};
 	struct sk_buff *frag_iter;
 	struct sock *sk = skb->sk;
+	int ret = 0;
+
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
 
 	/*
	 * __skb_splice_bits() only fails if the output has no room left,
	 * so no point in going over the frag_list for the error case.
	 */
-	if (__skb_splice_bits(skb, &offset, &tlen, &spd, sk))
+	if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk))
 		goto done;
 	else if (!tlen)
 		goto done;
@@ -1551,14 +1557,12 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
 	skb_walk_frags(skb, frag_iter) {
 		if (!tlen)
 			break;
-		if (__skb_splice_bits(frag_iter, &offset, &tlen, &spd, sk))
+		if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk))
 			break;
 	}
 
 done:
 	if (spd.nr_pages) {
-		int ret;
-
 		/*
		 * Drop the socket lock, otherwise we have reverse
		 * locking dependencies between sk_lock and i_mutex
@@ -1571,10 +1575,10 @@ done:
 		release_sock(sk);
 		ret = splice_to_pipe(pipe, &spd);
 		lock_sock(sk);
-		return ret;
 	}
 
-	return 0;
+	splice_shrink_spd(pipe, &spd);
+	return ret;
 }
 
 /**