Browse Source

block: Export I/O topology for block devices and partitions

To support devices with physical block sizes bigger than 512 bytes we
need to ensure proper alignment.  This patch adds support for exposing
I/O topology characteristics as devices are stacked.

  logical_block_size is the smallest unit the device can address.

  physical_block_size indicates the smallest I/O the device can write
  without incurring a read-modify-write penalty.

  The io_min parameter is the smallest preferred I/O size reported by
  the device.  In many cases this is the same as the physical block
  size.  However, the io_min parameter can be scaled up when stacking
  (RAID5 chunk size > physical block size).

  The io_opt characteristic indicates the optimal I/O size reported by
  the device.  This is usually the stripe width for arrays.

  The alignment_offset parameter indicates the number of bytes the start
  of the device/partition is offset from the device's natural alignment.
  Partition tools and MD/DM utilities can use this to pad their offsets
  so filesystems start on proper boundaries.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Martin K. Petersen 16 years ago
parent
commit
c72758f337

+ 59 - 0
Documentation/ABI/testing/sysfs-block

@@ -60,3 +60,62 @@ Description:
 		Indicates whether the block layer should automatically
 		Indicates whether the block layer should automatically
 		generate checksums for write requests bound for
 		generate checksums for write requests bound for
 		devices that support receiving integrity metadata.
 		devices that support receiving integrity metadata.
+
+What:		/sys/block/<disk>/alignment_offset
+Date:		April 2009
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Storage devices may report a physical block size that is
+		bigger than the logical block size (for instance a drive
+		with 4KB physical sectors exposing 512-byte logical
+		blocks to the operating system).  This parameter
+		indicates how many bytes the beginning of the device is
+		offset from the disk's natural alignment.
+
+What:		/sys/block/<disk>/<partition>/alignment_offset
+Date:		April 2009
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Storage devices may report a physical block size that is
+		bigger than the logical block size (for instance a drive
+		with 4KB physical sectors exposing 512-byte logical
+		blocks to the operating system).  This parameter
+		indicates how many bytes the beginning of the partition
+		is offset from the disk's natural alignment.
+
+What:		/sys/block/<disk>/queue/logical_block_size
+Date:		May 2009
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		This is the smallest unit the storage device can
+		address.  It is typically 512 bytes.
+
+What:		/sys/block/<disk>/queue/physical_block_size
+Date:		May 2009
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		This is the smallest unit the storage device can write
+		without resorting to read-modify-write operation.  It is
+		usually the same as the logical block size but may be
+		bigger.  One example is SATA drives with 4KB sectors
+		that expose a 512-byte logical block size to the
+		operating system.
+
+What:		/sys/block/<disk>/queue/minimum_io_size
+Date:		April 2009
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Storage devices may report a preferred minimum I/O size,
+		which is the smallest request the device can perform
+		without incurring a read-modify-write penalty.  For disk
+		drives this is often the physical block size.  For RAID
+		arrays it is often the stripe chunk size.
+
+What:		/sys/block/<disk>/queue/optimal_io_size
+Date:		April 2009
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Storage devices may report an optimal I/O size, which is
+		the device's preferred unit of receiving I/O.  This is
+		rarely reported for disk drives.  For RAID devices it is
+		usually the stripe width or the internal block size.

+ 186 - 0
block/blk-settings.c

@@ -309,9 +309,94 @@ EXPORT_SYMBOL(blk_queue_max_segment_size);
 void blk_queue_logical_block_size(struct request_queue *q, unsigned short size)
 void blk_queue_logical_block_size(struct request_queue *q, unsigned short size)
 {
 {
 	q->limits.logical_block_size = size;
 	q->limits.logical_block_size = size;
+
+	if (q->limits.physical_block_size < size)
+		q->limits.physical_block_size = size;
+
+	if (q->limits.io_min < q->limits.physical_block_size)
+		q->limits.io_min = q->limits.physical_block_size;
 }
 }
 EXPORT_SYMBOL(blk_queue_logical_block_size);
 EXPORT_SYMBOL(blk_queue_logical_block_size);
 
 
+/**
+ * blk_queue_physical_block_size - set physical block size for the queue
+ * @q:  the request queue for the device
+ * @size:  the physical block size, in bytes
+ *
+ * Description:
+ *   This should be set to the lowest possible sector size that the
+ *   hardware can operate on without reverting to read-modify-write
+ *   operations.
+ */
+void blk_queue_physical_block_size(struct request_queue *q, unsigned short size)
+{
+	q->limits.physical_block_size = size;
+
+	if (q->limits.physical_block_size < q->limits.logical_block_size)
+		q->limits.physical_block_size = q->limits.logical_block_size;
+
+	if (q->limits.io_min < q->limits.physical_block_size)
+		q->limits.io_min = q->limits.physical_block_size;
+}
+EXPORT_SYMBOL(blk_queue_physical_block_size);
+
+/**
+ * blk_queue_alignment_offset - set physical block alignment offset
+ * @q:	the request queue for the device
+ * @alignment:	alignment offset in bytes
+ *
+ * Description:
+ *   Some devices are naturally misaligned to compensate for things like
+ *   the legacy DOS partition table 63-sector offset.  Low-level drivers
+ *   should call this function for devices whose first sector is not
+ *   naturally aligned.
+ */
+void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset)
+{
+	q->limits.alignment_offset =
+		offset & (q->limits.physical_block_size - 1);
+	q->limits.misaligned = 0;
+}
+EXPORT_SYMBOL(blk_queue_alignment_offset);
+
+/**
+ * blk_queue_io_min - set minimum request size for the queue
+ * @q:	the request queue for the device
+ * @io_min:  smallest I/O size in bytes
+ *
+ * Description:
+ *   Some devices have an internal block size bigger than the reported
+ *   hardware sector size.  This function can be used to signal the
+ *   smallest I/O the device can perform without incurring a performance
+ *   penalty.
+ */
+void blk_queue_io_min(struct request_queue *q, unsigned int min)
+{
+	q->limits.io_min = min;
+
+	if (q->limits.io_min < q->limits.logical_block_size)
+		q->limits.io_min = q->limits.logical_block_size;
+
+	if (q->limits.io_min < q->limits.physical_block_size)
+		q->limits.io_min = q->limits.physical_block_size;
+}
+EXPORT_SYMBOL(blk_queue_io_min);
+
+/**
+ * blk_queue_io_opt - set optimal request size for the queue
+ * @q:	the request queue for the device
+ * @io_opt:  optimal request size in bytes
+ *
+ * Description:
+ *   Drivers can call this function to set the preferred I/O request
+ *   size for devices that report such a value.
+ */
+void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
+{
+	q->limits.io_opt = opt;
+}
+EXPORT_SYMBOL(blk_queue_io_opt);
+
 /*
 /*
  * Returns the minimum that is _not_ zero, unless both are zero.
  * Returns the minimum that is _not_ zero, unless both are zero.
  */
  */
@@ -357,6 +442,107 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
 }
 }
 EXPORT_SYMBOL(blk_queue_stack_limits);
 EXPORT_SYMBOL(blk_queue_stack_limits);
 
 
+/**
+ * blk_stack_limits - adjust queue_limits for stacked devices
+ * @t:	the stacking driver limits (top)
+ * @bdev:  the underlying queue limits (bottom)
+ * @offset:  offset to beginning of data within component device
+ *
+ * Description:
+ *    Merges two queue_limit structs.  Returns 0 if alignment didn't
+ *    change.  Returns -1 if adding the bottom device caused
+ *    misalignment.
+ */
+int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
+		     sector_t offset)
+{
+	t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
+	t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
+
+	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
+					    b->seg_boundary_mask);
+
+	t->max_phys_segments = min_not_zero(t->max_phys_segments,
+					    b->max_phys_segments);
+
+	t->max_hw_segments = min_not_zero(t->max_hw_segments,
+					  b->max_hw_segments);
+
+	t->max_segment_size = min_not_zero(t->max_segment_size,
+					   b->max_segment_size);
+
+	t->logical_block_size = max(t->logical_block_size,
+				    b->logical_block_size);
+
+	t->physical_block_size = max(t->physical_block_size,
+				     b->physical_block_size);
+
+	t->io_min = max(t->io_min, b->io_min);
+	t->no_cluster |= b->no_cluster;
+
+	/* Bottom device offset aligned? */
+	if (offset &&
+	    (offset & (b->physical_block_size - 1)) != b->alignment_offset) {
+		t->misaligned = 1;
+		return -1;
+	}
+
+	/* If top has no alignment offset, inherit from bottom */
+	if (!t->alignment_offset)
+		t->alignment_offset =
+			b->alignment_offset & (b->physical_block_size - 1);
+
+	/* Top device aligned on logical block boundary? */
+	if (t->alignment_offset & (t->logical_block_size - 1)) {
+		t->misaligned = 1;
+		return -1;
+	}
+
+	return 0;
+}
+
+/**
+ * disk_stack_limits - adjust queue limits for stacked drivers
+ * @t:	MD/DM gendisk (top)
+ * @bdev:  the underlying block device (bottom)
+ * @offset:  offset to beginning of data within component device
+ *
+ * Description:
+ *    Merges the limits for two queues.  Returns 0 if alignment
+ *    didn't change.  Returns -1 if adding the bottom device caused
+ *    misalignment.
+ */
+void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
+		       sector_t offset)
+{
+	struct request_queue *t = disk->queue;
+	struct request_queue *b = bdev_get_queue(bdev);
+
+	offset += get_start_sect(bdev) << 9;
+
+	if (blk_stack_limits(&t->limits, &b->limits, offset) < 0) {
+		char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE];
+
+		disk_name(disk, 0, top);
+		bdevname(bdev, bottom);
+
+		printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n",
+		       top, bottom);
+	}
+
+	if (!t->queue_lock)
+		WARN_ON_ONCE(1);
+	else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) {
+		unsigned long flags;
+
+		spin_lock_irqsave(t->queue_lock, flags);
+		if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags))
+			queue_flag_clear(QUEUE_FLAG_CLUSTER, t);
+		spin_unlock_irqrestore(t->queue_lock, flags);
+	}
+}
+EXPORT_SYMBOL(disk_stack_limits);
+
 /**
 /**
  * blk_queue_dma_pad - set pad mask
  * blk_queue_dma_pad - set pad mask
  * @q:     the request queue for the device
  * @q:     the request queue for the device

+ 33 - 0
block/blk-sysfs.c

@@ -105,6 +105,21 @@ static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page
 	return queue_var_show(queue_logical_block_size(q), page);
 	return queue_var_show(queue_logical_block_size(q), page);
 }
 }
 
 
+static ssize_t queue_physical_block_size_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(queue_physical_block_size(q), page);
+}
+
+static ssize_t queue_io_min_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(queue_io_min(q), page);
+}
+
+static ssize_t queue_io_opt_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(queue_io_opt(q), page);
+}
+
 static ssize_t
 static ssize_t
 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 {
 {
@@ -257,6 +272,21 @@ static struct queue_sysfs_entry queue_logical_block_size_entry = {
 	.show = queue_logical_block_size_show,
 	.show = queue_logical_block_size_show,
 };
 };
 
 
+static struct queue_sysfs_entry queue_physical_block_size_entry = {
+	.attr = {.name = "physical_block_size", .mode = S_IRUGO },
+	.show = queue_physical_block_size_show,
+};
+
+static struct queue_sysfs_entry queue_io_min_entry = {
+	.attr = {.name = "minimum_io_size", .mode = S_IRUGO },
+	.show = queue_io_min_show,
+};
+
+static struct queue_sysfs_entry queue_io_opt_entry = {
+	.attr = {.name = "optimal_io_size", .mode = S_IRUGO },
+	.show = queue_io_opt_show,
+};
+
 static struct queue_sysfs_entry queue_nonrot_entry = {
 static struct queue_sysfs_entry queue_nonrot_entry = {
 	.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
 	.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_nonrot_show,
 	.show = queue_nonrot_show,
@@ -289,6 +319,9 @@ static struct attribute *default_attrs[] = {
 	&queue_iosched_entry.attr,
 	&queue_iosched_entry.attr,
 	&queue_hw_sector_size_entry.attr,
 	&queue_hw_sector_size_entry.attr,
 	&queue_logical_block_size_entry.attr,
 	&queue_logical_block_size_entry.attr,
+	&queue_physical_block_size_entry.attr,
+	&queue_io_min_entry.attr,
+	&queue_io_opt_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_nomerges_entry.attr,
 	&queue_nomerges_entry.attr,
 	&queue_rq_affinity_entry.attr,
 	&queue_rq_affinity_entry.attr,

+ 11 - 0
block/genhd.c

@@ -852,11 +852,21 @@ static ssize_t disk_capability_show(struct device *dev,
 	return sprintf(buf, "%x\n", disk->flags);
 	return sprintf(buf, "%x\n", disk->flags);
 }
 }
 
 
+static ssize_t disk_alignment_offset_show(struct device *dev,
+					  struct device_attribute *attr,
+					  char *buf)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+
+	return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
+}
+
 static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
 static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
 static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
 static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
 static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
 static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
 static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
 static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
+static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
 static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
 static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 #ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -875,6 +885,7 @@ static struct attribute *disk_attrs[] = {
 	&dev_attr_removable.attr,
 	&dev_attr_removable.attr,
 	&dev_attr_ro.attr,
 	&dev_attr_ro.attr,
 	&dev_attr_size.attr,
 	&dev_attr_size.attr,
+	&dev_attr_alignment_offset.attr,
 	&dev_attr_capability.attr,
 	&dev_attr_capability.attr,
 	&dev_attr_stat.attr,
 	&dev_attr_stat.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 #ifdef CONFIG_FAIL_MAKE_REQUEST

+ 10 - 0
fs/partitions/check.c

@@ -219,6 +219,13 @@ ssize_t part_size_show(struct device *dev,
 	return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
 	return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
 }
 }
 
 
+ssize_t part_alignment_offset_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+	return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
+}
+
 ssize_t part_stat_show(struct device *dev,
 ssize_t part_stat_show(struct device *dev,
 		       struct device_attribute *attr, char *buf)
 		       struct device_attribute *attr, char *buf)
 {
 {
@@ -272,6 +279,7 @@ ssize_t part_fail_store(struct device *dev,
 static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
 static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
 static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
+static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
 static struct device_attribute dev_attr_fail =
@@ -282,6 +290,7 @@ static struct attribute *part_attrs[] = {
 	&dev_attr_partition.attr,
 	&dev_attr_partition.attr,
 	&dev_attr_start.attr,
 	&dev_attr_start.attr,
 	&dev_attr_size.attr,
 	&dev_attr_size.attr,
+	&dev_attr_alignment_offset.attr,
 	&dev_attr_stat.attr,
 	&dev_attr_stat.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
 	&dev_attr_fail.attr,
@@ -383,6 +392,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	pdev = part_to_dev(p);
 	pdev = part_to_dev(p);
 
 
 	p->start_sect = start;
 	p->start_sect = start;
+	p->alignment_offset = queue_sector_alignment_offset(disk->queue, start);
 	p->nr_sects = len;
 	p->nr_sects = len;
 	p->partno = partno;
 	p->partno = partno;
 	p->policy = get_disk_ro(disk);
 	p->policy = get_disk_ro(disk);

+ 47 - 0
include/linux/blkdev.h

@@ -314,11 +314,16 @@ struct queue_limits {
 	unsigned int		max_hw_sectors;
 	unsigned int		max_hw_sectors;
 	unsigned int		max_sectors;
 	unsigned int		max_sectors;
 	unsigned int		max_segment_size;
 	unsigned int		max_segment_size;
+	unsigned int		physical_block_size;
+	unsigned int		alignment_offset;
+	unsigned int		io_min;
+	unsigned int		io_opt;
 
 
 	unsigned short		logical_block_size;
 	unsigned short		logical_block_size;
 	unsigned short		max_hw_segments;
 	unsigned short		max_hw_segments;
 	unsigned short		max_phys_segments;
 	unsigned short		max_phys_segments;
 
 
+	unsigned char		misaligned;
 	unsigned char		no_cluster;
 	unsigned char		no_cluster;
 };
 };
 
 
@@ -911,6 +916,15 @@ extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
 extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
 extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
+extern void blk_queue_physical_block_size(struct request_queue *, unsigned short);
+extern void blk_queue_alignment_offset(struct request_queue *q,
+				       unsigned int alignment);
+extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
+extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
+extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
+			    sector_t offset);
+extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
+			      sector_t offset);
 extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b);
 extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b);
 extern void blk_queue_dma_pad(struct request_queue *, unsigned int);
 extern void blk_queue_dma_pad(struct request_queue *, unsigned int);
 extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
 extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
@@ -1047,6 +1061,39 @@ static inline unsigned short bdev_logical_block_size(struct block_device *bdev)
 	return queue_logical_block_size(bdev_get_queue(bdev));
 	return queue_logical_block_size(bdev_get_queue(bdev));
 }
 }
 
 
+static inline unsigned int queue_physical_block_size(struct request_queue *q)
+{
+	return q->limits.physical_block_size;
+}
+
+static inline unsigned int queue_io_min(struct request_queue *q)
+{
+	return q->limits.io_min;
+}
+
+static inline unsigned int queue_io_opt(struct request_queue *q)
+{
+	return q->limits.io_opt;
+}
+
+static inline int queue_alignment_offset(struct request_queue *q)
+{
+	if (q && q->limits.misaligned)
+		return -1;
+
+	if (q && q->limits.alignment_offset)
+		return q->limits.alignment_offset;
+
+	return 0;
+}
+
+static inline int queue_sector_alignment_offset(struct request_queue *q,
+						sector_t sector)
+{
+	return ((sector << 9) - q->limits.alignment_offset)
+		& (q->limits.io_min - 1);
+}
+
 static inline int queue_dma_alignment(struct request_queue *q)
 static inline int queue_dma_alignment(struct request_queue *q)
 {
 {
 	return q ? q->dma_alignment : 511;
 	return q ? q->dma_alignment : 511;

+ 1 - 0
include/linux/genhd.h

@@ -90,6 +90,7 @@ struct disk_stats {
 struct hd_struct {
 struct hd_struct {
 	sector_t start_sect;
 	sector_t start_sect;
 	sector_t nr_sects;
 	sector_t nr_sects;
+	sector_t alignment_offset;
 	struct device __dev;
 	struct device __dev;
 	struct kobject *holder_dir;
 	struct kobject *holder_dir;
 	int policy, partno;
 	int policy, partno;