@@ -38,21 +38,36 @@
  * near_copies (stored in low byte of layout)
  * far_copies (stored in second byte of layout)
  * far_offset (stored in bit 16 of layout )
+ * use_far_sets (stored in bit 17 of layout )
  *
- * The data to be stored is divided into chunks using chunksize.
- * Each device is divided into far_copies sections.
- * In each section, chunks are laid out in a style similar to raid0, but
- * near_copies copies of each chunk is stored (each on a different drive).
- * The starting device for each section is offset near_copies from the starting
- * device of the previous section.
- * Thus they are (near_copies*far_copies) of each chunk, and each is on a different
- * drive.
- * near_copies and far_copies must be at least one, and their product is at most
- * raid_disks.
+ * The data to be stored is divided into chunks using chunksize. Each device
+ * is divided into far_copies sections. In each section, chunks are laid out
+ * in a style similar to raid0, but near_copies copies of each chunk are
+ * stored (each on a different drive). The starting device for each section
+ * is offset near_copies from the starting device of the previous section.
+ * Thus there are (near_copies * far_copies) copies of each chunk, and each
+ * is on a different drive. near_copies and far_copies must be at least one,
+ * and their product is at most raid_disks.
  *
  * If far_offset is true, then the far_copies are handled a bit differently.
- * The copies are still in different stripes, but instead of be very far apart
- * on disk, there are adjacent stripes.
+ * The copies are still in different stripes, but instead of being very far
+ * apart on disk, they are adjacent stripes.
+ *
+ * The far and offset algorithms are handled slightly differently if
+ * 'use_far_sets' is true. In this case, the array's devices are grouped into
+ * sets that are (near_copies * far_copies) in size. The far copied stripes
+ * are still shifted by 'near_copies' devices, but this shifting stays confined
+ * to the set rather than the entire array. This is done to improve the number
+ * of device combinations that can fail without causing the array to fail.
+ * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
+ * on a device):
+ *    A B C D    A B C D E
+ *      ...         ...
+ *    D A B C    E A B C D
+ * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
+ *    [A B] [C D]    [A B] [C D E]
+ *    |...| |...|    |...| | ... |
+ *    [B A] [D C]    [B A] [E C D]
  */

 /*
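A compact way to see the "shifting stays confined to the set" rule: the
user-space sketch below (not part of the patch; all names are invented)
computes where the next far copy of a chunk lands when raid_disks divides
evenly into sets, mirroring the else-branch of the mapping loop further down.

#include <stdio.h>

/* Sketch only: assumes raid_disks is a multiple of far_set_size; the
 * kernel code below also handles an enlarged final set. */
static int next_far_dev(int d, int near_copies, int far_set_size)
{
	int set = d / far_set_size;	/* which [..] group 'd' belongs to */

	d += near_copies;		/* shift by near_copies devices... */
	d %= far_set_size;		/* ...wrapping inside the set */
	return d + set * far_set_size;
}

int main(void)
{
	/* Four disks, near_copies = 1, far_set_size = 2: prints 0 -> 1,
	 * 1 -> 0, 2 -> 3, 3 -> 2, i.e. the [A B]/[B A] and [C D]/[D C]
	 * columns of the diagram above. */
	for (int d = 0; d < 4; d++)
		printf("%d -> %d\n", d, next_far_dev(d, 1, 2));
	return 0;
}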
@@ -535,6 +550,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
 	sector_t stripe;
 	int dev;
 	int slot = 0;
+	int last_far_set_start, last_far_set_size;
+
+	last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+	last_far_set_start *= geo->far_set_size;
+
+	last_far_set_size = geo->far_set_size;
+	last_far_set_size += (geo->raid_disks % geo->far_set_size);

 	/* now calculate first sector/dev */
 	chunk = r10bio->sector >> geo->chunk_shift;
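For concreteness (values invented): with raid_disks = 5 and far_set_size = 2,
the arithmetic above yields last_far_set_start = (5/2 - 1) * 2 = 2 and
last_far_set_size = 2 + (5 % 2) = 3, so devices 2..4 form the enlarged final
set, the [C D E] group in the diagram at the top of the file. The same two
quantities are recomputed in raid10_find_virt below.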
@@ -551,15 +573,25 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
 	/* and calculate all the others */
 	for (n = 0; n < geo->near_copies; n++) {
 		int d = dev;
+		int set;
 		sector_t s = sector;
-		r10bio->devs[slot].addr = sector;
 		r10bio->devs[slot].devnum = d;
+		r10bio->devs[slot].addr = s;
 		slot++;

 		for (f = 1; f < geo->far_copies; f++) {
+			set = d / geo->far_set_size;
 			d += geo->near_copies;
-			if (d >= geo->raid_disks)
-				d -= geo->raid_disks;
+
+			if ((geo->raid_disks % geo->far_set_size) &&
+			    (d > last_far_set_start)) {
+				d -= last_far_set_start;
+				d %= last_far_set_size;
+				d += last_far_set_start;
+			} else {
+				d %= geo->far_set_size;
+				d += geo->far_set_size * set;
+			}
 			s += geo->stride;
 			r10bio->devs[slot].devnum = d;
 			r10bio->devs[slot].addr = s;
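The new branch extends the next_far_dev-style shifting (see the sketch after
the comment block) to arrays whose disk count is not a multiple of the set
size: any index landing past last_far_set_start wraps relative to that start
instead. A hedged user-space rendition of the loop body, with invented values:

#include <stdio.h>

/* Sketch of the wrap logic above; not part of the patch. */
static int step(int d, int near_copies, int raid_disks, int far_set_size,
		int last_far_set_start, int last_far_set_size)
{
	int set = d / far_set_size;

	d += near_copies;
	if ((raid_disks % far_set_size) && (d > last_far_set_start)) {
		d -= last_far_set_start;	/* wrap within the larger */
		d %= last_far_set_size;		/* final set */
		d += last_far_set_start;
	} else {
		d %= far_set_size;
		d += far_set_size * set;
	}
	return d;
}

int main(void)
{
	/* raid_disks = 5, near_copies = 1, far_set_size = 2: the sets are
	 * [0 1] [2 3 4]; prints 0 -> 1, 1 -> 0, 2 -> 3, 3 -> 4, 4 -> 2,
	 * i.e. the [A B] [C D E] -> [B A] [E C D] rows of the diagram. */
	for (int d = 0; d < 5; d++)
		printf("%d -> %d\n", d, step(d, 1, 5, 2, 2, 3));
	return 0;
}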
@@ -595,6 +627,20 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
 	 * or recovery, so reshape isn't happening
 	 */
 	struct geom *geo = &conf->geo;
+	int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
+	int far_set_size = geo->far_set_size;
+	int last_far_set_start;
+
+	if (geo->raid_disks % geo->far_set_size) {
+		last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+		last_far_set_start *= geo->far_set_size;
+
+		if (dev >= last_far_set_start) {
+			far_set_size = geo->far_set_size;
+			far_set_size += (geo->raid_disks % geo->far_set_size);
+			far_set_start = last_far_set_start;
+		}
+	}

 	offset = sector & geo->chunk_mask;
 	if (geo->far_offset) {
@@ -602,13 +648,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
 		chunk = sector >> geo->chunk_shift;
 		fc = sector_div(chunk, geo->far_copies);
 		dev -= fc * geo->near_copies;
-		if (dev < 0)
-			dev += geo->raid_disks;
+		if (dev < far_set_start)
+			dev += far_set_size;
 	} else {
 		while (sector >= geo->stride) {
 			sector -= geo->stride;
-			if (dev < geo->near_copies)
-				dev += geo->raid_disks - geo->near_copies;
+			if (dev < (geo->near_copies + far_set_start))
+				dev += far_set_size - geo->near_copies;
 			else
 				dev -= geo->near_copies;
 		}
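raid10_find_virt has to invert that shift: stepping back one far copy
subtracts near_copies and wraps within the same set, which is what the two
rewritten comparisons above do, with far_set_start/far_set_size standing in
for the old 0/raid_disks bounds. A self-contained round-trip check for the
even-set case, invented values only:

#include <assert.h>

/* Forward shift and its inverse for near_copies = 1, far_set_size = 2;
 * illustrative only, mirroring the comparisons rewritten above. */
static int fwd(int d)
{
	int set = d / 2;

	return (d + 1) % 2 + 2 * set;
}

static int back(int d)
{
	int far_set_start = (d / 2) * 2;

	if (d < 1 + far_set_start)	/* dev < near_copies + far_set_start */
		return d + 2 - 1;	/* dev += far_set_size - near_copies */
	return d - 1;			/* dev -= near_copies */
}

int main(void)
{
	for (int d = 0; d < 4; d++)	/* four disks, sets [0 1] [2 3] */
		assert(back(fwd(d)) == d);
	return 0;
}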
@@ -1073,6 +1119,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 		bio_list_merge(&conf->pending_bio_list, &plug->pending);
 		conf->pending_count += plug->pending_cnt;
 		spin_unlock_irq(&conf->device_lock);
+		wake_up(&conf->wait_barrier);
 		md_wakeup_thread(mddev->thread);
 		kfree(plug);
 		return;
@@ -1105,6 +1152,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
 	const unsigned long do_discard = (bio->bi_rw
 					  & (REQ_DISCARD | REQ_SECURE));
+	const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
 	unsigned long flags;
 	struct md_rdev *blocked_rdev;
 	struct blk_plug_cb *cb;
@@ -1460,7 +1508,8 @@ retry_write:
 							      rdev));
 			mbio->bi_bdev = rdev->bdev;
 			mbio->bi_end_io = raid10_end_write_request;
-			mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+			mbio->bi_rw =
+				WRITE | do_sync | do_fua | do_discard | do_same;
 			mbio->bi_private = r10_bio;

 			atomic_inc(&r10_bio->remaining);
@@ -1502,7 +1551,8 @@ retry_write:
 						   r10_bio, rdev));
 			mbio->bi_bdev = rdev->bdev;
 			mbio->bi_end_io = raid10_end_write_request;
-			mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+			mbio->bi_rw =
+				WRITE | do_sync | do_fua | do_discard | do_same;
 			mbio->bi_private = r10_bio;

 			atomic_inc(&r10_bio->remaining);
@@ -3436,7 +3486,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
 		disks = mddev->raid_disks + mddev->delta_disks;
 		break;
 	}
-	if (layout >> 17)
+	if (layout >> 18)
 		return -1;
 	if (chunk < (PAGE_SIZE >> 9) ||
 	    !is_power_of_2(chunk))
@@ -3448,6 +3498,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
 	geo->near_copies = nc;
 	geo->far_copies = fc;
 	geo->far_offset = fo;
+	geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
 	geo->chunk_mask = chunk - 1;
 	geo->chunk_shift = ffz(~chunk);
 	return nc*fc;
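To see the new field with concrete numbers (example layout value invented):
layout = 0x20201 decodes as near_copies = layout & 255 = 1, far_copies =
(layout >> 8) & 255 = 2, far_offset = bit 16 = 0, use_far_sets = bit 17 =
set. With disks = 5 this gives far_set_size = 5 / 2 = 2, i.e. the
[A B] [C D E] grouping from the diagram, the final set absorbing the 5 % 2
leftover device; without bit 17 the set spans all five disks as before. The
relaxed 'layout >> 18' test in the previous hunk is what admits the new bit
while still rejecting anything above it.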
@@ -3569,6 +3620,8 @@ static int run(struct mddev *mddev)
 	if (mddev->queue) {
 		blk_queue_max_discard_sectors(mddev->queue,
 					      mddev->chunk_sectors);
+		blk_queue_max_write_same_sectors(mddev->queue,
+						 mddev->chunk_sectors);
 		blk_queue_io_min(mddev->queue, chunk_size);
 		if (conf->geo.raid_disks % conf->geo.near_copies)
 			blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
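The new queue limit caps WRITE SAME requests at one chunk, matching the
discard limit above. As a hedged illustration of what this exposes to upper
layers (not part of the patch; blkdev_issue_write_same() as provided by
block/blk-lib.c in this kernel series, so treat the exact signature as an
assumption), a caller can now replicate a single page across a range of a
RAID10 array in one request:

#include <linux/blkdev.h>

/* Hypothetical helper, illustration only: replicate 'page' over
 * 'nr_sects' sectors starting at 'sector' with one WRITE SAME. */
static int replicate_page(struct block_device *bdev, sector_t sector,
			  sector_t nr_sects, struct page *page)
{
	return blkdev_issue_write_same(bdev, sector, nr_sects,
				       GFP_NOIO, page);
}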