@@ -547,6 +547,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 				rw = WRITE_FUA;
 			else
 				rw = WRITE;
+			if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags))
+				rw |= REQ_DISCARD;
 		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
 			rw = READ;
 		else if (test_and_clear_bit(R5_WantReplace,
@@ -1170,8 +1172,13 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 					set_bit(R5_WantFUA, &dev->flags);
 				if (wbi->bi_rw & REQ_SYNC)
 					set_bit(R5_SyncIO, &dev->flags);
-				tx = async_copy_data(1, wbi, dev->page,
-					dev->sector, tx);
+				if (wbi->bi_rw & REQ_DISCARD) {
+					memset(page_address(dev->page), 0,
+						STRIPE_SECTORS << 9);
+					set_bit(R5_Discard, &dev->flags);
+				} else
+					tx = async_copy_data(1, wbi, dev->page,
+						dev->sector, tx);
 				wbi = r5_next_bio(wbi, dev->sector);
 			}
 		}
@@ -1237,6 +1244,20 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
+	for (i = 0; i < sh->disks; i++) {
+		if (pd_idx == i)
+			continue;
+		if (!test_bit(R5_Discard, &sh->dev[i].flags))
+			break;
+	}
+	if (i >= sh->disks) {
+		atomic_inc(&sh->count);
+		memset(page_address(sh->dev[pd_idx].page), 0,
+			STRIPE_SECTORS << 9);
+		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
+		ops_complete_reconstruct(sh);
+		return;
+	}
 	/* check if prexor is active which means only process blocks
 	 * that are part of a read-modify-write (written)
 	 */
@@ -1281,10 +1302,28 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 {
 	struct async_submit_ctl submit;
 	struct page **blocks = percpu->scribble;
-	int count;
+	int count, i;
 
 	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
 
+	for (i = 0; i < sh->disks; i++) {
+		if (sh->pd_idx == i || sh->qd_idx == i)
+			continue;
+		if (!test_bit(R5_Discard, &sh->dev[i].flags))
+			break;
+	}
+	if (i >= sh->disks) {
+		atomic_inc(&sh->count);
+		memset(page_address(sh->dev[sh->pd_idx].page), 0,
+			STRIPE_SECTORS << 9);
+		memset(page_address(sh->dev[sh->qd_idx].page), 0,
+			STRIPE_SECTORS << 9);
+		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+		ops_complete_reconstruct(sh);
+		return;
+	}
+
 	count = set_syndrome_sources(blocks, sh);
 
 	atomic_inc(&sh->count);
@@ -4067,6 +4106,88 @@ static void release_stripe_plug(struct mddev *mddev,
 		release_stripe(sh);
 }
 
+static void make_discard_request(struct mddev *mddev, struct bio *bi)
+{
+	struct r5conf *conf = mddev->private;
+	sector_t logical_sector, last_sector;
+	struct stripe_head *sh;
+	int remaining;
+	int stripe_sectors;
+
+	if (mddev->reshape_position != MaxSector)
+		/* Skip discard while reshape is happening */
+		return;
+
+	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+	last_sector = bi->bi_sector + (bi->bi_size>>9);
+
+	bi->bi_next = NULL;
+	bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+
+	stripe_sectors = conf->chunk_sectors *
+		(conf->raid_disks - conf->max_degraded);
+	logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
+					       stripe_sectors);
+	sector_div(last_sector, stripe_sectors);
+
+	logical_sector *= conf->chunk_sectors;
+	last_sector *= conf->chunk_sectors;
+
+	for (; logical_sector < last_sector;
+	     logical_sector += STRIPE_SECTORS) {
+		DEFINE_WAIT(w);
+		int d;
+	again:
+		sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
+		prepare_to_wait(&conf->wait_for_overlap, &w,
+				TASK_UNINTERRUPTIBLE);
+		spin_lock_irq(&sh->stripe_lock);
+		for (d = 0; d < conf->raid_disks; d++) {
+			if (d == sh->pd_idx || d == sh->qd_idx)
+				continue;
+			if (sh->dev[d].towrite || sh->dev[d].toread) {
+				set_bit(R5_Overlap, &sh->dev[d].flags);
+				spin_unlock_irq(&sh->stripe_lock);
+				release_stripe(sh);
+				schedule();
+				goto again;
+			}
+		}
+		finish_wait(&conf->wait_for_overlap, &w);
+		for (d = 0; d < conf->raid_disks; d++) {
+			if (d == sh->pd_idx || d == sh->qd_idx)
+				continue;
+			sh->dev[d].towrite = bi;
+			set_bit(R5_OVERWRITE, &sh->dev[d].flags);
+			raid5_inc_bi_active_stripes(bi);
+		}
+		spin_unlock_irq(&sh->stripe_lock);
+		if (conf->mddev->bitmap) {
+			for (d = 0;
+			     d < conf->raid_disks - conf->max_degraded;
+			     d++)
+				bitmap_startwrite(mddev->bitmap,
+						  sh->sector,
+						  STRIPE_SECTORS,
+						  0);
+			sh->bm_seq = conf->seq_flush + 1;
+			set_bit(STRIPE_BIT_DELAY, &sh->state);
+		}
+
+		set_bit(STRIPE_HANDLE, &sh->state);
+		clear_bit(STRIPE_DELAYED, &sh->state);
+		if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+			atomic_inc(&conf->preread_active_stripes);
+		release_stripe_plug(mddev, sh);
+	}
+
+	remaining = raid5_dec_bi_active_stripes(bi);
+	if (remaining == 0) {
+		md_write_end(mddev);
+		bio_endio(bi, 0);
+	}
+}
+
 static void make_request(struct mddev *mddev, struct bio * bi)
 {
 	struct r5conf *conf = mddev->private;
@@ -4089,6 +4210,11 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 	     chunk_aligned_read(mddev,bi))
 		return;
 
+	if (unlikely(bi->bi_rw & REQ_DISCARD)) {
+		make_discard_request(mddev, bi);
+		return;
+	}
+
 	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 	last_sector = bi->bi_sector + (bi->bi_size>>9);
 	bi->bi_next = NULL;
@@ -5362,6 +5488,7 @@ static int run(struct mddev *mddev)
 
 	if (mddev->queue) {
 		int chunk_size;
+		bool discard_supported = true;
 		/* read-ahead size must cover two whole stripes, which
 		 * is 2 * (datadisks) * chunksize where 'n' is the
 		 * number of raid devices
@@ -5381,13 +5508,48 @@
 		blk_queue_io_min(mddev->queue, chunk_size);
 		blk_queue_io_opt(mddev->queue, chunk_size *
 				 (conf->raid_disks - conf->max_degraded));
+		/*
+		 * We can only discard a whole stripe. It doesn't make sense to
+		 * discard data disk but write parity disk
+		 */
+		stripe = stripe * PAGE_SIZE;
+		mddev->queue->limits.discard_alignment = stripe;
+		mddev->queue->limits.discard_granularity = stripe;
+		/*
+		 * unaligned part of discard request will be ignored, so can't
+		 * guarantee discard_zeroes_data
+		 */
+		mddev->queue->limits.discard_zeroes_data = 0;
 
 		rdev_for_each(rdev, mddev) {
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->data_offset << 9);
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->new_data_offset << 9);
+			/*
+			 * discard_zeroes_data is required, otherwise data
+			 * could be lost. Consider a scenario: discard a stripe
+			 * (the stripe could be inconsistent if
+			 * discard_zeroes_data is 0); write one disk of the
+			 * stripe (the stripe could be inconsistent again
+			 * depending on which disks are used to calculate
+			 * parity); the disk is broken; the stripe data of this
+			 * disk is lost.
+			 */
+			if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
+			    !bdev_get_queue(rdev->bdev)->
+				limits.discard_zeroes_data)
+				discard_supported = false;
 		}
+
+		if (discard_supported &&
+		    mddev->queue->limits.max_discard_sectors >= stripe &&
+		    mddev->queue->limits.discard_granularity >= stripe)
+			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+						mddev->queue);
+		else
+			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
+						  mddev->queue);
 	}
 
 	return 0;
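
The make_discard_request() hunk above shrinks each discard to whole data stripes: the start is rounded up and the end rounded down to a multiple of chunk_sectors * (raid_disks - max_degraded), and the unaligned head/tail is simply ignored (hence discard_zeroes_data = 0 on the array). The following stand-alone C sketch only illustrates that rounding; the 6-disk RAID-6 geometry and the request range are hypothetical values chosen for the example, not taken from the patch.

/*
 * Illustration only (not part of the patch): how a discard request in
 * array sectors is trimmed to full data stripes, mirroring the
 * DIV_ROUND_UP_SECTOR_T() / sector_div() arithmetic above.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t chunk_sectors = 1024;		/* hypothetical 512KiB chunk, in 512B sectors */
	uint64_t data_disks = 6 - 2;		/* hypothetical 6-disk RAID-6: 4 data disks */
	uint64_t stripe_sectors = chunk_sectors * data_disks;

	uint64_t start = 5000, end = 200000;	/* hypothetical discard range [start, end) */

	uint64_t first = (start + stripe_sectors - 1) / stripe_sectors;	/* round start up */
	uint64_t last = end / stripe_sectors;					/* round end down */

	/* Only the sectors between these boundaries are actually discarded. */
	printf("full stripes discarded: sectors %llu .. %llu\n",
	       (unsigned long long)(first * stripe_sectors),
	       (unsigned long long)(last * stripe_sectors));
	return 0;
}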