@@ -6,6 +6,7 @@
 
 #include "dm.h"
 #include "dm-bio-list.h"
+#include "dm-bio-record.h"
 #include "dm-io.h"
 #include "dm-log.h"
 #include "kcopyd.h"
@@ -141,6 +142,7 @@ struct mirror_set {
 	struct bio_list failures;
 
 	struct dm_io_client *io_client;
+	mempool_t *read_record_pool;
 
 	/* recovery */
 	region_t nr_regions;
@@ -647,24 +649,30 @@ static void rh_start_recovery(struct region_hash *rh)
 	wake(rh->ms);
 }
 
+#define MIN_READ_RECORDS 20
+struct dm_raid1_read_record {
+	struct mirror *m;
+	struct dm_bio_details details;
+};
+
 /*
  * Every mirror should look like this one.
  */
 #define DEFAULT_MIRROR 0
 
 /*
- * This is yucky. We squirrel the mirror_set struct away inside
- * bi_next for write buffers. This is safe since the bh
+ * This is yucky. We squirrel the mirror struct away inside
+ * bi_next for read/write buffers. This is safe since the bh
  * doesn't get submitted to the lower levels of block layer.
  */
-static struct mirror_set *bio_get_ms(struct bio *bio)
+static struct mirror *bio_get_m(struct bio *bio)
 {
-	return (struct mirror_set *) bio->bi_next;
+	return (struct mirror *) bio->bi_next;
 }
 
-static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
+static void bio_set_m(struct bio *bio, struct mirror *m)
 {
-	bio->bi_next = (struct bio *) ms;
+	bio->bi_next = (struct bio *) m;
 }
 
 static struct mirror *get_default_mirror(struct mirror_set *ms)
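bio_get_m()/bio_set_m() generalize the old bio_get_ms()/bio_set_ms() trick: while the mirror target owns a bio, its bi_next field is unused, so an unrelated pointer can be stashed there and cast back in the completion path. A minimal standalone sketch of the idiom (userspace, hypothetical stand-in types, not the kernel's):

```c
#include <assert.h>
#include <stddef.h>

struct bio { struct bio *bi_next; };	/* stand-in for the kernel's bio */
struct mirror { int id; };

static struct mirror *bio_get_m(struct bio *bio)
{
	return (struct mirror *) bio->bi_next;
}

static void bio_set_m(struct bio *bio, struct mirror *m)
{
	bio->bi_next = (struct bio *) m;
}

int main(void)
{
	struct bio bio = { NULL };
	struct mirror m = { 2 };

	bio_set_m(&bio, &m);		/* stash before issuing the I/O */
	assert(bio_get_m(&bio) == &m);	/* retrieve in the completion path */
	bio_set_m(&bio, NULL);		/* clear before bi_next is reused */
	return 0;
}
```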
@@ -857,17 +865,105 @@ static void do_recovery(struct mirror_set *ms)
  *---------------------------------------------------------------*/
 static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
 {
-	/* FIXME: add read balancing */
-	return get_default_mirror(ms);
+	struct mirror *m = get_default_mirror(ms);
+
+	do {
+		if (likely(!atomic_read(&m->error_count)))
+			return m;
+
+		if (m-- == ms->mirror)
+			m += ms->nr_mirrors;
+	} while (m != get_default_mirror(ms));
+
+	return NULL;
+}
+
+static int default_ok(struct mirror *m)
+{
+	struct mirror *default_mirror = get_default_mirror(m->ms);
+
+	return !atomic_read(&default_mirror->error_count);
+}
+
+static int mirror_available(struct mirror_set *ms, struct bio *bio)
+{
+	region_t region = bio_to_region(&ms->rh, bio);
+
+	if (ms->rh.log->type->in_sync(ms->rh.log, region, 0))
+		return choose_mirror(ms, bio->bi_sector) ? 1 : 0;
+
+	return 0;
 }
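The new choose_mirror() walks backward from the default mirror, wrapping from the first array slot to the last, until it finds a mirror with no recorded errors; if it comes full circle, every mirror has failed and NULL is returned. A standalone sketch of that wrap-around scan (userspace stand-ins for the mirror array and the atomic error counter):

```c
#include <stdio.h>

#define NR_MIRRORS 3

struct mirror {
	int error_count;	/* stand-in for atomic_t error_count */
};

static struct mirror mirrors[NR_MIRRORS];

static struct mirror *choose_mirror_sketch(void)
{
	struct mirror *m = &mirrors[0];	/* the default mirror */

	do {
		if (!m->error_count)
			return m;	/* healthy mirror found */

		/* Step backward; wrap from mirrors[0] to the last slot. */
		if (m-- == mirrors)
			m += NR_MIRRORS;
	} while (m != &mirrors[0]);

	return NULL;			/* every mirror has failed */
}

int main(void)
{
	mirrors[0].error_count = 1;	/* default mirror has failed */

	struct mirror *m = choose_mirror_sketch();

	printf("chose mirror %d\n", m ? (int) (m - mirrors) : -1);
	return 0;
}
```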
 
 /*
  * remap a buffer to a particular mirror.
  */
-static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio)
+static sector_t map_sector(struct mirror *m, struct bio *bio)
+{
+	return m->offset + (bio->bi_sector - m->ms->ti->begin);
+}
+
+static void map_bio(struct mirror *m, struct bio *bio)
 {
 	bio->bi_bdev = m->dev->bdev;
-	bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
+	bio->bi_sector = map_sector(m, bio);
+}
+
+static void map_region(struct io_region *io, struct mirror *m,
+		       struct bio *bio)
+{
+	io->bdev = m->dev->bdev;
+	io->sector = map_sector(m, bio);
+	io->count = bio->bi_size >> 9;
+}
+
+/*-----------------------------------------------------------------
+ * Reads
+ *---------------------------------------------------------------*/
+static void read_callback(unsigned long error, void *context)
+{
+	struct bio *bio = context;
+	struct mirror *m;
+
+	m = bio_get_m(bio);
+	bio_set_m(bio, NULL);
+
+	if (likely(!error)) {
+		bio_endio(bio, 0);
+		return;
+	}
+
+	fail_mirror(m, DM_RAID1_READ_ERROR);
+
+	if (likely(default_ok(m)) || mirror_available(m->ms, bio)) {
+		DMWARN_LIMIT("Read failure on mirror device %s. "
+			     "Trying alternative device.",
+			     m->dev->name);
+		queue_bio(m->ms, bio, bio_rw(bio));
+		return;
+	}
+
+	DMERR_LIMIT("Read failure on mirror device %s. Failing I/O.",
+		    m->dev->name);
+	bio_endio(bio, -EIO);
+}
+
+/* Asynchronous read. */
+static void read_async_bio(struct mirror *m, struct bio *bio)
+{
+	struct io_region io;
+	struct dm_io_request io_req = {
+		.bi_rw = READ,
+		.mem.type = DM_IO_BVEC,
+		.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
+		.notify.fn = read_callback,
+		.notify.context = bio,
+		.client = m->ms->io_client,
+	};
+
+	map_region(&io, m, bio);
+	bio_set_m(bio, m);
+	(void) dm_io(&io_req, 1, &io, NULL);
 }
 
 static void do_reads(struct mirror_set *ms, struct bio_list *reads)
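read_async_bio() submits through dm_io() with a notify function instead of calling generic_make_request() directly, so completion, and any retry on another mirror, happens in read_callback(). A rough userspace sketch of that dispatch/callback shape (all names here are hypothetical stand-ins; a real dm-io completion runs asynchronously, not inline):

```c
#include <stdio.h>

#define NR_MIRRORS 2

struct request {
	int dev;				/* which mirror we read from */
	void (*notify)(unsigned long error, void *context);
};

/* Stand-in for dm_io(): a real implementation completes the request
 * asynchronously; here the callback is simply invoked inline. */
static void submit(struct request *rq, unsigned long simulated_error)
{
	rq->notify(simulated_error, rq);
}

static void read_done(unsigned long error, void *context)
{
	struct request *rq = context;

	if (!error) {
		printf("dev %d: read completed\n", rq->dev);
		return;
	}

	/* On failure, requeue against the next mirror if one is left. */
	if (rq->dev + 1 < NR_MIRRORS) {
		printf("dev %d: read failed, retrying on dev %d\n",
		       rq->dev, rq->dev + 1);
		rq->dev++;
		submit(rq, 0);
		return;
	}

	printf("dev %d: read failed, no mirrors left\n", rq->dev);
}

int main(void)
{
	struct request rq = { .dev = 0, .notify = read_done };

	submit(&rq, 1);		/* simulate a failure on the first mirror */
	return 0;
}
```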
@@ -878,17 +974,20 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
 
 	while ((bio = bio_list_pop(reads))) {
 		region = bio_to_region(&ms->rh, bio);
+		m = get_default_mirror(ms);
 
 		/*
 		 * We can only read balance if the region is in sync.
 		 */
-		if (rh_in_sync(&ms->rh, region, 1))
+		if (likely(rh_in_sync(&ms->rh, region, 1)))
 			m = choose_mirror(ms, bio->bi_sector);
-		else
-			m = get_default_mirror(ms);
+		else if (m && atomic_read(&m->error_count))
+			m = NULL;
 
-		map_bio(ms, m, bio);
-		generic_make_request(bio);
+		if (likely(m))
+			read_async_bio(m, bio);
+		else
+			bio_endio(bio, -EIO);
 	}
 }
 
@@ -964,8 +1063,8 @@ static void write_callback(unsigned long error, void *context)
 	int should_wake = 0;
 	unsigned long flags;
 
-	ms = bio_get_ms(bio);
-	bio_set_ms(bio, NULL);
+	ms = bio_get_m(bio)->ms;
+	bio_set_m(bio, NULL);
 
 	/*
 	 * NOTE: We don't decrement the pending count here,
@@ -1008,7 +1107,7 @@ out:
 static void do_write(struct mirror_set *ms, struct bio *bio)
 {
 	unsigned int i;
-	struct io_region io[KCOPYD_MAX_REGIONS+1];
+	struct io_region io[ms->nr_mirrors], *dest = io;
 	struct mirror *m;
 	struct dm_io_request io_req = {
 		.bi_rw = WRITE,
@@ -1019,15 +1118,14 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
 		.client = ms->io_client,
 	};
 
-	for (i = 0; i < ms->nr_mirrors; i++) {
-		m = ms->mirror + i;
-
-		io[i].bdev = m->dev->bdev;
-		io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin);
-		io[i].count = bio->bi_size >> 9;
-	}
+	for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
+		map_region(dest++, m, bio);
 
-	bio_set_ms(bio, ms);
+	/*
+	 * Use default mirror because we only need it to retrieve the reference
+	 * to the mirror set in write_callback().
+	 */
+	bio_set_m(bio, get_default_mirror(ms));
 
 	(void) dm_io(&io_req, ms->nr_mirrors, io, NULL);
 }
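do_write() now sizes the io_region array with a C99 variable-length array over the actual mirror count instead of the fixed KCOPYD_MAX_REGIONS+1 bound, and fills it through a moving dest cursor via map_region(). A small standalone illustration of that pattern (stand-in types, not the kernel's io_region):

```c
#include <stdio.h>

struct io_region {
	long sector;		/* stand-in for the real io_region fields */
};

int main(void)
{
	int nr_mirrors = 3;	/* known only at runtime */
	struct io_region io[nr_mirrors], *dest = io;	/* C99 VLA */
	int i;

	/* Fill one region per mirror through the moving cursor, as the
	 * new do_write() loop does with map_region(dest++, m, bio). */
	for (i = 0; i < nr_mirrors; i++)
		(dest++)->sector = 2048L * i;

	for (i = 0; i < nr_mirrors; i++)
		printf("region %d -> sector %ld\n", i, io[i].sector);
	return 0;
}
```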
@@ -1092,7 +1190,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 		rh_delay(&ms->rh, bio);
 
 	while ((bio = bio_list_pop(&nosync))) {
-		map_bio(ms, get_default_mirror(ms), bio);
+		map_bio(get_default_mirror(ms), bio);
 		generic_make_request(bio);
 	}
 }
@@ -1231,9 +1329,19 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
 	atomic_set(&ms->suspend, 0);
 	atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
 
+	len = sizeof(struct dm_raid1_read_record);
+	ms->read_record_pool = mempool_create_kmalloc_pool(MIN_READ_RECORDS,
+							   len);
+	if (!ms->read_record_pool) {
+		ti->error = "Error creating mirror read_record_pool";
+		kfree(ms);
+		return NULL;
+	}
+
 	ms->io_client = dm_io_client_create(DM_IO_PAGES);
 	if (IS_ERR(ms->io_client)) {
 		ti->error = "Error creating dm_io client";
+		mempool_destroy(ms->read_record_pool);
 		kfree(ms);
 		return NULL;
 	}
@@ -1241,6 +1349,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
 	if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
 		ti->error = "Error creating dirty region hash";
 		dm_io_client_destroy(ms->io_client);
+		mempool_destroy(ms->read_record_pool);
 		kfree(ms);
 		return NULL;
 	}
@@ -1256,6 +1365,7 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti,
 
 	dm_io_client_destroy(ms->io_client);
 	rh_exit(&ms->rh);
+	mempool_destroy(ms->read_record_pool);
 	kfree(ms);
 }
 
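The read_record_pool is created first in alloc_context(), so each later failure path, and free_context(), must tear it down before freeing ms; the hunks above add mempool_destroy() to every unwind path, in reverse order of construction. A compact userspace sketch of that pairing (pool_create()/pool_destroy() are hypothetical stand-ins for the mempool API):

```c
#include <stdio.h>
#include <stdlib.h>

struct pool { int min; };		/* stand-in for mempool_t */

static struct pool *pool_create(int min)
{
	struct pool *p = malloc(sizeof(*p));

	if (p)
		p->min = min;
	return p;
}

static void pool_destroy(struct pool *p)
{
	free(p);
}

struct mirror_set { struct pool *read_record_pool; };

static struct mirror_set *alloc_context(int later_step_fails)
{
	struct mirror_set *ms = malloc(sizeof(*ms));

	if (!ms)
		return NULL;

	ms->read_record_pool = pool_create(20);	/* MIN_READ_RECORDS */
	if (!ms->read_record_pool) {
		free(ms);
		return NULL;
	}

	if (later_step_fails) {
		/* Unwind in reverse order of construction. */
		pool_destroy(ms->read_record_pool);
		free(ms);
		return NULL;
	}
	return ms;
}

int main(void)
{
	struct mirror_set *ms = alloc_context(0);

	printf("alloc_context %s\n", ms ? "succeeded" : "failed");
	if (ms) {
		pool_destroy(ms->read_record_pool);	/* as in free_context() */
		free(ms);
	}
	return 0;
}
```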
@@ -1510,10 +1620,11 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
 	int r, rw = bio_rw(bio);
 	struct mirror *m;
 	struct mirror_set *ms = ti->private;
-
-	map_context->ll = bio_to_region(&ms->rh, bio);
+	struct dm_raid1_read_record *read_record = NULL;
 
 	if (rw == WRITE) {
+		/* Save region for mirror_end_io() handler */
+		map_context->ll = bio_to_region(&ms->rh, bio);
 		queue_bio(ms, bio, rw);
 		return DM_MAPIO_SUBMITTED;
 	}
@@ -1523,28 +1634,34 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
 	if (r < 0 && r != -EWOULDBLOCK)
 		return r;
 
-	if (r == -EWOULDBLOCK)	/* FIXME: ugly */
-		r = DM_MAPIO_SUBMITTED;
-
 	/*
-	 * We don't want to fast track a recovery just for a read
-	 * ahead. So we just let it silently fail.
-	 * FIXME: get rid of this.
+	 * If region is not in-sync queue the bio.
 	 */
-	if (!r && rw == READA)
-		return -EIO;
+	if (!r || (r == -EWOULDBLOCK)) {
+		if (rw == READA)
+			return -EWOULDBLOCK;
 
-	if (!r) {
-		/* Pass this io over to the daemon */
 		queue_bio(ms, bio, rw);
 		return DM_MAPIO_SUBMITTED;
 	}
 
+	/*
+	 * The region is in-sync and we can perform reads directly.
+	 * Store enough information so we can retry if it fails.
+	 */
 	m = choose_mirror(ms, bio->bi_sector);
-	if (!m)
+	if (unlikely(!m))
 		return -EIO;
 
-	map_bio(ms, m, bio);
+	read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO);
+	if (likely(read_record)) {
+		dm_bio_record(&read_record->details, bio);
+		map_context->ptr = read_record;
+		read_record->m = m;
+	}
+
+	map_bio(m, bio);
+
 	return DM_MAPIO_REMAPPED;
 }
 
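mirror_map() captures the bio's pre-remap state with dm_bio_record() into the mempool-backed read record; if the read later fails, mirror_end_io() uses dm_bio_restore() to undo the remap before requeueing the bio against another mirror. A minimal userspace sketch of that record/restore pairing (stand-in types, not the kernel's dm_bio_details):

```c
#include <stdio.h>

struct request {		/* stand-in for the remappable bio fields */
	long sector;
	int dev;
};

struct saved_details {		/* stand-in for struct dm_bio_details */
	long sector;
	int dev;
};

static void record(struct saved_details *d, const struct request *rq)
{
	d->sector = rq->sector;
	d->dev = rq->dev;
}

static void restore(const struct saved_details *d, struct request *rq)
{
	rq->sector = d->sector;
	rq->dev = d->dev;
}

int main(void)
{
	struct request rq = { .sector = 100, .dev = -1 };
	struct saved_details details;

	record(&details, &rq);	/* before remapping, as in mirror_map() */

	rq.dev = 0;		/* remap to mirror 0 ... */
	rq.sector += 2048;	/* ... at that mirror's offset */

	/*
	 * The read on mirror 0 fails: undo the remap so the request can
	 * be requeued and remapped to another mirror, as mirror_end_io()
	 * does with dm_bio_restore() before queue_bio().
	 */
	restore(&details, &rq);
	printf("sector=%ld dev=%d\n", rq.sector, rq.dev);
	return 0;
}
```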
@@ -1553,15 +1670,64 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 {
 	int rw = bio_rw(bio);
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
-	region_t region = map_context->ll;
+	struct mirror *m = NULL;
+	struct dm_bio_details *bd = NULL;
+	struct dm_raid1_read_record *read_record = map_context->ptr;
 
 	/*
 	 * We need to dec pending if this was a write.
 	 */
-	if (rw == WRITE)
-		rh_dec(&ms->rh, region);
+	if (rw == WRITE) {
+		rh_dec(&ms->rh, map_context->ll);
+		return error;
+	}
 
-	return 0;
+	if (error == -EOPNOTSUPP)
+		goto out;
+
+	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
+		goto out;
+
+	if (unlikely(error)) {
+		if (!read_record) {
+			/*
+			 * There wasn't enough memory to record necessary
+			 * information for a retry or there was no other
+			 * mirror in-sync.
+			 */
+			DMERR_LIMIT("Mirror read failed.");
+			return -EIO;
+		}
+
+		m = read_record->m;
+		DMERR("Mirror read failed from %s. Trying alternative device.",
+		      m->dev->name);
+
+		fail_mirror(m, DM_RAID1_READ_ERROR);
+
+		/*
+		 * A failed read is requeued for another attempt using an intact
+		 * mirror.
+		 */
+		if (default_ok(m) || mirror_available(ms, bio)) {
+			bd = &read_record->details;
+
+			dm_bio_restore(bd, bio);
+			mempool_free(read_record, ms->read_record_pool);
+			map_context->ptr = NULL;
+			queue_bio(ms, bio, rw);
+			return 1;
+		}
+		DMERR("All replicated volumes dead, failing I/O");
+	}
+
+out:
+	if (read_record) {
+		mempool_free(read_record, ms->read_record_pool);
+		map_context->ptr = NULL;
+	}
+
+	return error;
 }
 
 static void mirror_presuspend(struct dm_target *ti)