|
@@ -215,6 +215,55 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
|
|
|
}
|
|
|
EXPORT_SYMBOL_GPL(bio_clone_mddev);
|
|
|
|
|
|
+void md_trim_bio(struct bio *bio, int offset, int size)
|
|
|
+{
|
|
|
+ /* 'bio' is a cloned bio which we need to trim to match
|
|
|
+ * the given offset and size.
|
|
|
+ * This requires adjusting bi_sector, bi_size, and bi_io_vec
|
|
|
+ */
|
|
|
+ int i;
|
|
|
+ struct bio_vec *bvec;
|
|
|
+ int sofar = 0;
|
|
|
+
|
|
|
+ size <<= 9;
|
|
|
+ if (offset == 0 && size == bio->bi_size)
|
|
|
+ return;
|
|
|
+
|
|
|
+ bio->bi_sector += offset;
|
|
|
+ bio->bi_size = size;
|
|
|
+ offset <<= 9;
|
|
|
+ clear_bit(BIO_SEG_VALID, &bio->bi_flags);
|
|
|
+
|
|
|
+ while (bio->bi_idx < bio->bi_vcnt &&
|
|
|
+ bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
|
|
|
+ /* remove this whole bio_vec */
|
|
|
+ offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
|
|
|
+ bio->bi_idx++;
|
|
|
+ }
|
|
|
+ if (bio->bi_idx < bio->bi_vcnt) {
|
|
|
+ bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
|
|
|
+ bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
|
|
|
+ }
|
|
|
+ /* avoid any complications with bi_idx being non-zero */
|
|
|
+ if (bio->bi_idx) {
|
|
|
+ memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
|
|
|
+ (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
|
|
|
+ bio->bi_vcnt -= bio->bi_idx;
|
|
|
+ bio->bi_idx = 0;
|
|
|
+ }
|
|
|
+ /* Make sure vcnt and last bv are not too big */
|
|
|
+ bio_for_each_segment(bvec, bio, i) {
|
|
|
+ if (sofar + bvec->bv_len > size)
|
|
|
+ bvec->bv_len = size - sofar;
|
|
|
+ if (bvec->bv_len == 0) {
|
|
|
+ bio->bi_vcnt = i;
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ sofar += bvec->bv_len;
|
|
|
+ }
|
|
|
+}
|
|
|
+EXPORT_SYMBOL_GPL(md_trim_bio);
|
|
|
+
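A minimal usage sketch, not part of the patch: a personality that wants to resubmit only part of a request can clone the original bio with bio_clone_mddev() and then trim the clone to the sub-range. Here orig_bio, offset, sectors and mddev are placeholder variables.

	struct bio *split = bio_clone_mddev(orig_bio, GFP_NOIO, mddev);

	if (split) {
		/* keep only 'sectors' sectors, starting 'offset' sectors in */
		md_trim_bio(split, offset, sectors);
		generic_make_request(split);
	}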
|
|
|
/*
|
|
|
* We have a system wide 'event count' that is incremented
|
|
|
* on any 'interesting' event, and readers of /proc/mdstat
|
|
@@ -757,6 +806,10 @@ static void free_disk_sb(mdk_rdev_t * rdev)
|
|
|
rdev->sb_start = 0;
|
|
|
rdev->sectors = 0;
|
|
|
}
|
|
|
+ if (rdev->bb_page) {
|
|
|
+ put_page(rdev->bb_page);
|
|
|
+ rdev->bb_page = NULL;
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
|
|
@@ -1025,7 +1078,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
|
|
|
ret = -EINVAL;
|
|
|
|
|
|
bdevname(rdev->bdev, b);
|
|
|
- sb = (mdp_super_t*)page_address(rdev->sb_page);
|
|
|
+ sb = page_address(rdev->sb_page);
|
|
|
|
|
|
if (sb->md_magic != MD_SB_MAGIC) {
|
|
|
printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
|
|
@@ -1054,6 +1107,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
|
|
|
rdev->preferred_minor = sb->md_minor;
|
|
|
rdev->data_offset = 0;
|
|
|
rdev->sb_size = MD_SB_BYTES;
|
|
|
+ rdev->badblocks.shift = -1;
|
|
|
|
|
|
if (sb->level == LEVEL_MULTIPATH)
|
|
|
rdev->desc_nr = -1;
|
|
@@ -1064,7 +1118,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
|
|
|
ret = 1;
|
|
|
} else {
|
|
|
__u64 ev1, ev2;
|
|
|
- mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
|
|
|
+ mdp_super_t *refsb = page_address(refdev->sb_page);
|
|
|
if (!uuid_equal(refsb, sb)) {
|
|
|
printk(KERN_WARNING "md: %s has different UUID to %s\n",
|
|
|
b, bdevname(refdev->bdev,b2));
|
|
@@ -1099,7 +1153,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
|
|
|
static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
|
|
|
{
|
|
|
mdp_disk_t *desc;
|
|
|
- mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
|
|
|
+ mdp_super_t *sb = page_address(rdev->sb_page);
|
|
|
__u64 ev1 = md_event(sb);
|
|
|
|
|
|
rdev->raid_disk = -1;
|
|
@@ -1230,7 +1284,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
|
|
|
|
|
|
rdev->sb_size = MD_SB_BYTES;
|
|
|
|
|
|
- sb = (mdp_super_t*)page_address(rdev->sb_page);
|
|
|
+ sb = page_address(rdev->sb_page);
|
|
|
|
|
|
memset(sb, 0, sizeof(*sb));
|
|
|
|
|
@@ -1395,6 +1449,8 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
|
|
|
return cpu_to_le32(csum);
|
|
|
}
|
|
|
|
|
|
+static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
|
|
|
+ int acknowledged);
|
|
|
static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
|
|
|
{
|
|
|
struct mdp_superblock_1 *sb;
|
|
@@ -1435,7 +1491,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
|
|
|
if (ret) return ret;
|
|
|
|
|
|
|
|
|
- sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
|
|
|
+ sb = page_address(rdev->sb_page);
|
|
|
|
|
|
if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
|
|
|
sb->major_version != cpu_to_le32(1) ||
|
|
@@ -1473,12 +1529,52 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
|
|
|
else
|
|
|
rdev->desc_nr = le32_to_cpu(sb->dev_number);
|
|
|
|
|
|
+ if (!rdev->bb_page) {
|
|
|
+ rdev->bb_page = alloc_page(GFP_KERNEL);
|
|
|
+ if (!rdev->bb_page)
|
|
|
+ return -ENOMEM;
|
|
|
+ }
|
|
|
+ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
|
|
|
+ rdev->badblocks.count == 0) {
|
|
|
+ /* need to load the bad block list.
|
|
|
+ * Currently we limit it to one page.
|
|
|
+ */
|
|
|
+ s32 offset;
|
|
|
+ sector_t bb_sector;
|
|
|
+ u64 *bbp;
|
|
|
+ int i;
|
|
|
+ int sectors = le16_to_cpu(sb->bblog_size);
|
|
|
+ if (sectors > (PAGE_SIZE / 512))
|
|
|
+ return -EINVAL;
|
|
|
+ offset = le32_to_cpu(sb->bblog_offset);
|
|
|
+ if (offset == 0)
|
|
|
+ return -EINVAL;
|
|
|
+ bb_sector = (long long)offset;
|
|
|
+ if (!sync_page_io(rdev, bb_sector, sectors << 9,
|
|
|
+ rdev->bb_page, READ, true))
|
|
|
+ return -EIO;
|
|
|
+ bbp = (u64 *)page_address(rdev->bb_page);
|
|
|
+ rdev->badblocks.shift = sb->bblog_shift;
|
|
|
+ for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
|
|
|
+ u64 bb = le64_to_cpu(*bbp);
|
|
|
+ int count = bb & (0x3ff);
|
|
|
+ u64 sector = bb >> 10;
|
|
|
+ sector <<= sb->bblog_shift;
|
|
|
+ count <<= sb->bblog_shift;
|
|
|
+ if (bb + 1 == 0)
|
|
|
+ break;
|
|
|
+ if (md_set_badblocks(&rdev->badblocks,
|
|
|
+ sector, count, 1) == 0)
|
|
|
+ return -EINVAL;
|
|
|
+ }
|
|
|
+ } else if (sb->bblog_offset == 0)
|
|
|
+ rdev->badblocks.shift = -1;
|
|
|
+
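For reference, the on-disk log entry decoded by the loop above is a single little-endian 64-bit word: the low 10 bits hold the length and the remaining bits the start sector, both already shifted by bblog_shift. A sketch of that packing with illustrative helpers (these helpers are not part of the patch):

	static inline __le64 bb_disk_encode(u64 sector, int len)
	{
		/* 'sector' and 'len' are already >> bblog_shift */
		return cpu_to_le64((sector << 10) | (len & 0x3ff));
	}

	static inline int bb_disk_decode(__le64 v, u64 *sector)
	{
		u64 bb = le64_to_cpu(v);

		*sector = bb >> 10;
		return bb & 0x3ff;	/* length */
	}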
|
|
|
if (!refdev) {
|
|
|
ret = 1;
|
|
|
} else {
|
|
|
__u64 ev1, ev2;
|
|
|
- struct mdp_superblock_1 *refsb =
|
|
|
- (struct mdp_superblock_1*)page_address(refdev->sb_page);
|
|
|
+ struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
|
|
|
|
|
|
if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
|
|
|
sb->level != refsb->level ||
|
|
@@ -1513,7 +1609,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
|
|
|
|
|
|
static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
|
|
|
{
|
|
|
- struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
|
|
|
+ struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
|
|
|
__u64 ev1 = le64_to_cpu(sb->events);
|
|
|
|
|
|
rdev->raid_disk = -1;
|
|
@@ -1619,13 +1715,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
|
|
|
int max_dev, i;
|
|
|
/* make rdev->sb match mddev and rdev data. */
|
|
|
|
|
|
- sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
|
|
|
+ sb = page_address(rdev->sb_page);
|
|
|
|
|
|
sb->feature_map = 0;
|
|
|
sb->pad0 = 0;
|
|
|
sb->recovery_offset = cpu_to_le64(0);
|
|
|
memset(sb->pad1, 0, sizeof(sb->pad1));
|
|
|
- memset(sb->pad2, 0, sizeof(sb->pad2));
|
|
|
memset(sb->pad3, 0, sizeof(sb->pad3));
|
|
|
|
|
|
sb->utime = cpu_to_le64((__u64)mddev->utime);
|
|
@@ -1665,6 +1760,40 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
|
|
|
sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
|
|
|
}
|
|
|
|
|
|
+ if (rdev->badblocks.count == 0)
|
|
|
+ /* Nothing to do for bad blocks */ ;
|
|
|
+ else if (sb->bblog_offset == 0)
|
|
|
+ /* Cannot record bad blocks on this device */
|
|
|
+ md_error(mddev, rdev);
|
|
|
+ else {
|
|
|
+ struct badblocks *bb = &rdev->badblocks;
|
|
|
+ u64 *bbp = (u64 *)page_address(rdev->bb_page);
|
|
|
+ u64 *p = bb->page;
|
|
|
+ sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
|
|
|
+ if (bb->changed) {
|
|
|
+ unsigned seq;
|
|
|
+
|
|
|
+retry:
|
|
|
+ seq = read_seqbegin(&bb->lock);
|
|
|
+
|
|
|
+ memset(bbp, 0xff, PAGE_SIZE);
|
|
|
+
|
|
|
+ for (i = 0 ; i < bb->count ; i++) {
|
|
|
+ u64 internal_bb = *p++;
|
|
|
+ u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
|
|
|
+ | BB_LEN(internal_bb));
|
|
|
+ *bbp++ = cpu_to_le64(store_bb);
|
|
|
+ }
|
|
|
+ if (read_seqretry(&bb->lock, seq))
|
|
|
+ goto retry;
|
|
|
+
|
|
|
+ bb->sector = (rdev->sb_start +
|
|
|
+ (int)le32_to_cpu(sb->bblog_offset));
|
|
|
+ bb->size = le16_to_cpu(sb->bblog_size);
|
|
|
+ bb->changed = 0;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
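The retry label above is the standard seqlock reader pattern: take a snapshot of the sequence count, copy the table, and start over if a writer raced with the copy. In isolation the pattern looks like this sketch:

	unsigned seq;

	do {
		seq = read_seqbegin(&bb->lock);
		/* ... copy bb->page into the superblock's bb_page ... */
	} while (read_seqretry(&bb->lock, seq));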
|
|
|
max_dev = 0;
|
|
|
list_for_each_entry(rdev2, &mddev->disks, same_set)
|
|
|
if (rdev2->desc_nr+1 > max_dev)
|
|
@@ -1724,7 +1853,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
|
|
|
num_sectors = max_sectors;
|
|
|
rdev->sb_start = sb_start;
|
|
|
}
|
|
|
- sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
|
|
|
+ sb = page_address(rdev->sb_page);
|
|
|
sb->data_size = cpu_to_le64(num_sectors);
|
|
|
sb->super_offset = rdev->sb_start;
|
|
|
sb->sb_csum = calc_sb_1_csum(sb);
|
|
@@ -1922,7 +2051,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
|
|
|
bd_link_disk_holder(rdev->bdev, mddev->gendisk);
|
|
|
|
|
|
/* May as well allow recovery to be retried once */
|
|
|
- mddev->recovery_disabled = 0;
|
|
|
+ mddev->recovery_disabled++;
|
|
|
|
|
|
return 0;
|
|
|
|
|
@@ -1953,6 +2082,9 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
|
|
|
sysfs_remove_link(&rdev->kobj, "block");
|
|
|
sysfs_put(rdev->sysfs_state);
|
|
|
rdev->sysfs_state = NULL;
|
|
|
+ kfree(rdev->badblocks.page);
|
|
|
+ rdev->badblocks.count = 0;
|
|
|
+ rdev->badblocks.page = NULL;
|
|
|
/* We need to delay this, otherwise we can deadlock when
|
|
|
* writing to 'remove' to "dev/state". We also need
|
|
|
* to delay it due to rcu usage.
|
|
@@ -2127,10 +2259,10 @@ static void print_rdev(mdk_rdev_t *rdev, int major_version)
|
|
|
printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
|
|
|
switch (major_version) {
|
|
|
case 0:
|
|
|
- print_sb_90((mdp_super_t*)page_address(rdev->sb_page));
|
|
|
+ print_sb_90(page_address(rdev->sb_page));
|
|
|
break;
|
|
|
case 1:
|
|
|
- print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page));
|
|
|
+ print_sb_1(page_address(rdev->sb_page));
|
|
|
break;
|
|
|
}
|
|
|
} else
|
|
@@ -2194,6 +2326,7 @@ static void md_update_sb(mddev_t * mddev, int force_change)
|
|
|
mdk_rdev_t *rdev;
|
|
|
int sync_req;
|
|
|
int nospares = 0;
|
|
|
+ int any_badblocks_changed = 0;
|
|
|
|
|
|
repeat:
|
|
|
/* First make sure individual recovery_offsets are correct */
|
|
@@ -2208,8 +2341,18 @@ repeat:
|
|
|
if (!mddev->persistent) {
|
|
|
clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
|
|
|
clear_bit(MD_CHANGE_DEVS, &mddev->flags);
|
|
|
- if (!mddev->external)
|
|
|
+ if (!mddev->external) {
|
|
|
clear_bit(MD_CHANGE_PENDING, &mddev->flags);
|
|
|
+ list_for_each_entry(rdev, &mddev->disks, same_set) {
|
|
|
+ if (rdev->badblocks.changed) {
|
|
|
+ md_ack_all_badblocks(&rdev->badblocks);
|
|
|
+ md_error(mddev, rdev);
|
|
|
+ }
|
|
|
+ clear_bit(Blocked, &rdev->flags);
|
|
|
+ clear_bit(BlockedBadBlocks, &rdev->flags);
|
|
|
+ wake_up(&rdev->blocked_wait);
|
|
|
+ }
|
|
|
+ }
|
|
|
wake_up(&mddev->sb_wait);
|
|
|
return;
|
|
|
}
|
|
@@ -2265,6 +2408,14 @@ repeat:
|
|
|
MD_BUG();
|
|
|
mddev->events --;
|
|
|
}
|
|
|
+
|
|
|
+ list_for_each_entry(rdev, &mddev->disks, same_set) {
|
|
|
+ if (rdev->badblocks.changed)
|
|
|
+ any_badblocks_changed++;
|
|
|
+ if (test_bit(Faulty, &rdev->flags))
|
|
|
+ set_bit(FaultRecorded, &rdev->flags);
|
|
|
+ }
|
|
|
+
|
|
|
sync_sbs(mddev, nospares);
|
|
|
spin_unlock_irq(&mddev->write_lock);
|
|
|
|
|
@@ -2290,6 +2441,13 @@ repeat:
|
|
|
bdevname(rdev->bdev,b),
|
|
|
(unsigned long long)rdev->sb_start);
|
|
|
rdev->sb_events = mddev->events;
|
|
|
+ if (rdev->badblocks.size) {
|
|
|
+ md_super_write(mddev, rdev,
|
|
|
+ rdev->badblocks.sector,
|
|
|
+ rdev->badblocks.size << 9,
|
|
|
+ rdev->bb_page);
|
|
|
+ rdev->badblocks.size = 0;
|
|
|
+ }
|
|
|
|
|
|
} else
|
|
|
dprintk(")\n");
|
|
@@ -2313,6 +2471,15 @@ repeat:
|
|
|
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
|
|
|
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
|
|
|
|
|
|
+ list_for_each_entry(rdev, &mddev->disks, same_set) {
|
|
|
+ if (test_and_clear_bit(FaultRecorded, &rdev->flags))
|
|
|
+ clear_bit(Blocked, &rdev->flags);
|
|
|
+
|
|
|
+ if (any_badblocks_changed)
|
|
|
+ md_ack_all_badblocks(&rdev->badblocks);
|
|
|
+ clear_bit(BlockedBadBlocks, &rdev->flags);
|
|
|
+ wake_up(&rdev->blocked_wait);
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
/* words written to sysfs files may, or may not, be \n terminated.
|
|
@@ -2347,7 +2514,8 @@ state_show(mdk_rdev_t *rdev, char *page)
|
|
|
char *sep = "";
|
|
|
size_t len = 0;
|
|
|
|
|
|
- if (test_bit(Faulty, &rdev->flags)) {
|
|
|
+ if (test_bit(Faulty, &rdev->flags) ||
|
|
|
+ rdev->badblocks.unacked_exist) {
|
|
|
len+= sprintf(page+len, "%sfaulty",sep);
|
|
|
sep = ",";
|
|
|
}
|
|
@@ -2359,7 +2527,8 @@ state_show(mdk_rdev_t *rdev, char *page)
|
|
|
len += sprintf(page+len, "%swrite_mostly",sep);
|
|
|
sep = ",";
|
|
|
}
|
|
|
- if (test_bit(Blocked, &rdev->flags)) {
|
|
|
+ if (test_bit(Blocked, &rdev->flags) ||
|
|
|
+ rdev->badblocks.unacked_exist) {
|
|
|
len += sprintf(page+len, "%sblocked", sep);
|
|
|
sep = ",";
|
|
|
}
|
|
@@ -2368,6 +2537,10 @@ state_show(mdk_rdev_t *rdev, char *page)
|
|
|
len += sprintf(page+len, "%sspare", sep);
|
|
|
sep = ",";
|
|
|
}
|
|
|
+ if (test_bit(WriteErrorSeen, &rdev->flags)) {
|
|
|
+ len += sprintf(page+len, "%swrite_error", sep);
|
|
|
+ sep = ",";
|
|
|
+ }
|
|
|
return len+sprintf(page+len, "\n");
|
|
|
}
|
|
|
|
|
@@ -2375,13 +2548,15 @@ static ssize_t
|
|
|
state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
|
|
|
{
|
|
|
/* can write
|
|
|
- * faulty - simulates and error
|
|
|
+ * faulty - simulates an error
|
|
|
* remove - disconnects the device
|
|
|
* writemostly - sets write_mostly
|
|
|
* -writemostly - clears write_mostly
|
|
|
- * blocked - sets the Blocked flag
|
|
|
- * -blocked - clears the Blocked flag
|
|
|
+ * blocked - sets the Blocked flags
|
|
|
+ * -blocked - clears the Blocked and possibly simulates an error
|
|
|
* insync - sets Insync providing device isn't active
|
|
|
+ * write_error - sets WriteErrorSeen
|
|
|
+ * -write_error - clears WriteErrorSeen
|
|
|
*/
|
|
|
int err = -EINVAL;
|
|
|
if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
|
|
@@ -2408,7 +2583,15 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
|
|
|
set_bit(Blocked, &rdev->flags);
|
|
|
err = 0;
|
|
|
} else if (cmd_match(buf, "-blocked")) {
|
|
|
+ if (!test_bit(Faulty, &rdev->flags) &&
|
|
|
+ test_bit(BlockedBadBlocks, &rdev->flags)) {
|
|
|
+ /* metadata handler doesn't understand badblocks,
|
|
|
+ * so we need to fail the device
|
|
|
+ */
|
|
|
+ md_error(rdev->mddev, rdev);
|
|
|
+ }
|
|
|
clear_bit(Blocked, &rdev->flags);
|
|
|
+ clear_bit(BlockedBadBlocks, &rdev->flags);
|
|
|
wake_up(&rdev->blocked_wait);
|
|
|
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
|
|
|
md_wakeup_thread(rdev->mddev->thread);
|
|
@@ -2417,6 +2600,12 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
|
|
|
} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
|
|
|
set_bit(In_sync, &rdev->flags);
|
|
|
err = 0;
|
|
|
+ } else if (cmd_match(buf, "write_error")) {
|
|
|
+ set_bit(WriteErrorSeen, &rdev->flags);
|
|
|
+ err = 0;
|
|
|
+ } else if (cmd_match(buf, "-write_error")) {
|
|
|
+ clear_bit(WriteErrorSeen, &rdev->flags);
|
|
|
+ err = 0;
|
|
|
}
|
|
|
if (!err)
|
|
|
sysfs_notify_dirent_safe(rdev->sysfs_state);
|
|
@@ -2459,7 +2648,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
|
|
|
{
|
|
|
char *e;
|
|
|
int err;
|
|
|
- char nm[20];
|
|
|
int slot = simple_strtoul(buf, &e, 10);
|
|
|
if (strncmp(buf, "none", 4)==0)
|
|
|
slot = -1;
|
|
@@ -2482,8 +2670,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
|
|
|
hot_remove_disk(rdev->mddev, rdev->raid_disk);
|
|
|
if (err)
|
|
|
return err;
|
|
|
- sprintf(nm, "rd%d", rdev->raid_disk);
|
|
|
- sysfs_remove_link(&rdev->mddev->kobj, nm);
|
|
|
+ sysfs_unlink_rdev(rdev->mddev, rdev);
|
|
|
rdev->raid_disk = -1;
|
|
|
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
|
|
|
md_wakeup_thread(rdev->mddev->thread);
|
|
@@ -2522,8 +2709,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
|
|
|
return err;
|
|
|
} else
|
|
|
sysfs_notify_dirent_safe(rdev->sysfs_state);
|
|
|
- sprintf(nm, "rd%d", rdev->raid_disk);
|
|
|
- if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
|
|
|
+ if (sysfs_link_rdev(rdev->mddev, rdev))
|
|
|
/* failure here is OK */;
|
|
|
/* don't wakeup anyone, leave that to userspace. */
|
|
|
} else {
|
|
@@ -2712,6 +2898,39 @@ static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t le
|
|
|
static struct rdev_sysfs_entry rdev_recovery_start =
|
|
|
__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
|
|
|
|
|
|
+
|
|
|
+static ssize_t
|
|
|
+badblocks_show(struct badblocks *bb, char *page, int unack);
|
|
|
+static ssize_t
|
|
|
+badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
|
|
|
+
|
|
|
+static ssize_t bb_show(mdk_rdev_t *rdev, char *page)
|
|
|
+{
|
|
|
+ return badblocks_show(&rdev->badblocks, page, 0);
|
|
|
+}
|
|
|
+static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len)
|
|
|
+{
|
|
|
+ int rv = badblocks_store(&rdev->badblocks, page, len, 0);
|
|
|
+ /* Maybe that ack was all we needed */
|
|
|
+ if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
|
|
|
+ wake_up(&rdev->blocked_wait);
|
|
|
+ return rv;
|
|
|
+}
|
|
|
+static struct rdev_sysfs_entry rdev_bad_blocks =
|
|
|
+__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
|
|
|
+
|
|
|
+
|
|
|
+static ssize_t ubb_show(mdk_rdev_t *rdev, char *page)
|
|
|
+{
|
|
|
+ return badblocks_show(&rdev->badblocks, page, 1);
|
|
|
+}
|
|
|
+static ssize_t ubb_store(mdk_rdev_t *rdev, const char *page, size_t len)
|
|
|
+{
|
|
|
+ return badblocks_store(&rdev->badblocks, page, len, 1);
|
|
|
+}
|
|
|
+static struct rdev_sysfs_entry rdev_unack_bad_blocks =
|
|
|
+__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
|
|
|
+
|
|
|
static struct attribute *rdev_default_attrs[] = {
|
|
|
&rdev_state.attr,
|
|
|
&rdev_errors.attr,
|
|
@@ -2719,6 +2938,8 @@ static struct attribute *rdev_default_attrs[] = {
|
|
|
&rdev_offset.attr,
|
|
|
&rdev_size.attr,
|
|
|
&rdev_recovery_start.attr,
|
|
|
+ &rdev_bad_blocks.attr,
|
|
|
+ &rdev_unack_bad_blocks.attr,
|
|
|
NULL,
|
|
|
};
|
|
|
static ssize_t
|
|
@@ -2782,7 +3003,7 @@ static struct kobj_type rdev_ktype = {
|
|
|
.default_attrs = rdev_default_attrs,
|
|
|
};
|
|
|
|
|
|
-void md_rdev_init(mdk_rdev_t *rdev)
|
|
|
+int md_rdev_init(mdk_rdev_t *rdev)
|
|
|
{
|
|
|
rdev->desc_nr = -1;
|
|
|
rdev->saved_raid_disk = -1;
|
|
@@ -2792,12 +3013,27 @@ void md_rdev_init(mdk_rdev_t *rdev)
|
|
|
rdev->sb_events = 0;
|
|
|
rdev->last_read_error.tv_sec = 0;
|
|
|
rdev->last_read_error.tv_nsec = 0;
|
|
|
+ rdev->sb_loaded = 0;
|
|
|
+ rdev->bb_page = NULL;
|
|
|
atomic_set(&rdev->nr_pending, 0);
|
|
|
atomic_set(&rdev->read_errors, 0);
|
|
|
atomic_set(&rdev->corrected_errors, 0);
|
|
|
|
|
|
INIT_LIST_HEAD(&rdev->same_set);
|
|
|
init_waitqueue_head(&rdev->blocked_wait);
|
|
|
+
|
|
|
+ /* Add space to store bad block list.
|
|
|
+ * This reserves the space even on arrays where it cannot
|
|
|
+ * be used - I wonder if that matters
|
|
|
+ */
|
|
|
+ rdev->badblocks.count = 0;
|
|
|
+ rdev->badblocks.shift = 0;
|
|
|
+ rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
|
|
|
+ seqlock_init(&rdev->badblocks.lock);
|
|
|
+ if (rdev->badblocks.page == NULL)
|
|
|
+ return -ENOMEM;
|
|
|
+
|
|
|
+ return 0;
|
|
|
}
|
|
|
EXPORT_SYMBOL_GPL(md_rdev_init);
|
|
|
/*
|
|
@@ -2823,8 +3059,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
}
|
|
|
|
|
|
- md_rdev_init(rdev);
|
|
|
- if ((err = alloc_disk_sb(rdev)))
|
|
|
+ err = md_rdev_init(rdev);
|
|
|
+ if (err)
|
|
|
+ goto abort_free;
|
|
|
+ err = alloc_disk_sb(rdev);
|
|
|
+ if (err)
|
|
|
goto abort_free;
|
|
|
|
|
|
err = lock_rdev(rdev, newdev, super_format == -2);
|
|
@@ -2860,15 +3099,17 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
|
|
|
goto abort_free;
|
|
|
}
|
|
|
}
|
|
|
+ if (super_format == -1)
|
|
|
+ /* hot-add for 0.90, or non-persistent: so no badblocks */
|
|
|
+ rdev->badblocks.shift = -1;
|
|
|
|
|
|
return rdev;
|
|
|
|
|
|
abort_free:
|
|
|
- if (rdev->sb_page) {
|
|
|
- if (rdev->bdev)
|
|
|
- unlock_rdev(rdev);
|
|
|
- free_disk_sb(rdev);
|
|
|
- }
|
|
|
+ if (rdev->bdev)
|
|
|
+ unlock_rdev(rdev);
|
|
|
+ free_disk_sb(rdev);
|
|
|
+ kfree(rdev->badblocks.page);
|
|
|
kfree(rdev);
|
|
|
return ERR_PTR(err);
|
|
|
}
|
|
@@ -3149,15 +3390,13 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
|
|
|
}
|
|
|
|
|
|
list_for_each_entry(rdev, &mddev->disks, same_set) {
|
|
|
- char nm[20];
|
|
|
if (rdev->raid_disk < 0)
|
|
|
continue;
|
|
|
if (rdev->new_raid_disk >= mddev->raid_disks)
|
|
|
rdev->new_raid_disk = -1;
|
|
|
if (rdev->new_raid_disk == rdev->raid_disk)
|
|
|
continue;
|
|
|
- sprintf(nm, "rd%d", rdev->raid_disk);
|
|
|
- sysfs_remove_link(&mddev->kobj, nm);
|
|
|
+ sysfs_unlink_rdev(mddev, rdev);
|
|
|
}
|
|
|
list_for_each_entry(rdev, &mddev->disks, same_set) {
|
|
|
if (rdev->raid_disk < 0)
|
|
@@ -3168,11 +3407,10 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
|
|
|
if (rdev->raid_disk < 0)
|
|
|
clear_bit(In_sync, &rdev->flags);
|
|
|
else {
|
|
|
- char nm[20];
|
|
|
- sprintf(nm, "rd%d", rdev->raid_disk);
|
|
|
- if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
|
|
|
- printk("md: cannot register %s for %s after level change\n",
|
|
|
- nm, mdname(mddev));
|
|
|
+ if (sysfs_link_rdev(mddev, rdev))
|
|
|
+ printk(KERN_WARNING "md: cannot register rd%d"
|
|
|
+ " for %s after level change\n",
|
|
|
+ rdev->raid_disk, mdname(mddev));
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -4504,7 +4742,8 @@ int md_run(mddev_t *mddev)
|
|
|
}
|
|
|
|
|
|
if (mddev->bio_set == NULL)
|
|
|
- mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev));
|
|
|
+ mddev->bio_set = bioset_create(BIO_POOL_SIZE,
|
|
|
+ sizeof(mddev_t *));
|
|
|
|
|
|
spin_lock(&pers_lock);
|
|
|
pers = find_pers(mddev->level, mddev->clevel);
|
|
@@ -4621,12 +4860,9 @@ int md_run(mddev_t *mddev)
|
|
|
smp_wmb();
|
|
|
mddev->ready = 1;
|
|
|
list_for_each_entry(rdev, &mddev->disks, same_set)
|
|
|
- if (rdev->raid_disk >= 0) {
|
|
|
- char nm[20];
|
|
|
- sprintf(nm, "rd%d", rdev->raid_disk);
|
|
|
- if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
|
|
|
+ if (rdev->raid_disk >= 0)
|
|
|
+ if (sysfs_link_rdev(mddev, rdev))
|
|
|
/* failure here is OK */;
|
|
|
- }
|
|
|
|
|
|
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
|
|
|
|
|
@@ -4854,11 +5090,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
|
|
|
sysfs_notify_dirent_safe(mddev->sysfs_state);
|
|
|
|
|
|
list_for_each_entry(rdev, &mddev->disks, same_set)
|
|
|
- if (rdev->raid_disk >= 0) {
|
|
|
- char nm[20];
|
|
|
- sprintf(nm, "rd%d", rdev->raid_disk);
|
|
|
- sysfs_remove_link(&mddev->kobj, nm);
|
|
|
- }
|
|
|
+ if (rdev->raid_disk >= 0)
|
|
|
+ sysfs_unlink_rdev(mddev, rdev);
|
|
|
|
|
|
set_capacity(disk, 0);
|
|
|
mutex_unlock(&mddev->open_mutex);
|
|
@@ -6198,18 +6431,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
|
|
|
if (!rdev || test_bit(Faulty, &rdev->flags))
|
|
|
return;
|
|
|
|
|
|
- if (mddev->external)
|
|
|
- set_bit(Blocked, &rdev->flags);
|
|
|
-/*
|
|
|
- dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
|
|
|
- mdname(mddev),
|
|
|
- MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
|
|
|
- __builtin_return_address(0),__builtin_return_address(1),
|
|
|
- __builtin_return_address(2),__builtin_return_address(3));
|
|
|
-*/
|
|
|
- if (!mddev->pers)
|
|
|
- return;
|
|
|
- if (!mddev->pers->error_handler)
|
|
|
+ if (!mddev->pers || !mddev->pers->error_handler)
|
|
|
return;
|
|
|
mddev->pers->error_handler(mddev,rdev);
|
|
|
if (mddev->degraded)
|
|
@@ -6933,11 +7155,14 @@ void md_do_sync(mddev_t *mddev)
|
|
|
atomic_add(sectors, &mddev->recovery_active);
|
|
|
}
|
|
|
|
|
|
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
|
|
|
+ break;
|
|
|
+
|
|
|
j += sectors;
|
|
|
if (j>1) mddev->curr_resync = j;
|
|
|
mddev->curr_mark_cnt = io_sectors;
|
|
|
if (last_check == 0)
|
|
|
- /* this is the earliers that rebuilt will be
|
|
|
+ /* this is the earliest that rebuild will be
|
|
|
* visible in /proc/mdstat
|
|
|
*/
|
|
|
md_new_event(mddev);
|
|
@@ -6946,10 +7171,6 @@ void md_do_sync(mddev_t *mddev)
|
|
|
continue;
|
|
|
|
|
|
last_check = io_sectors;
|
|
|
-
|
|
|
- if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
|
|
|
- break;
|
|
|
-
|
|
|
repeat:
|
|
|
if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
|
|
|
/* step marks */
|
|
@@ -7067,29 +7288,23 @@ static int remove_and_add_spares(mddev_t *mddev)
|
|
|
atomic_read(&rdev->nr_pending)==0) {
|
|
|
if (mddev->pers->hot_remove_disk(
|
|
|
mddev, rdev->raid_disk)==0) {
|
|
|
- char nm[20];
|
|
|
- sprintf(nm,"rd%d", rdev->raid_disk);
|
|
|
- sysfs_remove_link(&mddev->kobj, nm);
|
|
|
+ sysfs_unlink_rdev(mddev, rdev);
|
|
|
rdev->raid_disk = -1;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- if (mddev->degraded && !mddev->recovery_disabled) {
|
|
|
+ if (mddev->degraded) {
|
|
|
list_for_each_entry(rdev, &mddev->disks, same_set) {
|
|
|
if (rdev->raid_disk >= 0 &&
|
|
|
!test_bit(In_sync, &rdev->flags) &&
|
|
|
- !test_bit(Faulty, &rdev->flags) &&
|
|
|
- !test_bit(Blocked, &rdev->flags))
|
|
|
+ !test_bit(Faulty, &rdev->flags))
|
|
|
spares++;
|
|
|
if (rdev->raid_disk < 0
|
|
|
&& !test_bit(Faulty, &rdev->flags)) {
|
|
|
rdev->recovery_offset = 0;
|
|
|
if (mddev->pers->
|
|
|
hot_add_disk(mddev, rdev) == 0) {
|
|
|
- char nm[20];
|
|
|
- sprintf(nm, "rd%d", rdev->raid_disk);
|
|
|
- if (sysfs_create_link(&mddev->kobj,
|
|
|
- &rdev->kobj, nm))
|
|
|
+ if (sysfs_link_rdev(mddev, rdev))
|
|
|
/* failure here is OK */;
|
|
|
spares++;
|
|
|
md_new_event(mddev);
|
|
@@ -7138,6 +7353,8 @@ static void reap_sync_thread(mddev_t *mddev)
|
|
|
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
|
|
|
sysfs_notify_dirent_safe(mddev->sysfs_action);
|
|
|
md_new_event(mddev);
|
|
|
+ if (mddev->event_work.func)
|
|
|
+ queue_work(md_misc_wq, &mddev->event_work);
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -7170,9 +7387,6 @@ void md_check_recovery(mddev_t *mddev)
|
|
|
if (mddev->bitmap)
|
|
|
bitmap_daemon_work(mddev);
|
|
|
|
|
|
- if (mddev->ro)
|
|
|
- return;
|
|
|
-
|
|
|
if (signal_pending(current)) {
|
|
|
if (mddev->pers->sync_request && !mddev->external) {
|
|
|
printk(KERN_INFO "md: %s in immediate safe mode\n",
|
|
@@ -7209,9 +7423,7 @@ void md_check_recovery(mddev_t *mddev)
|
|
|
atomic_read(&rdev->nr_pending)==0) {
|
|
|
if (mddev->pers->hot_remove_disk(
|
|
|
mddev, rdev->raid_disk)==0) {
|
|
|
- char nm[20];
|
|
|
- sprintf(nm,"rd%d", rdev->raid_disk);
|
|
|
- sysfs_remove_link(&mddev->kobj, nm);
|
|
|
+ sysfs_unlink_rdev(mddev, rdev);
|
|
|
rdev->raid_disk = -1;
|
|
|
}
|
|
|
}
|
|
@@ -7331,12 +7543,499 @@ void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
|
|
|
{
|
|
|
sysfs_notify_dirent_safe(rdev->sysfs_state);
|
|
|
wait_event_timeout(rdev->blocked_wait,
|
|
|
- !test_bit(Blocked, &rdev->flags),
|
|
|
+ !test_bit(Blocked, &rdev->flags) &&
|
|
|
+ !test_bit(BlockedBadBlocks, &rdev->flags),
|
|
|
msecs_to_jiffies(5000));
|
|
|
rdev_dec_pending(rdev, mddev);
|
|
|
}
|
|
|
EXPORT_SYMBOL(md_wait_for_blocked_rdev);
|
|
|
|
|
|
+
|
|
|
+/* Bad block management.
|
|
|
+ * We can record which blocks on each device are 'bad' and so just
|
|
|
+ * fail those blocks, or that stripe, rather than the whole device.
|
|
|
+ * Entries in the bad-block table are 64bits wide. This comprises:
|
|
|
+ * Length of bad-range, in sectors: 0-511 for lengths 1-512
|
|
|
+ * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
|
|
|
+ * A 'shift' can be set so that larger blocks are tracked and
|
|
|
+ * consequently larger devices can be covered.
|
|
|
+ * 'Acknowledged' flag - 1 bit. - the most significant bit.
|
|
|
+ *
|
|
|
+ * Locking of the bad-block table uses a seqlock so md_is_badblock
|
|
|
+ * might need to retry if it is very unlucky.
|
|
|
+ * We will sometimes want to check for bad blocks in a bi_end_io function,
|
|
|
+ * so we use the write_seqlock_irq variant.
|
|
|
+ *
|
|
|
+ * When looking for a bad block we specify a range and want to
|
|
|
+ * know if any block in the range is bad. So we binary-search
|
|
|
+ * to the last range that starts at-or-before the given endpoint,
|
|
|
+ * (or "before the sector after the target range")
|
|
|
+ * then see if it ends after the given start.
|
|
|
+ * We return
|
|
|
+ * 0 if there are no known bad blocks in the range
|
|
|
+ * 1 if there are known bad block which are all acknowledged
|
|
|
+ * -1 if there are bad blocks which have not yet been acknowledged in metadata.
|
|
|
+ * plus the start/length of the first bad section we overlap.
|
|
|
+ */
|
|
|
+int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
|
|
|
+ sector_t *first_bad, int *bad_sectors)
|
|
|
+{
|
|
|
+ int hi;
|
|
|
+ int lo = 0;
|
|
|
+ u64 *p = bb->page;
|
|
|
+ int rv = 0;
|
|
|
+ sector_t target = s + sectors;
|
|
|
+ unsigned seq;
|
|
|
+
|
|
|
+ if (bb->shift > 0) {
|
|
|
+ /* round the start down, and the end up */
|
|
|
+ s >>= bb->shift;
|
|
|
+ target += (1<<bb->shift) - 1;
|
|
|
+ target >>= bb->shift;
|
|
|
+ sectors = target - s;
|
|
|
+ }
|
|
|
+ /* 'target' is now the first block after the bad range */
|
|
|
+
|
|
|
+retry:
|
|
|
+ seq = read_seqbegin(&bb->lock);
|
|
|
+
|
|
|
+ hi = bb->count;
|
|
|
+
|
|
|
+ /* Binary search between lo and hi for 'target'
|
|
|
+ * i.e. for the last range that starts before 'target'
|
|
|
+ */
|
|
|
+ /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
|
|
|
+ * are known not to be the last range before target.
|
|
|
+ * VARIANT: hi-lo is the number of possible
|
|
|
+ * ranges, and decreases until it reaches 1
|
|
|
+ */
|
|
|
+ while (hi - lo > 1) {
|
|
|
+ int mid = (lo + hi) / 2;
|
|
|
+ sector_t a = BB_OFFSET(p[mid]);
|
|
|
+ if (a < target)
|
|
|
+ /* This could still be the one, earlier ranges
|
|
|
+ * could not. */
|
|
|
+ lo = mid;
|
|
|
+ else
|
|
|
+ /* This and later ranges are definitely out. */
|
|
|
+ hi = mid;
|
|
|
+ }
|
|
|
+ /* 'lo' might be the last that started before target, but 'hi' isn't */
|
|
|
+ if (hi > lo) {
|
|
|
+ /* need to check all range that end after 's' to see if
|
|
|
+ * any are unacknowledged.
|
|
|
+ */
|
|
|
+ while (lo >= 0 &&
|
|
|
+ BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
|
|
|
+ if (BB_OFFSET(p[lo]) < target) {
|
|
|
+ /* starts before the end, and finishes after
|
|
|
+ * the start, so they must overlap
|
|
|
+ */
|
|
|
+ if (rv != -1 && BB_ACK(p[lo]))
|
|
|
+ rv = 1;
|
|
|
+ else
|
|
|
+ rv = -1;
|
|
|
+ *first_bad = BB_OFFSET(p[lo]);
|
|
|
+ *bad_sectors = BB_LEN(p[lo]);
|
|
|
+ }
|
|
|
+ lo--;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if (read_seqretry(&bb->lock, seq))
|
|
|
+ goto retry;
|
|
|
+
|
|
|
+ return rv;
|
|
|
+}
|
|
|
+EXPORT_SYMBOL_GPL(md_is_badblock);
|
|
|
+
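The BB_OFFSET/BB_LEN/BB_ACK/BB_MAKE helpers used here are defined in md.h and are not part of this hunk. A plausible reconstruction, matching the layout described in the comment above (acknowledged flag in bit 63, a 54-bit start offset, and a 9-bit length-minus-one so lengths 1-512 fit), is:

	#define BB_LEN_MASK	(0x00000000000001FFULL)
	#define BB_OFFSET_MASK	(0x7FFFFFFFFFFFFE00ULL)
	#define BB_ACK_MASK	(0x8000000000000000ULL)
	#define BB_MAX_LEN	512
	#define BB_OFFSET(x)	(((x) & BB_OFFSET_MASK) >> 9)
	#define BB_LEN(x)	(((x) & BB_LEN_MASK) + 1)
	#define BB_ACK(x)	(!!((x) & BB_ACK_MASK))
	#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))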
|
|
|
+/*
|
|
|
+ * Add a range of bad blocks to the table.
|
|
|
+ * This might extend the table, or might contract it
|
|
|
+ * if two adjacent ranges can be merged.
|
|
|
+ * We binary-search to find the 'insertion' point, then
|
|
|
+ * decide how best to handle it.
|
|
|
+ */
|
|
|
+static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
|
|
|
+ int acknowledged)
|
|
|
+{
|
|
|
+ u64 *p;
|
|
|
+ int lo, hi;
|
|
|
+ int rv = 1;
|
|
|
+
|
|
|
+ if (bb->shift < 0)
|
|
|
+ /* badblocks are disabled */
|
|
|
+ return 0;
|
|
|
+
|
|
|
+ if (bb->shift) {
|
|
|
+ /* round the start down, and the end up */
|
|
|
+ sector_t next = s + sectors;
|
|
|
+ s >>= bb->shift;
|
|
|
+ next += (1<<bb->shift) - 1;
|
|
|
+ next >>= bb->shift;
|
|
|
+ sectors = next - s;
|
|
|
+ }
|
|
|
+
|
|
|
+ write_seqlock_irq(&bb->lock);
|
|
|
+
|
|
|
+ p = bb->page;
|
|
|
+ lo = 0;
|
|
|
+ hi = bb->count;
|
|
|
+ /* Find the last range that starts at-or-before 's' */
|
|
|
+ while (hi - lo > 1) {
|
|
|
+ int mid = (lo + hi) / 2;
|
|
|
+ sector_t a = BB_OFFSET(p[mid]);
|
|
|
+ if (a <= s)
|
|
|
+ lo = mid;
|
|
|
+ else
|
|
|
+ hi = mid;
|
|
|
+ }
|
|
|
+ if (hi > lo && BB_OFFSET(p[lo]) > s)
|
|
|
+ hi = lo;
|
|
|
+
|
|
|
+ if (hi > lo) {
|
|
|
+ /* we found a range that might merge with the start
|
|
|
+ * of our new range
|
|
|
+ */
|
|
|
+ sector_t a = BB_OFFSET(p[lo]);
|
|
|
+ sector_t e = a + BB_LEN(p[lo]);
|
|
|
+ int ack = BB_ACK(p[lo]);
|
|
|
+ if (e >= s) {
|
|
|
+ /* Yes, we can merge with a previous range */
|
|
|
+ if (s == a && s + sectors >= e)
|
|
|
+ /* new range covers old */
|
|
|
+ ack = acknowledged;
|
|
|
+ else
|
|
|
+ ack = ack && acknowledged;
|
|
|
+
|
|
|
+ if (e < s + sectors)
|
|
|
+ e = s + sectors;
|
|
|
+ if (e - a <= BB_MAX_LEN) {
|
|
|
+ p[lo] = BB_MAKE(a, e-a, ack);
|
|
|
+ s = e;
|
|
|
+ } else {
|
|
|
+ /* does not all fit in one range,
|
|
|
+ * make p[lo] maximal
|
|
|
+ */
|
|
|
+ if (BB_LEN(p[lo]) != BB_MAX_LEN)
|
|
|
+ p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
|
|
|
+ s = a + BB_MAX_LEN;
|
|
|
+ }
|
|
|
+ sectors = e - s;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (sectors && hi < bb->count) {
|
|
|
+ /* 'hi' points to the first range that starts after 's'.
|
|
|
+ * Maybe we can merge with the start of that range */
|
|
|
+ sector_t a = BB_OFFSET(p[hi]);
|
|
|
+ sector_t e = a + BB_LEN(p[hi]);
|
|
|
+ int ack = BB_ACK(p[hi]);
|
|
|
+ if (a <= s + sectors) {
|
|
|
+ /* merging is possible */
|
|
|
+ if (e <= s + sectors) {
|
|
|
+ /* full overlap */
|
|
|
+ e = s + sectors;
|
|
|
+ ack = acknowledged;
|
|
|
+ } else
|
|
|
+ ack = ack && acknowledged;
|
|
|
+
|
|
|
+ a = s;
|
|
|
+ if (e - a <= BB_MAX_LEN) {
|
|
|
+ p[hi] = BB_MAKE(a, e-a, ack);
|
|
|
+ s = e;
|
|
|
+ } else {
|
|
|
+ p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
|
|
|
+ s = a + BB_MAX_LEN;
|
|
|
+ }
|
|
|
+ sectors = e - s;
|
|
|
+ lo = hi;
|
|
|
+ hi++;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (sectors == 0 && hi < bb->count) {
|
|
|
+ /* we might be able to combine lo and hi */
|
|
|
+ /* Note: 's' is at the end of 'lo' */
|
|
|
+ sector_t a = BB_OFFSET(p[hi]);
|
|
|
+ int lolen = BB_LEN(p[lo]);
|
|
|
+ int hilen = BB_LEN(p[hi]);
|
|
|
+ int newlen = lolen + hilen - (s - a);
|
|
|
+ if (s >= a && newlen < BB_MAX_LEN) {
|
|
|
+ /* yes, we can combine them */
|
|
|
+ int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
|
|
|
+ p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
|
|
|
+ memmove(p + hi, p + hi + 1,
|
|
|
+ (bb->count - hi - 1) * 8);
|
|
|
+ bb->count--;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ while (sectors) {
|
|
|
+ /* didn't merge (it all).
|
|
|
+ * Need to add a range just before 'hi' */
|
|
|
+ if (bb->count >= MD_MAX_BADBLOCKS) {
|
|
|
+ /* No room for more */
|
|
|
+ rv = 0;
|
|
|
+ break;
|
|
|
+ } else {
|
|
|
+ int this_sectors = sectors;
|
|
|
+ memmove(p + hi + 1, p + hi,
|
|
|
+ (bb->count - hi) * 8);
|
|
|
+ bb->count++;
|
|
|
+
|
|
|
+ if (this_sectors > BB_MAX_LEN)
|
|
|
+ this_sectors = BB_MAX_LEN;
|
|
|
+ p[hi] = BB_MAKE(s, this_sectors, acknowledged);
|
|
|
+ sectors -= this_sectors;
|
|
|
+ s += this_sectors;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ bb->changed = 1;
|
|
|
+ if (!acknowledged)
|
|
|
+ bb->unacked_exist = 1;
|
|
|
+ write_sequnlock_irq(&bb->lock);
|
|
|
+
|
|
|
+ return rv;
|
|
|
+}
|
|
|
+
|
|
|
+int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors,
|
|
|
+ int acknowledged)
|
|
|
+{
|
|
|
+ int rv = md_set_badblocks(&rdev->badblocks,
|
|
|
+ s + rdev->data_offset, sectors, acknowledged);
|
|
|
+ if (rv) {
|
|
|
+ /* Make sure they get written out promptly */
|
|
|
+ set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
|
|
|
+ md_wakeup_thread(rdev->mddev->thread);
|
|
|
+ }
|
|
|
+ return rv;
|
|
|
+}
|
|
|
+EXPORT_SYMBOL_GPL(rdev_set_badblocks);
|
|
|
+
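A hedged sketch of the intended caller: when a personality sees a write error it records the range instead of failing the whole device, and only calls md_error() if the range could not be recorded. Everything below except rdev_set_badblocks() and md_error() is a placeholder name.

	if (!rdev_set_badblocks(rdev, failed_sector, failed_sectors, 0))
		/* could not record it - fail the whole device instead */
		md_error(mddev, rdev);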
|
|
|
+/*
|
|
|
+ * Remove a range of bad blocks from the table.
|
|
|
+ * This may involve extending the table if we split a region,
|
|
|
+ * but it must not fail. So if the table becomes full, we just
|
|
|
+ * drop the remove request.
|
|
|
+ */
|
|
|
+static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
|
|
|
+{
|
|
|
+ u64 *p;
|
|
|
+ int lo, hi;
|
|
|
+ sector_t target = s + sectors;
|
|
|
+ int rv = 0;
|
|
|
+
|
|
|
+ if (bb->shift > 0) {
|
|
|
+ /* When clearing we round the start up and the end down.
|
|
|
+ * This should not matter as the shift should align with
|
|
|
+ * the block size and no rounding should ever be needed.
|
|
|
+ * However it is better to think a block is bad when it
|
|
|
+ * isn't than to think a block is not bad when it is.
|
|
|
+ */
|
|
|
+ s += (1<<bb->shift) - 1;
|
|
|
+ s >>= bb->shift;
|
|
|
+ target >>= bb->shift;
|
|
|
+ sectors = target - s;
|
|
|
+ }
|
|
|
+
|
|
|
+ write_seqlock_irq(&bb->lock);
|
|
|
+
|
|
|
+ p = bb->page;
|
|
|
+ lo = 0;
|
|
|
+ hi = bb->count;
|
|
|
+ /* Find the last range that starts before 'target' */
|
|
|
+ while (hi - lo > 1) {
|
|
|
+ int mid = (lo + hi) / 2;
|
|
|
+ sector_t a = BB_OFFSET(p[mid]);
|
|
|
+ if (a < target)
|
|
|
+ lo = mid;
|
|
|
+ else
|
|
|
+ hi = mid;
|
|
|
+ }
|
|
|
+ if (hi > lo) {
|
|
|
+ /* p[lo] is the last range that could overlap the
|
|
|
+ * current range. Earlier ranges could also overlap,
|
|
|
+ * but only this one can overlap the end of the range.
|
|
|
+ */
|
|
|
+ if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
|
|
|
+ /* Partial overlap, leave the tail of this range */
|
|
|
+ int ack = BB_ACK(p[lo]);
|
|
|
+ sector_t a = BB_OFFSET(p[lo]);
|
|
|
+ sector_t end = a + BB_LEN(p[lo]);
|
|
|
+
|
|
|
+ if (a < s) {
|
|
|
+ /* we need to split this range */
|
|
|
+ if (bb->count >= MD_MAX_BADBLOCKS) {
|
|
|
+ rv = 0;
|
|
|
+ goto out;
|
|
|
+ }
|
|
|
+ memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
|
|
|
+ bb->count++;
|
|
|
+ p[lo] = BB_MAKE(a, s-a, ack);
|
|
|
+ lo++;
|
|
|
+ }
|
|
|
+ p[lo] = BB_MAKE(target, end - target, ack);
|
|
|
+ /* there is no longer an overlap */
|
|
|
+ hi = lo;
|
|
|
+ lo--;
|
|
|
+ }
|
|
|
+ while (lo >= 0 &&
|
|
|
+ BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
|
|
|
+ /* This range does overlap */
|
|
|
+ if (BB_OFFSET(p[lo]) < s) {
|
|
|
+ /* Keep the early parts of this range. */
|
|
|
+ int ack = BB_ACK(p[lo]);
|
|
|
+ sector_t start = BB_OFFSET(p[lo]);
|
|
|
+ p[lo] = BB_MAKE(start, s - start, ack);
|
|
|
+ /* now low doesn't overlap, so.. */
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ lo--;
|
|
|
+ }
|
|
|
+ /* 'lo' is strictly before, 'hi' is strictly after,
|
|
|
+ * anything between needs to be discarded
|
|
|
+ */
|
|
|
+ if (hi - lo > 1) {
|
|
|
+ memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
|
|
|
+ bb->count -= (hi - lo - 1);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ bb->changed = 1;
|
|
|
+out:
|
|
|
+ write_sequnlock_irq(&bb->lock);
|
|
|
+ return rv;
|
|
|
+}
|
|
|
+
|
|
|
+int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors)
|
|
|
+{
|
|
|
+ return md_clear_badblocks(&rdev->badblocks,
|
|
|
+ s + rdev->data_offset,
|
|
|
+ sectors);
|
|
|
+}
|
|
|
+EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
|
|
|
+
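And the converse, again only a sketch: once a bad range has been successfully re-written (for example during resync), the personality can drop it from the table; rewrite_succeeded, failed_sector and failed_sectors are placeholders.

	if (rewrite_succeeded)
		rdev_clear_badblocks(rdev, failed_sector, failed_sectors);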
|
|
|
+/*
|
|
|
+ * Acknowledge all bad blocks in a list.
|
|
|
+ * This only succeeds if ->changed is clear. It is used by
|
|
|
+ * in-kernel metadata updates
|
|
|
+ */
|
|
|
+void md_ack_all_badblocks(struct badblocks *bb)
|
|
|
+{
|
|
|
+ if (bb->page == NULL || bb->changed)
|
|
|
+ /* no point even trying */
|
|
|
+ return;
|
|
|
+ write_seqlock_irq(&bb->lock);
|
|
|
+
|
|
|
+ if (bb->changed == 0) {
|
|
|
+ u64 *p = bb->page;
|
|
|
+ int i;
|
|
|
+ for (i = 0; i < bb->count ; i++) {
|
|
|
+ if (!BB_ACK(p[i])) {
|
|
|
+ sector_t start = BB_OFFSET(p[i]);
|
|
|
+ int len = BB_LEN(p[i]);
|
|
|
+ p[i] = BB_MAKE(start, len, 1);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ bb->unacked_exist = 0;
|
|
|
+ }
|
|
|
+ write_sequnlock_irq(&bb->lock);
|
|
|
+}
|
|
|
+EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
|
|
|
+
|
|
|
+/* sysfs access to bad-blocks list.
|
|
|
+ * We present two files.
|
|
|
+ * 'bad-blocks' lists sector numbers and lengths of ranges that
|
|
|
+ * are recorded as bad. The list is truncated to fit within
|
|
|
+ * the one-page limit of sysfs.
|
|
|
+ * Writing "sector length" to this file adds an acknowledged
|
|
|
+ * bad block to the list.
|
|
|
+ * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
|
|
|
+ * been acknowledged. Writing to this file adds bad blocks
|
|
|
+ * without acknowledging them. This is largely for testing.
|
|
|
+ */
|
|
|
+
|
|
|
+static ssize_t
|
|
|
+badblocks_show(struct badblocks *bb, char *page, int unack)
|
|
|
+{
|
|
|
+ size_t len;
|
|
|
+ int i;
|
|
|
+ u64 *p = bb->page;
|
|
|
+ unsigned seq;
|
|
|
+
|
|
|
+ if (bb->shift < 0)
|
|
|
+ return 0;
|
|
|
+
|
|
|
+retry:
|
|
|
+ seq = read_seqbegin(&bb->lock);
|
|
|
+
|
|
|
+ len = 0;
|
|
|
+ i = 0;
|
|
|
+
|
|
|
+ while (len < PAGE_SIZE && i < bb->count) {
|
|
|
+ sector_t s = BB_OFFSET(p[i]);
|
|
|
+ unsigned int length = BB_LEN(p[i]);
|
|
|
+ int ack = BB_ACK(p[i]);
|
|
|
+ i++;
|
|
|
+
|
|
|
+ if (unack && ack)
|
|
|
+ continue;
|
|
|
+
|
|
|
+ len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
|
|
|
+ (unsigned long long)s << bb->shift,
|
|
|
+ length << bb->shift);
|
|
|
+ }
|
|
|
+ if (unack && len == 0)
|
|
|
+ bb->unacked_exist = 0;
|
|
|
+
|
|
|
+ if (read_seqretry(&bb->lock, seq))
|
|
|
+ goto retry;
|
|
|
+
|
|
|
+ return len;
|
|
|
+}
|
|
|
+
|
|
|
+#define DO_DEBUG 1
|
|
|
+
|
|
|
+static ssize_t
|
|
|
+badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack)
|
|
|
+{
|
|
|
+ unsigned long long sector;
|
|
|
+ int length;
|
|
|
+ char newline;
|
|
|
+#ifdef DO_DEBUG
|
|
|
+ /* Allow clearing via sysfs *only* for testing/debugging.
|
|
|
+ * Normally only a successful write may clear a badblock
|
|
|
+ */
|
|
|
+ int clear = 0;
|
|
|
+ if (page[0] == '-') {
|
|
|
+ clear = 1;
|
|
|
+ page++;
|
|
|
+ }
|
|
|
+#endif /* DO_DEBUG */
|
|
|
+
|
|
|
+ switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
|
|
|
+ case 3:
|
|
|
+ if (newline != '\n')
|
|
|
+ return -EINVAL;
|
|
|
+ case 2:
|
|
|
+ if (length <= 0)
|
|
|
+ return -EINVAL;
|
|
|
+ break;
|
|
|
+ default:
|
|
|
+ return -EINVAL;
|
|
|
+ }
|
|
|
+
|
|
|
+#ifdef DO_DEBUG
|
|
|
+ if (clear) {
|
|
|
+ md_clear_badblocks(bb, sector, length);
|
|
|
+ return len;
|
|
|
+ }
|
|
|
+#endif /* DO_DEBUG */
|
|
|
+ if (md_set_badblocks(bb, sector, length, !unack))
|
|
|
+ return len;
|
|
|
+ else
|
|
|
+ return -ENOSPC;
|
|
|
+}
|
|
|
+
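From user space the two attributes appear under the per-rdev sysfs directory (something like /sys/block/md0/md/dev-sdb1/bad_blocks and .../unacknowledged_bad_blocks; the exact path depends on the array and member names). A minimal C sketch that marks one range bad by writing the same "sector length" format that badblocks_store() parses:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int mark_bad(const char *attr, unsigned long long sector, int len)
	{
		char buf[64];
		int fd = open(attr, O_WRONLY);

		if (fd < 0)
			return -1;
		snprintf(buf, sizeof(buf), "%llu %d\n", sector, len);
		if (write(fd, buf, strlen(buf)) < 0) {
			close(fd);
			return -1;
		}
		return close(fd);
	}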
|
|
|
static int md_notify_reboot(struct notifier_block *this,
|
|
|
unsigned long code, void *x)
|
|
|
{
|