dm-raid.c
  1. /*
  2. * Copyright (C) 2010-2011 Neil Brown
  3. * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved.
  4. *
  5. * This file is released under the GPL.
  6. */
  7. #include <linux/slab.h>
  8. #include <linux/module.h>
  9. #include "md.h"
  10. #include "raid1.h"
  11. #include "raid5.h"
  12. #include "bitmap.h"
  13. #include <linux/device-mapper.h>
  14. #define DM_MSG_PREFIX "raid"
  15. /*
  16. * The following flags are used by dm-raid.c to set up the array state.
  17. * They must be cleared before md_run is called.
  18. */
  19. #define FirstUse 10 /* rdev flag */
  20. struct raid_dev {
  21. /*
  22. * Two DM devices, one to hold metadata and one to hold the
  23. * actual data/parity. The reason for this is to not confuse
  24. * ti->len and give more flexibility in altering size and
  25. * characteristics.
  26. *
  27. * While it is possible for this device to be associated
  28. * with a different physical device than the data_dev, it
  29. * is intended for it to be the same.
  30. * |--------- Physical Device ---------|
  31. * |- meta_dev -|------ data_dev ------|
  32. */
  33. struct dm_dev *meta_dev;
  34. struct dm_dev *data_dev;
  35. struct md_rdev rdev;
  36. };
  37. /*
  38. * Flags for rs->print_flags field.
  39. */
  40. #define DMPF_SYNC 0x1
  41. #define DMPF_NOSYNC 0x2
  42. #define DMPF_REBUILD 0x4
  43. #define DMPF_DAEMON_SLEEP 0x8
  44. #define DMPF_MIN_RECOVERY_RATE 0x10
  45. #define DMPF_MAX_RECOVERY_RATE 0x20
  46. #define DMPF_MAX_WRITE_BEHIND 0x40
  47. #define DMPF_STRIPE_CACHE 0x80
  48. #define DMPF_REGION_SIZE 0x100
  49. struct raid_set {
  50. struct dm_target *ti;
  51. uint64_t print_flags;
  52. struct mddev md;
  53. struct raid_type *raid_type;
  54. struct dm_target_callbacks callbacks;
  55. struct raid_dev dev[0];
  56. };
  57. /* Supported raid types and properties. */
  58. static struct raid_type {
  59. const char *name; /* RAID algorithm. */
  60. const char *descr; /* Descriptor text for logging. */
  61. const unsigned parity_devs; /* # of parity devices. */
  62. const unsigned minimal_devs; /* minimal # of devices in set. */
  63. const unsigned level; /* RAID level. */
  64. const unsigned algorithm; /* RAID algorithm. */
  65. } raid_types[] = {
  66. {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */},
  67. {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
  68. {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
  69. {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
  70. {"raid5_ls", "RAID5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC},
  71. {"raid5_rs", "RAID5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC},
  72. {"raid6_zr", "RAID6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART},
  73. {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART},
  74. {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
  75. };
  76. static struct raid_type *get_raid_type(char *name)
  77. {
  78. int i;
  79. for (i = 0; i < ARRAY_SIZE(raid_types); i++)
  80. if (!strcmp(raid_types[i].name, name))
  81. return &raid_types[i];
  82. return NULL;
  83. }
  84. static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs)
  85. {
  86. unsigned i;
  87. struct raid_set *rs;
  88. sector_t sectors_per_dev;
  89. if (raid_devs <= raid_type->parity_devs) {
  90. ti->error = "Insufficient number of devices";
  91. return ERR_PTR(-EINVAL);
  92. }
  93. sectors_per_dev = ti->len;
  94. if ((raid_type->level > 1) &&
  95. sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
  96. ti->error = "Target length not divisible by number of data devices";
  97. return ERR_PTR(-EINVAL);
  98. }
  99. rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
  100. if (!rs) {
  101. ti->error = "Cannot allocate raid context";
  102. return ERR_PTR(-ENOMEM);
  103. }
  104. mddev_init(&rs->md);
  105. rs->ti = ti;
  106. rs->raid_type = raid_type;
  107. rs->md.raid_disks = raid_devs;
  108. rs->md.level = raid_type->level;
  109. rs->md.new_level = rs->md.level;
  110. rs->md.dev_sectors = sectors_per_dev;
  111. rs->md.layout = raid_type->algorithm;
  112. rs->md.new_layout = rs->md.layout;
  113. rs->md.delta_disks = 0;
  114. rs->md.recovery_cp = 0;
  115. for (i = 0; i < raid_devs; i++)
  116. md_rdev_init(&rs->dev[i].rdev);
  117. /*
  118. * Remaining items to be initialized by further RAID params:
  119. * rs->md.persistent
  120. * rs->md.external
  121. * rs->md.chunk_sectors
  122. * rs->md.new_chunk_sectors
  123. */
  124. return rs;
  125. }
  126. static void context_free(struct raid_set *rs)
  127. {
  128. int i;
  129. for (i = 0; i < rs->md.raid_disks; i++) {
  130. if (rs->dev[i].meta_dev)
  131. dm_put_device(rs->ti, rs->dev[i].meta_dev);
  132. if (rs->dev[i].rdev.sb_page)
  133. put_page(rs->dev[i].rdev.sb_page);
  134. rs->dev[i].rdev.sb_page = NULL;
  135. rs->dev[i].rdev.sb_loaded = 0;
  136. if (rs->dev[i].data_dev)
  137. dm_put_device(rs->ti, rs->dev[i].data_dev);
  138. }
  139. kfree(rs);
  140. }
  141. /*
  142. * For every device we have two words
  143. * <meta_dev>: meta device name or '-' if missing
  144. * <data_dev>: data device name or '-' if missing
  145. *
  146. * The following are permitted:
  147. * - -
  148. * - <data_dev>
  149. * <meta_dev> <data_dev>
  150. *
  151. * The following is not allowed:
  152. * <meta_dev> -
  153. *
  154. * This code parses those words. If there is a failure,
  155. * the caller must use context_free to unwind the operations.
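*
* Illustrative example (device numbers are hypothetical): a two-device
* RAID1 whose first leg has a metadata device and whose second does not
* would supply the words
*   254:1 254:2 - 254:4
* i.e. "<meta_dev> <data_dev>" for the first device and "- <data_dev>"
* for the second.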
  156. */
  157. static int dev_parms(struct raid_set *rs, char **argv)
  158. {
  159. int i;
  160. int rebuild = 0;
  161. int metadata_available = 0;
  162. int ret = 0;
  163. for (i = 0; i < rs->md.raid_disks; i++, argv += 2) {
  164. rs->dev[i].rdev.raid_disk = i;
  165. rs->dev[i].meta_dev = NULL;
  166. rs->dev[i].data_dev = NULL;
  167. /*
  168. * There are no offsets, since there is a separate device
  169. * for data and metadata.
  170. */
  171. rs->dev[i].rdev.data_offset = 0;
  172. rs->dev[i].rdev.mddev = &rs->md;
  173. if (strcmp(argv[0], "-")) {
  174. ret = dm_get_device(rs->ti, argv[0],
  175. dm_table_get_mode(rs->ti->table),
  176. &rs->dev[i].meta_dev);
  177. rs->ti->error = "RAID metadata device lookup failure";
  178. if (ret)
  179. return ret;
  180. rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
  181. if (!rs->dev[i].rdev.sb_page)
  182. return -ENOMEM;
  183. }
  184. if (!strcmp(argv[1], "-")) {
  185. if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
  186. (!rs->dev[i].rdev.recovery_offset)) {
  187. rs->ti->error = "Drive designated for rebuild not specified";
  188. return -EINVAL;
  189. }
  190. rs->ti->error = "No data device supplied with metadata device";
  191. if (rs->dev[i].meta_dev)
  192. return -EINVAL;
  193. continue;
  194. }
  195. ret = dm_get_device(rs->ti, argv[1],
  196. dm_table_get_mode(rs->ti->table),
  197. &rs->dev[i].data_dev);
  198. if (ret) {
  199. rs->ti->error = "RAID device lookup failure";
  200. return ret;
  201. }
  202. if (rs->dev[i].meta_dev) {
  203. metadata_available = 1;
  204. rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
  205. }
  206. rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
  207. list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
  208. if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
  209. rebuild++;
  210. }
  211. if (metadata_available) {
  212. rs->md.external = 0;
  213. rs->md.persistent = 1;
  214. rs->md.major_version = 2;
  215. } else if (rebuild && !rs->md.recovery_cp) {
  216. /*
  217. * Without metadata, we will not be able to tell if the array
  218. * is in-sync or not - we must assume it is not. Therefore,
  219. * it is impossible to rebuild a drive.
  220. *
  221. * Even if there is metadata, the on-disk information may
  222. * indicate that the array is not in-sync and it will then
  223. * fail at that time.
  224. *
  225. * User could specify 'nosync' option if desperate.
  226. */
  227. DMERR("Unable to rebuild drive while array is not in-sync");
  228. rs->ti->error = "RAID device lookup failure";
  229. return -EINVAL;
  230. }
  231. return 0;
  232. }
  233. /*
  234. * validate_region_size
  235. * @rs
  236. * @region_size: region size in sectors. If 0, pick a size (4MiB default).
  237. *
  238. * Set rs->md.bitmap_info.chunksize (which really refers to 'region size').
  239. * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap.
  240. *
  241. * Returns: 0 on success, -EINVAL on failure.
  242. */
  243. static int validate_region_size(struct raid_set *rs, unsigned long region_size)
  244. {
  245. unsigned long min_region_size = rs->ti->len / (1 << 21);
  246. if (!region_size) {
  247. /*
  248. * Choose a reasonable default. All figures in sectors.
  249. */
  250. if (min_region_size > (1 << 13)) {
  251. region_size = min_region_size;
  252. DMINFO("Choosing default region size of %lu sectors",
  253. region_size);
  254. } else {
  255. DMINFO("Choosing default region size of 4MiB");
  256. region_size = 1 << 13; /* sectors */
  257. }
  258. } else {
  259. /*
  260. * Validate user-supplied value.
  261. */
  262. if (region_size > rs->ti->len) {
  263. rs->ti->error = "Supplied region size is too large";
  264. return -EINVAL;
  265. }
  266. if (region_size < min_region_size) {
  267. DMERR("Supplied region_size (%lu sectors) below minimum (%lu)",
  268. region_size, min_region_size);
  269. rs->ti->error = "Supplied region size is too small";
  270. return -EINVAL;
  271. }
  272. if (!is_power_of_2(region_size)) {
  273. rs->ti->error = "Region size is not a power of 2";
  274. return -EINVAL;
  275. }
  276. if (region_size < rs->md.chunk_sectors) {
  277. rs->ti->error = "Region size is smaller than the chunk size";
  278. return -EINVAL;
  279. }
  280. }
  281. /*
  282. * Convert sectors to bytes.
  283. */
  284. rs->md.bitmap_info.chunksize = (region_size << 9);
  285. return 0;
  286. }
  287. /*
  288. * Possible arguments are...
  289. * <chunk_size> [optional_args]
  290. *
  291. * Argument definitions
  292. * <chunk_size> The number of sectors per disk that
  293. * will form the "stripe"
  294. * [[no]sync] Force or prevent recovery of the
  295. * entire array
  296. * [rebuild <idx>] Rebuild the drive indicated by the index
  297. * [daemon_sleep <ms>] Time between bitmap daemon work to
  298. * clear bits
  299. * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization
  300. * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization
  301. * [write_mostly <idx>] Indicate a write mostly drive via index
  302. * [max_write_behind <sectors>] See '--write-behind=' (man mdadm)
  303. * [stripe_cache <sectors>] Stripe cache size for higher RAIDs
  304. * [region_size <sectors>] Defines granularity of bitmap
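*
* Illustrative example (figures are hypothetical): a raid5 set with a
* 64KiB chunk (128 sectors) and resync throttled between 1MiB/s and
* 10MiB/s per disk would pass the five words
*   128 min_recovery_rate 1024 max_recovery_rate 10240
* giving <#raid_params> = 5.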
  305. */
  306. static int parse_raid_params(struct raid_set *rs, char **argv,
  307. unsigned num_raid_params)
  308. {
  309. unsigned i, rebuild_cnt = 0;
  310. unsigned long value, region_size = 0;
  311. char *key;
  312. /*
  313. * First, parse the in-order required arguments
  314. * "chunk_size" is the only argument of this type.
  315. */
  316. if ((strict_strtoul(argv[0], 10, &value) < 0)) {
  317. rs->ti->error = "Bad chunk size";
  318. return -EINVAL;
  319. } else if (rs->raid_type->level == 1) {
  320. if (value)
  321. DMERR("Ignoring chunk size parameter for RAID 1");
  322. value = 0;
  323. } else if (!is_power_of_2(value)) {
  324. rs->ti->error = "Chunk size must be a power of 2";
  325. return -EINVAL;
  326. } else if (value < 8) {
  327. rs->ti->error = "Chunk size value is too small";
  328. return -EINVAL;
  329. }
  330. rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
  331. argv++;
  332. num_raid_params--;
  333. /*
  334. * We set each individual device as In_sync with a completed
  335. * 'recovery_offset'. If there has been a device failure or
  336. * replacement then one of the following cases applies:
  337. *
  338. * 1) User specifies 'rebuild'.
  339. * - Device is reset when param is read.
  340. * 2) A new device is supplied.
  341. * - No matching superblock found, resets device.
  342. * 3) Device failure was transient and returns on reload.
  343. * - Failure noticed, resets device for bitmap replay.
  344. * 4) Device hadn't completed recovery after previous failure.
  345. * - Superblock is read and overrides recovery_offset.
  346. *
  347. * What is found in the superblocks of the devices is always
  348. * authoritative, unless 'rebuild' or '[no]sync' was specified.
  349. */
  350. for (i = 0; i < rs->md.raid_disks; i++) {
  351. set_bit(In_sync, &rs->dev[i].rdev.flags);
  352. rs->dev[i].rdev.recovery_offset = MaxSector;
  353. }
  354. /*
  355. * Second, parse the unordered optional arguments
  356. */
  357. for (i = 0; i < num_raid_params; i++) {
  358. if (!strcasecmp(argv[i], "nosync")) {
  359. rs->md.recovery_cp = MaxSector;
  360. rs->print_flags |= DMPF_NOSYNC;
  361. continue;
  362. }
  363. if (!strcasecmp(argv[i], "sync")) {
  364. rs->md.recovery_cp = 0;
  365. rs->print_flags |= DMPF_SYNC;
  366. continue;
  367. }
  368. /* The rest of the optional arguments come in key/value pairs */
  369. if ((i + 1) >= num_raid_params) {
  370. rs->ti->error = "Wrong number of raid parameters given";
  371. return -EINVAL;
  372. }
  373. key = argv[i++];
  374. if (strict_strtoul(argv[i], 10, &value) < 0) {
  375. rs->ti->error = "Bad numerical argument given in raid params";
  376. return -EINVAL;
  377. }
  378. if (!strcasecmp(key, "rebuild")) {
  379. rebuild_cnt++;
  380. if (((rs->raid_type->level != 1) &&
  381. (rebuild_cnt > rs->raid_type->parity_devs)) ||
  382. ((rs->raid_type->level == 1) &&
  383. (rebuild_cnt > (rs->md.raid_disks - 1)))) {
  384. rs->ti->error = "Too many rebuild devices specified for given RAID type";
  385. return -EINVAL;
  386. }
  387. if (value >= rs->md.raid_disks) {
  388. rs->ti->error = "Invalid rebuild index given";
  389. return -EINVAL;
  390. }
  391. clear_bit(In_sync, &rs->dev[value].rdev.flags);
  392. rs->dev[value].rdev.recovery_offset = 0;
  393. rs->print_flags |= DMPF_REBUILD;
  394. } else if (!strcasecmp(key, "write_mostly")) {
  395. if (rs->raid_type->level != 1) {
  396. rs->ti->error = "write_mostly option is only valid for RAID1";
  397. return -EINVAL;
  398. }
  399. if (value >= rs->md.raid_disks) {
  400. rs->ti->error = "Invalid write_mostly drive index given";
  401. return -EINVAL;
  402. }
  403. set_bit(WriteMostly, &rs->dev[value].rdev.flags);
  404. } else if (!strcasecmp(key, "max_write_behind")) {
  405. if (rs->raid_type->level != 1) {
  406. rs->ti->error = "max_write_behind option is only valid for RAID1";
  407. return -EINVAL;
  408. }
  409. rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
  410. /*
  411. * In device-mapper, we specify things in sectors, but
  412. * MD records this value in kB
  413. */
  414. value /= 2;
  415. if (value > COUNTER_MAX) {
  416. rs->ti->error = "Max write-behind limit out of range";
  417. return -EINVAL;
  418. }
  419. rs->md.bitmap_info.max_write_behind = value;
  420. } else if (!strcasecmp(key, "daemon_sleep")) {
  421. rs->print_flags |= DMPF_DAEMON_SLEEP;
  422. if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
  423. rs->ti->error = "daemon sleep period out of range";
  424. return -EINVAL;
  425. }
  426. rs->md.bitmap_info.daemon_sleep = value;
  427. } else if (!strcasecmp(key, "stripe_cache")) {
  428. rs->print_flags |= DMPF_STRIPE_CACHE;
  429. /*
  430. * In device-mapper, we specify things in sectors, but
  431. * MD records this value in kB
  432. */
  433. value /= 2;
  434. if (rs->raid_type->level < 5) {
  435. rs->ti->error = "Inappropriate argument: stripe_cache";
  436. return -EINVAL;
  437. }
  438. if (raid5_set_cache_size(&rs->md, (int)value)) {
  439. rs->ti->error = "Bad stripe_cache size";
  440. return -EINVAL;
  441. }
  442. } else if (!strcasecmp(key, "min_recovery_rate")) {
  443. rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
  444. if (value > INT_MAX) {
  445. rs->ti->error = "min_recovery_rate out of range";
  446. return -EINVAL;
  447. }
  448. rs->md.sync_speed_min = (int)value;
  449. } else if (!strcasecmp(key, "max_recovery_rate")) {
  450. rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
  451. if (value > INT_MAX) {
  452. rs->ti->error = "max_recovery_rate out of range";
  453. return -EINVAL;
  454. }
  455. rs->md.sync_speed_max = (int)value;
  456. } else if (!strcasecmp(key, "region_size")) {
  457. rs->print_flags |= DMPF_REGION_SIZE;
  458. region_size = value;
  459. } else {
  460. DMERR("Unable to parse RAID parameter: %s", key);
  461. rs->ti->error = "Unable to parse RAID parameters";
  462. return -EINVAL;
  463. }
  464. }
  465. if (validate_region_size(rs, region_size))
  466. return -EINVAL;
  467. if (rs->md.chunk_sectors)
  468. rs->ti->split_io = rs->md.chunk_sectors;
  469. else
  470. rs->ti->split_io = region_size;
  475. /* Assume there are no metadata devices until the drives are parsed */
  476. rs->md.persistent = 0;
  477. rs->md.external = 1;
  478. return 0;
  479. }
  480. static void do_table_event(struct work_struct *ws)
  481. {
  482. struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
  483. dm_table_event(rs->ti->table);
  484. }
  485. static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
  486. {
  487. struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
  488. if (rs->raid_type->level == 1)
  489. return md_raid1_congested(&rs->md, bits);
  490. return md_raid5_congested(&rs->md, bits);
  491. }
  492. /*
  493. * This structure is never routinely used by userspace, unlike md superblocks.
  494. * Devices with this superblock should only ever be accessed via device-mapper.
  495. */
  496. #define DM_RAID_MAGIC 0x64526D44
  497. struct dm_raid_superblock {
  498. __le32 magic; /* "DmRd" */
  499. __le32 features; /* Used to indicate possible future changes */
  500. __le32 num_devices; /* Number of devices in this array. (Max 64) */
  501. __le32 array_position; /* The position of this drive in the array */
  502. __le64 events; /* Incremented by md when superblock updated */
  503. __le64 failed_devices; /* Bit field of devices to indicate failures */
  504. /*
  505. * This offset tracks the progress of the repair or replacement of
  506. * an individual drive.
  507. */
  508. __le64 disk_recovery_offset;
  509. /*
  510. * This offset tracks the progress of the initial array
  511. * synchronisation/parity calculation.
  512. */
  513. __le64 array_resync_offset;
  514. /*
  515. * RAID characteristics
  516. */
  517. __le32 level;
  518. __le32 layout;
  519. __le32 stripe_sectors;
  520. __u8 pad[452]; /* Round struct to 512 bytes. */
  521. /* Always set to 0 when writing. */
  522. } __packed;
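/*
 * Informational: the fields above total 60 bytes (7 x __le32 plus
 * 4 x __le64), so pad[452] rounds the on-disk superblock up to exactly
 * one 512-byte sector.
 */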
  523. static int read_disk_sb(struct md_rdev *rdev, int size)
  524. {
  525. BUG_ON(!rdev->sb_page);
  526. if (rdev->sb_loaded)
  527. return 0;
  528. if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
  529. DMERR("Failed to read device superblock");
  530. return -EINVAL;
  531. }
  532. rdev->sb_loaded = 1;
  533. return 0;
  534. }
  535. static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
  536. {
  537. struct md_rdev *r, *t;
  538. uint64_t failed_devices;
  539. struct dm_raid_superblock *sb;
  540. sb = page_address(rdev->sb_page);
  541. failed_devices = le64_to_cpu(sb->failed_devices);
  542. rdev_for_each(r, t, mddev)
  543. if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
  544. failed_devices |= (1ULL << r->raid_disk);
  545. memset(sb, 0, sizeof(*sb));
  546. sb->magic = cpu_to_le32(DM_RAID_MAGIC);
  547. sb->features = cpu_to_le32(0); /* No features yet */
  548. sb->num_devices = cpu_to_le32(mddev->raid_disks);
  549. sb->array_position = cpu_to_le32(rdev->raid_disk);
  550. sb->events = cpu_to_le64(mddev->events);
  551. sb->failed_devices = cpu_to_le64(failed_devices);
  552. sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
  553. sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);
  554. sb->level = cpu_to_le32(mddev->level);
  555. sb->layout = cpu_to_le32(mddev->layout);
  556. sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
  557. }
  558. /*
  559. * super_load
  560. *
  561. * This function creates a superblock if one is not found on the device
  562. * and will decide which superblock to use if there's a choice.
  563. *
  564. * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
  565. */
  566. static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
  567. {
  568. int ret;
  569. struct dm_raid_superblock *sb;
  570. struct dm_raid_superblock *refsb;
  571. uint64_t events_sb, events_refsb;
  572. rdev->sb_start = 0;
  573. rdev->sb_size = sizeof(*sb);
  574. ret = read_disk_sb(rdev, rdev->sb_size);
  575. if (ret)
  576. return ret;
  577. sb = page_address(rdev->sb_page);
  578. if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) {
  579. super_sync(rdev->mddev, rdev);
  580. set_bit(FirstUse, &rdev->flags);
  581. /* Force writing of superblocks to disk */
  582. set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
  583. /* Any superblock is better than none, choose that if given */
  584. return refdev ? 0 : 1;
  585. }
  586. if (!refdev)
  587. return 1;
  588. events_sb = le64_to_cpu(sb->events);
  589. refsb = page_address(refdev->sb_page);
  590. events_refsb = le64_to_cpu(refsb->events);
  591. return (events_sb > events_refsb) ? 1 : 0;
  592. }
  593. static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
  594. {
  595. int role;
  596. struct raid_set *rs = container_of(mddev, struct raid_set, md);
  597. uint64_t events_sb;
  598. uint64_t failed_devices;
  599. struct dm_raid_superblock *sb;
  600. uint32_t new_devs = 0;
  601. uint32_t rebuilds = 0;
  602. struct md_rdev *r, *t;
  603. struct dm_raid_superblock *sb2;
  604. sb = page_address(rdev->sb_page);
  605. events_sb = le64_to_cpu(sb->events);
  606. failed_devices = le64_to_cpu(sb->failed_devices);
  607. /*
  608. * Initialise to 1 if this is a new superblock.
  609. */
  610. mddev->events = events_sb ? : 1;
  611. /*
  612. * Reshaping is not currently allowed
  613. */
  614. if ((le32_to_cpu(sb->level) != mddev->level) ||
  615. (le32_to_cpu(sb->layout) != mddev->layout) ||
  616. (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
  617. DMERR("Reshaping arrays not yet supported.");
  618. return -EINVAL;
  619. }
  620. /* We can only change the number of devices in RAID1 right now */
  621. if ((rs->raid_type->level != 1) &&
  622. (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
  623. DMERR("Reshaping arrays not yet supported.");
  624. return -EINVAL;
  625. }
  626. if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)))
  627. mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
  628. /*
  629. * During load, we set FirstUse if a new superblock was written.
  630. * There are two reasons we might not have a superblock:
  631. * 1) The array is brand new - in which case, all of the
  632. * devices must have their In_sync bit set. Also,
  633. * recovery_cp must be 0, unless forced.
  634. * 2) This is a new device being added to an old array
  635. * and the new device needs to be rebuilt - in which
  636. * case the In_sync bit will /not/ be set and
  637. * recovery_cp must be MaxSector.
  638. */
  639. rdev_for_each(r, t, mddev) {
  640. if (!test_bit(In_sync, &r->flags)) {
  641. if (!test_bit(FirstUse, &r->flags))
  642. DMERR("Superblock area of "
  643. "rebuild device %d should have been "
  644. "cleared.", r->raid_disk);
  645. set_bit(FirstUse, &r->flags);
  646. rebuilds++;
  647. } else if (test_bit(FirstUse, &r->flags))
  648. new_devs++;
  649. }
  650. if (!rebuilds) {
  651. if (new_devs == mddev->raid_disks) {
  652. DMINFO("Superblocks created for new array");
  653. set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
  654. } else if (new_devs) {
  655. DMERR("New device injected "
  656. "into existing array without 'rebuild' "
  657. "parameter specified");
  658. return -EINVAL;
  659. }
  660. } else if (new_devs) {
  661. DMERR("'rebuild' devices cannot be "
  662. "injected into an array with other first-time devices");
  663. return -EINVAL;
  664. } else if (mddev->recovery_cp != MaxSector) {
  665. DMERR("'rebuild' specified while array is not in-sync");
  666. return -EINVAL;
  667. }
  668. /*
  669. * Now we set the Faulty bit for those devices that are
  670. * recorded in the superblock as failed.
  671. */
  672. rdev_for_each(r, t, mddev) {
  673. if (!r->sb_page)
  674. continue;
  675. sb2 = page_address(r->sb_page);
  676. sb2->failed_devices = 0;
  677. /*
  678. * Check for any device re-ordering.
  679. */
  680. if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) {
  681. role = le32_to_cpu(sb2->array_position);
  682. if (role != r->raid_disk) {
  683. if (rs->raid_type->level != 1) {
  684. rs->ti->error = "Cannot change device "
  685. "positions in RAID array";
  686. return -EINVAL;
  687. }
  688. DMINFO("RAID1 device #%d now at position #%d",
  689. role, r->raid_disk);
  690. }
  691. /*
  692. * Partial recovery is performed on
  693. * returning failed devices.
  694. */
  695. if (failed_devices & (1ULL << role))
  696. set_bit(Faulty, &r->flags);
  697. }
  698. }
  699. return 0;
  700. }
  701. static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
  702. {
  703. struct dm_raid_superblock *sb = page_address(rdev->sb_page);
  704. /*
  705. * If mddev->events is not set, we know we have not yet initialized
  706. * the array.
  707. */
  708. if (!mddev->events && super_init_validation(mddev, rdev))
  709. return -EINVAL;
  710. mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
  711. rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
  712. if (!test_bit(FirstUse, &rdev->flags)) {
  713. rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
  714. if (rdev->recovery_offset != MaxSector)
  715. clear_bit(In_sync, &rdev->flags);
  716. }
  717. /*
  718. * If a device comes back, set it as not In_sync and no longer faulty.
  719. */
  720. if (test_bit(Faulty, &rdev->flags)) {
  721. clear_bit(Faulty, &rdev->flags);
  722. clear_bit(In_sync, &rdev->flags);
  723. rdev->saved_raid_disk = rdev->raid_disk;
  724. rdev->recovery_offset = 0;
  725. }
  726. clear_bit(FirstUse, &rdev->flags);
  727. return 0;
  728. }
  729. /*
  730. * Analyse superblocks and select the freshest.
  731. */
  732. static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
  733. {
  734. int ret;
  735. struct md_rdev *rdev, *freshest, *tmp;
  736. struct mddev *mddev = &rs->md;
  737. freshest = NULL;
  738. rdev_for_each(rdev, tmp, mddev) {
  739. if (!rdev->meta_bdev)
  740. continue;
  741. ret = super_load(rdev, freshest);
  742. switch (ret) {
  743. case 1:
  744. freshest = rdev;
  745. break;
  746. case 0:
  747. break;
  748. default:
  749. ti->error = "Failed to load superblock";
  750. return ret;
  751. }
  752. }
  753. if (!freshest)
  754. return 0;
  755. /*
  756. * Validation of the freshest device provides the source of
  757. * validation for the remaining devices.
  758. */
  759. ti->error = "Unable to assemble array: Invalid superblocks";
  760. if (super_validate(mddev, freshest))
  761. return -EINVAL;
  762. rdev_for_each(rdev, tmp, mddev)
  763. if ((rdev != freshest) && super_validate(mddev, rdev))
  764. return -EINVAL;
  765. return 0;
  766. }
  767. /*
  768. * Construct a RAID4/5/6 mapping:
  769. * Args:
  770. * <raid_type> <#raid_params> <raid_params> \
  771. * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
  772. *
  773. * <raid_params> varies by <raid_type>. See 'parse_raid_params' for
  774. * details on possible <raid_params>.
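*
* Illustrative table line (device numbers are hypothetical): a 5-device
* RAID4 with a 1MiB (2048-sector) chunk and no metadata devices might be
* created with
*   dmsetup create my_raid --table \
*     "0 1960893648 raid raid4 1 2048 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81"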
  775. */
  776. static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
  777. {
  778. int ret;
  779. struct raid_type *rt;
  780. unsigned long num_raid_params, num_raid_devs;
  781. struct raid_set *rs = NULL;
  782. /* Must have at least <raid_type> <#raid_params> */
  783. if (argc < 2) {
  784. ti->error = "Too few arguments";
  785. return -EINVAL;
  786. }
  787. /* raid type */
  788. rt = get_raid_type(argv[0]);
  789. if (!rt) {
  790. ti->error = "Unrecognised raid_type";
  791. return -EINVAL;
  792. }
  793. argc--;
  794. argv++;
  795. /* number of RAID parameters */
  796. if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) {
  797. ti->error = "Cannot understand number of RAID parameters";
  798. return -EINVAL;
  799. }
  800. argc--;
  801. argv++;
  802. /* Skip over RAID params for now and find out # of devices */
  803. if (num_raid_params + 1 > argc) {
  804. ti->error = "Arguments do not agree with counts given";
  805. return -EINVAL;
  806. }
  807. if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
  808. (num_raid_devs >= INT_MAX)) {
  809. ti->error = "Cannot understand number of raid devices";
  810. return -EINVAL;
  811. }
  812. rs = context_alloc(ti, rt, (unsigned)num_raid_devs);
  813. if (IS_ERR(rs))
  814. return PTR_ERR(rs);
  815. ret = parse_raid_params(rs, argv, (unsigned)num_raid_params);
  816. if (ret)
  817. goto bad;
  818. ret = -EINVAL;
  819. argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
  820. argv += num_raid_params + 1;
  821. if (argc != (num_raid_devs * 2)) {
  822. ti->error = "Supplied RAID devices does not match the count given";
  823. goto bad;
  824. }
  825. ret = dev_parms(rs, argv);
  826. if (ret)
  827. goto bad;
  828. rs->md.sync_super = super_sync;
  829. ret = analyse_superblocks(ti, rs);
  830. if (ret)
  831. goto bad;
  832. INIT_WORK(&rs->md.event_work, do_table_event);
  833. ti->private = rs;
  834. mutex_lock(&rs->md.reconfig_mutex);
  835. ret = md_run(&rs->md);
  836. rs->md.in_sync = 0; /* Assume already marked dirty */
  837. mutex_unlock(&rs->md.reconfig_mutex);
  838. if (ret) {
  839. ti->error = "Fail to run raid array";
  840. goto bad;
  841. }
  842. rs->callbacks.congested_fn = raid_is_congested;
  843. dm_table_add_target_callbacks(ti->table, &rs->callbacks);
  844. mddev_suspend(&rs->md);
  845. return 0;
  846. bad:
  847. context_free(rs);
  848. return ret;
  849. }
  850. static void raid_dtr(struct dm_target *ti)
  851. {
  852. struct raid_set *rs = ti->private;
  853. list_del_init(&rs->callbacks.list);
  854. md_stop(&rs->md);
  855. context_free(rs);
  856. }
  857. static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context)
  858. {
  859. struct raid_set *rs = ti->private;
  860. struct mddev *mddev = &rs->md;
  861. mddev->pers->make_request(mddev, bio);
  862. return DM_MAPIO_SUBMITTED;
  863. }
  864. static int raid_status(struct dm_target *ti, status_type_t type,
  865. char *result, unsigned maxlen)
  866. {
  867. struct raid_set *rs = ti->private;
  868. unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
  869. unsigned sz = 0;
  870. int i, array_in_sync = 0;
  871. sector_t sync;
  872. switch (type) {
  873. case STATUSTYPE_INFO:
  874. DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
  875. if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
  876. sync = rs->md.curr_resync_completed;
  877. else
  878. sync = rs->md.recovery_cp;
  879. if (sync >= rs->md.resync_max_sectors) {
  880. array_in_sync = 1;
  881. sync = rs->md.resync_max_sectors;
  882. } else {
  883. /*
  884. * The array may be doing an initial sync, or it may
  885. * be rebuilding individual components. If all the
  886. * devices are In_sync, then it is the array that is
  887. * being initialized.
  888. */
  889. for (i = 0; i < rs->md.raid_disks; i++)
  890. if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
  891. array_in_sync = 1;
  892. }
  893. /*
  894. * Status characters:
  895. * 'D' = Dead/Failed device
  896. * 'a' = Alive but not in-sync
  897. * 'A' = Alive and in-sync
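*
* Illustrative output (figures are hypothetical): a healthy 5-device
* raid5 part-way through its initial resync would report
*   raid5_ls 5 aaaaa 10240/204800
* and, once fully synchronised,
*   raid5_ls 5 AAAAA 204800/204800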
  898. */
  899. for (i = 0; i < rs->md.raid_disks; i++) {
  900. if (test_bit(Faulty, &rs->dev[i].rdev.flags))
  901. DMEMIT("D");
  902. else if (!array_in_sync ||
  903. !test_bit(In_sync, &rs->dev[i].rdev.flags))
  904. DMEMIT("a");
  905. else
  906. DMEMIT("A");
  907. }
  908. /*
  909. * In-sync ratio:
  910. * The in-sync ratio shows the progress of:
  911. * - Initializing the array
  912. * - Rebuilding a subset of devices of the array
  913. * The user can distinguish between the two by referring
  914. * to the status characters.
  915. */
  916. DMEMIT(" %llu/%llu",
  917. (unsigned long long) sync,
  918. (unsigned long long) rs->md.resync_max_sectors);
  919. break;
  920. case STATUSTYPE_TABLE:
  921. /* The string you would use to construct this array */
  922. for (i = 0; i < rs->md.raid_disks; i++) {
  923. if ((rs->print_flags & DMPF_REBUILD) &&
  924. rs->dev[i].data_dev &&
  925. !test_bit(In_sync, &rs->dev[i].rdev.flags))
  926. raid_param_cnt += 2; /* for rebuilds */
  927. if (rs->dev[i].data_dev &&
  928. test_bit(WriteMostly, &rs->dev[i].rdev.flags))
  929. raid_param_cnt += 2;
  930. }
  931. raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2);
  932. if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
  933. raid_param_cnt--;
  934. DMEMIT("%s %u %u", rs->raid_type->name,
  935. raid_param_cnt, rs->md.chunk_sectors);
  936. if ((rs->print_flags & DMPF_SYNC) &&
  937. (rs->md.recovery_cp == MaxSector))
  938. DMEMIT(" sync");
  939. if (rs->print_flags & DMPF_NOSYNC)
  940. DMEMIT(" nosync");
  941. for (i = 0; i < rs->md.raid_disks; i++)
  942. if ((rs->print_flags & DMPF_REBUILD) &&
  943. rs->dev[i].data_dev &&
  944. !test_bit(In_sync, &rs->dev[i].rdev.flags))
  945. DMEMIT(" rebuild %u", i);
  946. if (rs->print_flags & DMPF_DAEMON_SLEEP)
  947. DMEMIT(" daemon_sleep %lu",
  948. rs->md.bitmap_info.daemon_sleep);
  949. if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
  950. DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);
  951. if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
  952. DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
  953. for (i = 0; i < rs->md.raid_disks; i++)
  954. if (rs->dev[i].data_dev &&
  955. test_bit(WriteMostly, &rs->dev[i].rdev.flags))
  956. DMEMIT(" write_mostly %u", i);
  957. if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
  958. DMEMIT(" max_write_behind %lu",
  959. rs->md.bitmap_info.max_write_behind);
  960. if (rs->print_flags & DMPF_STRIPE_CACHE) {
  961. struct r5conf *conf = rs->md.private;
  962. /* convert from kiB to sectors */
  963. DMEMIT(" stripe_cache %d",
  964. conf ? conf->max_nr_stripes * 2 : 0);
  965. }
  966. if (rs->print_flags & DMPF_REGION_SIZE)
  967. DMEMIT(" region_size %lu",
  968. rs->md.bitmap_info.chunksize >> 9);
  969. DMEMIT(" %d", rs->md.raid_disks);
  970. for (i = 0; i < rs->md.raid_disks; i++) {
  971. if (rs->dev[i].meta_dev)
  972. DMEMIT(" %s", rs->dev[i].meta_dev->name);
  973. else
  974. DMEMIT(" -");
  975. if (rs->dev[i].data_dev)
  976. DMEMIT(" %s", rs->dev[i].data_dev->name);
  977. else
  978. DMEMIT(" -");
  979. }
  980. }
  981. return 0;
  982. }
  983. static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
  984. {
  985. struct raid_set *rs = ti->private;
  986. unsigned i;
  987. int ret = 0;
  988. for (i = 0; !ret && i < rs->md.raid_disks; i++)
  989. if (rs->dev[i].data_dev)
  990. ret = fn(ti,
  991. rs->dev[i].data_dev,
  992. 0, /* No offset on data devs */
  993. rs->md.dev_sectors,
  994. data);
  995. return ret;
  996. }
  997. static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
  998. {
  999. struct raid_set *rs = ti->private;
  1000. unsigned chunk_size = rs->md.chunk_sectors << 9;
  1001. struct r5conf *conf = rs->md.private;
  1002. blk_limits_io_min(limits, chunk_size);
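/*
 * Optimal I/O is one full stripe: the chunk size times the number of
 * data (non-parity) disks. For example (hypothetical figures), 64KiB
 * chunks on a 6-drive raid6 yield a 256KiB io_opt.
 */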
  1003. blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded));
  1004. }
  1005. static void raid_presuspend(struct dm_target *ti)
  1006. {
  1007. struct raid_set *rs = ti->private;
  1008. md_stop_writes(&rs->md);
  1009. }
  1010. static void raid_postsuspend(struct dm_target *ti)
  1011. {
  1012. struct raid_set *rs = ti->private;
  1013. mddev_suspend(&rs->md);
  1014. }
  1015. static void raid_resume(struct dm_target *ti)
  1016. {
  1017. struct raid_set *rs = ti->private;
  1018. bitmap_load(&rs->md);
  1019. mddev_resume(&rs->md);
  1020. }
  1021. static struct target_type raid_target = {
  1022. .name = "raid",
  1023. .version = {1, 1, 0},
  1024. .module = THIS_MODULE,
  1025. .ctr = raid_ctr,
  1026. .dtr = raid_dtr,
  1027. .map = raid_map,
  1028. .status = raid_status,
  1029. .iterate_devices = raid_iterate_devices,
  1030. .io_hints = raid_io_hints,
  1031. .presuspend = raid_presuspend,
  1032. .postsuspend = raid_postsuspend,
  1033. .resume = raid_resume,
  1034. };
  1035. static int __init dm_raid_init(void)
  1036. {
  1037. return dm_register_target(&raid_target);
  1038. }
  1039. static void __exit dm_raid_exit(void)
  1040. {
  1041. dm_unregister_target(&raid_target);
  1042. }
  1043. module_init(dm_raid_init);
  1044. module_exit(dm_raid_exit);
  1045. MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
  1046. MODULE_ALIAS("dm-raid4");
  1047. MODULE_ALIAS("dm-raid5");
  1048. MODULE_ALIAS("dm-raid6");
  1049. MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>");
  1050. MODULE_LICENSE("GPL");