/*
 * Copyright (C) 2010-2012 by Dell Inc. All rights reserved.
 * Copyright (C) 2011-2013 Red Hat, Inc.
 *
 * This file is released under the GPL.
 *
 * dm-switch is a device-mapper target that maps IO to underlying block
 * devices efficiently when there are a large number of fixed-sized
 * address regions but there is no simple pattern to allow for a compact
 * mapping representation such as dm-stripe.
 */

#include <linux/device-mapper.h>

#include <linux/module.h>
#include <linux/init.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "switch"

/*
 * One region_table_slot_t holds <region_entries_per_slot> region table
 * entries, each of which is <region_table_entry_bits> bits in size.
 */
typedef unsigned long region_table_slot_t;

/*
 * A device with the offset to its start sector.
 */
struct switch_path {
	struct dm_dev *dmdev;
	sector_t start;
};

/*
 * Context block for a dm switch device.
 */
struct switch_ctx {
	struct dm_target *ti;

	unsigned nr_paths;		/* Number of paths in path_list. */

	unsigned region_size;		/* Region size in 512-byte sectors */
	unsigned long nr_regions;	/* Number of regions making up the device */
	signed char region_size_bits;	/* log2 of region_size or -1 */

	unsigned char region_table_entry_bits;		/* Number of bits in one region table entry */
	unsigned char region_entries_per_slot;		/* Number of entries in one region table slot */
	signed char region_entries_per_slot_bits;	/* log2 of region_entries_per_slot or -1 */

	region_table_slot_t *region_table;	/* Region table */

	/*
	 * Array of dm devices to switch between.
	 */
	struct switch_path path_list[0];
};

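/*
 * Allocate the per-target context, including the trailing array of
 * nr_paths switch_path entries, and attach it to the target.
 */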
static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
					   unsigned region_size)
{
	struct switch_ctx *sctx;

	sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_path),
		       GFP_KERNEL);
	if (!sctx)
		return NULL;

	sctx->ti = ti;
	sctx->region_size = region_size;

	ti->private = sctx;

	return sctx;
}

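/*
 * Size and allocate the region table: pick the narrowest entry width that
 * can hold any path number, pack as many entries as fit into one
 * region_table_slot_t, and vmalloc enough slots to cover every region of
 * the target.
 */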
static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
{
	struct switch_ctx *sctx = ti->private;
	sector_t nr_regions = ti->len;
	sector_t nr_slots;

	if (!(sctx->region_size & (sctx->region_size - 1)))
		sctx->region_size_bits = __ffs(sctx->region_size);
	else
		sctx->region_size_bits = -1;

	sctx->region_table_entry_bits = 1;
	while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
	       (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
		sctx->region_table_entry_bits++;

	sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits;
	if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
		sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
	else
		sctx->region_entries_per_slot_bits = -1;

	if (sector_div(nr_regions, sctx->region_size))
		nr_regions++;

	sctx->nr_regions = nr_regions;
	if (sctx->nr_regions != nr_regions || sctx->nr_regions >= ULONG_MAX) {
		ti->error = "Region table too large";
		return -EINVAL;
	}

	nr_slots = nr_regions;
	if (sector_div(nr_slots, sctx->region_entries_per_slot))
		nr_slots++;

	if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
		ti->error = "Region table too large";
		return -EINVAL;
	}

	sctx->region_table = vmalloc(nr_slots * sizeof(region_table_slot_t));
	if (!sctx->region_table) {
		ti->error = "Cannot allocate region table";
		return -ENOMEM;
	}

	return 0;
}

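/*
 * Convert a region number into the slot index and bit offset of its entry
 * in the packed region table, using shifts when the entries-per-slot count
 * is a power of two and division otherwise.
 */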
static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
				unsigned long *region_index, unsigned *bit)
{
	if (sctx->region_entries_per_slot_bits >= 0) {
		*region_index = region_nr >> sctx->region_entries_per_slot_bits;
		*bit = region_nr & (sctx->region_entries_per_slot - 1);
	} else {
		*region_index = region_nr / sctx->region_entries_per_slot;
		*bit = region_nr % sctx->region_entries_per_slot;
	}

	*bit *= sctx->region_table_entry_bits;
}

/*
 * Find which path to use at the given offset.
 */
static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
{
	unsigned long region_index;
	unsigned bit, path_nr;
	sector_t p;

	p = offset;
	if (sctx->region_size_bits >= 0)
		p >>= sctx->region_size_bits;
	else
		sector_div(p, sctx->region_size);

	switch_get_position(sctx, p, &region_index, &bit);
	path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
		  ((1 << sctx->region_table_entry_bits) - 1);

	/* This can only happen if the processor uses non-atomic stores. */
	if (unlikely(path_nr >= sctx->nr_paths))
		path_nr = 0;

	return path_nr;
}

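/*
 * Store the path number for one region in the packed region table.
 * Writers are serialised by the caller (the constructor, or the message
 * handler's mutex); readers use ACCESS_ONCE in switch_get_path_nr().
 */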
static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
				      unsigned value)
{
	unsigned long region_index;
	unsigned bit;
	region_table_slot_t pte;

	switch_get_position(sctx, region_nr, &region_index, &bit);

	pte = sctx->region_table[region_index];
	pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
	pte |= (region_table_slot_t)value << bit;
	sctx->region_table[region_index] = pte;
}

/*
 * Fill the region table with an initial round robin pattern.
 */
static void initialise_region_table(struct switch_ctx *sctx)
{
	unsigned path_nr = 0;
	unsigned long region_nr;

	for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
		switch_region_table_write(sctx, region_nr, path_nr);
		if (++path_nr >= sctx->nr_paths)
			path_nr = 0;
	}
}

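/*
 * Parse one <dev_path> <offset> pair from the constructor arguments,
 * taking a reference on the underlying device and recording its start
 * sector in the next free path_list slot.
 */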
static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
{
	struct switch_ctx *sctx = ti->private;
	unsigned long long start;
	int r;

	r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
			  &sctx->path_list[sctx->nr_paths].dmdev);
	if (r) {
		ti->error = "Device lookup failed";
		return r;
	}

	if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
		ti->error = "Invalid device starting offset";
		dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
		return -EINVAL;
	}

	sctx->path_list[sctx->nr_paths].start = start;

	sctx->nr_paths++;

	return 0;
}

/*
 * Destructor: Don't free the dm_target, just the ti->private data (if any).
 */
static void switch_dtr(struct dm_target *ti)
{
	struct switch_ctx *sctx = ti->private;

	while (sctx->nr_paths--)
		dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);

	vfree(sctx->region_table);
	kfree(sctx);
}

/*
 * Constructor arguments:
 *    <num_paths> <region_size> <num_optional_args> [<optional_args>...]
 *    [<dev_path> <offset>]+
 *
 * Optional args are to allow for future extension: currently this
 * parameter must be 0.
 */
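/*
 * Illustrative example (not part of this file; device names and lengths
 * are placeholders): a table line for a two-path target with 128-sector
 * regions and no optional args might look like
 *
 *    0 1638400 switch 2 128 0 /dev/sdb 0 /dev/sdc 0
 */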
static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	static struct dm_arg _args[] = {
		{1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
		{1, UINT_MAX, "Invalid region size"},
		{0, 0, "Invalid number of optional args"},
	};

	struct switch_ctx *sctx;
	struct dm_arg_set as;
	unsigned nr_paths, region_size, nr_optional_args;
	int r;

	as.argc = argc;
	as.argv = argv;

	r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
	if (r)
		return -EINVAL;

	r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
	if (r)
		return r;

	r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
	if (r)
		return r;
	/* parse optional arguments here, if we add any */

	if (as.argc != nr_paths * 2) {
		ti->error = "Incorrect number of path arguments";
		return -EINVAL;
	}

	sctx = alloc_switch_ctx(ti, nr_paths, region_size);
	if (!sctx) {
		ti->error = "Cannot allocate redirection context";
		return -ENOMEM;
	}

	r = dm_set_target_max_io_len(ti, region_size);
	if (r)
		goto error;

	while (as.argc) {
		r = parse_path(&as, ti);
		if (r)
			goto error;
	}

	r = alloc_region_table(ti, nr_paths);
	if (r)
		goto error;

	initialise_region_table(sctx);

	/* For UNMAP, sending the request down any path is sufficient */
	ti->num_discard_bios = 1;

	return 0;

error:
	switch_dtr(ti);

	return r;
}

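/*
 * Map a bio: look up the path for the bio's target-relative offset and
 * redirect the bio to that device, shifted by the path's start sector.
 */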
static int switch_map(struct dm_target *ti, struct bio *bio)
{
	struct switch_ctx *sctx = ti->private;
	sector_t offset = dm_target_offset(ti, bio->bi_sector);
	unsigned path_nr = switch_get_path_nr(sctx, offset);

	bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev;
	bio->bi_sector = sctx->path_list[path_nr].start + offset;

	return DM_MAPIO_REMAPPED;
}

/*
 * We need to parse hex numbers in the message as quickly as possible.
 *
 * This table-based hex parser improves performance.
 * It reduces the time to load 1,000,000 entries compared to the
 * condition-based parser:
 *
 *		table-based parser	condition-based parser
 * PA-RISC	0.29s			0.31s
 * Opteron	0.0495s			0.0498s
 */
static const unsigned char hex_table[256] = {
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
};

static __always_inline unsigned long parse_hex(const char **string)
{
	unsigned char d;
	unsigned long r = 0;

	while ((d = hex_table[(unsigned char)**string]) < 16) {
		r = (r << 4) | d;
		(*string)++;
	}

	return r;
}

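/*
 * Process a set_region_mappings message.  Each argument after the message
 * name is either <region>:<path> or :<path>, both in hex; the ':' form
 * applies to the region following the last one that was set.
 */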
static int process_set_region_mappings(struct switch_ctx *sctx,
					unsigned argc, char **argv)
{
	unsigned i;
	unsigned long region_index = 0;

	for (i = 1; i < argc; i++) {
		unsigned long path_nr;
		const char *string = argv[i];

		if (*string == ':')
			region_index++;
		else {
			region_index = parse_hex(&string);
			if (unlikely(*string != ':')) {
				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
				return -EINVAL;
			}
		}

		string++;
		if (unlikely(!*string)) {
			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
			return -EINVAL;
		}

		path_nr = parse_hex(&string);
		if (unlikely(*string)) {
			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
			return -EINVAL;
		}
		if (unlikely(region_index >= sctx->nr_regions)) {
			DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions);
			return -EINVAL;
		}
		if (unlikely(path_nr >= sctx->nr_paths)) {
			DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths);
			return -EINVAL;
		}

		switch_region_table_write(sctx, region_index, path_nr);
	}

	return 0;
}

/*
 * Messages are processed one-at-a-time.
 *
 * Only set_region_mappings is supported.
 */
static int switch_message(struct dm_target *ti, unsigned argc, char **argv)
{
	static DEFINE_MUTEX(message_mutex);

	struct switch_ctx *sctx = ti->private;
	int r = -EINVAL;

	mutex_lock(&message_mutex);

	if (!strcasecmp(argv[0], "set_region_mappings"))
		r = process_set_region_mappings(sctx, argc, argv);
	else
		DMWARN("Unrecognised message received.");

	mutex_unlock(&message_mutex);

	return r;
}

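/*
 * Report status: STATUSTYPE_INFO has nothing to report, while
 * STATUSTYPE_TABLE re-emits the constructor parameters (with 0 optional
 * args) so the table line can be reloaded.
 */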
static void switch_status(struct dm_target *ti, status_type_t type,
			  unsigned status_flags, char *result, unsigned maxlen)
{
	struct switch_ctx *sctx = ti->private;
	unsigned sz = 0;
	int path_nr;

	switch (type) {
	case STATUSTYPE_INFO:
		result[0] = '\0';
		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
		for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
			DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name,
			       (unsigned long long)sctx->path_list[path_nr].start);
		break;
	}
}

/*
 * Switch ioctl:
 *
 * Pass all ioctls through to the path that maps sector 0.
 */
static int switch_ioctl(struct dm_target *ti, unsigned cmd,
			unsigned long arg)
{
	struct switch_ctx *sctx = ti->private;
	struct block_device *bdev;
	fmode_t mode;
	unsigned path_nr;
	int r = 0;

	path_nr = switch_get_path_nr(sctx, 0);

	bdev = sctx->path_list[path_nr].dmdev->bdev;
	mode = sctx->path_list[path_nr].dmdev->mode;

	/*
	 * Only pass ioctls through if the device sizes match exactly.
	 */
	if (ti->len + sctx->path_list[path_nr].start != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
		r = scsi_verify_blk_ioctl(NULL, cmd);

	return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
}

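/*
 * Iterate over every underlying path, invoking the callout for each
 * device over the full target length (used by device-mapper when
 * validating and stacking device limits).
 */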
static int switch_iterate_devices(struct dm_target *ti,
				  iterate_devices_callout_fn fn, void *data)
{
	struct switch_ctx *sctx = ti->private;
	int path_nr;
	int r;

	for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
		r = fn(ti, sctx->path_list[path_nr].dmdev,
			 sctx->path_list[path_nr].start, ti->len, data);
		if (r)
			return r;
	}

	return 0;
}

static struct target_type switch_target = {
	.name = "switch",
	.version = {1, 0, 0},
	.module = THIS_MODULE,
	.ctr = switch_ctr,
	.dtr = switch_dtr,
	.map = switch_map,
	.message = switch_message,
	.status = switch_status,
	.ioctl = switch_ioctl,
	.iterate_devices = switch_iterate_devices,
};

static int __init dm_switch_init(void)
{
	int r;

	r = dm_register_target(&switch_target);
	if (r < 0)
		DMERR("dm_register_target() failed %d", r);

	return r;
}

static void __exit dm_switch_exit(void)
{
	dm_unregister_target(&switch_target);
}

module_init(dm_switch_init);
module_exit(dm_switch_exit);

MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>");
MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
MODULE_LICENSE("GPL");