blk-mq.c

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/llist.h>
#include <linux/list_sort.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
#include <linux/delay.h>

#include <trace/events/block.h>

#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);

DEFINE_PER_CPU(struct llist_head, ipi_lists);

static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
					   unsigned int cpu)
{
	return per_cpu_ptr(q->queue_ctx, cpu);
}

/*
 * This assumes per-cpu software queueing queues. They could be per-node
 * as well, for instance. For now this is hardcoded as-is. Note that we don't
 * care about preemption, since we know the ctx's are persistent. This does
 * mean that we can't rely on ctx always matching the currently running CPU.
 */
static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
{
	return __blk_mq_get_ctx(q, get_cpu());
}

static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
{
	put_cpu();
}
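
/*
 * Illustrative sketch (not part of the original file): the way this pair is
 * used throughout this file is get -> map -> use -> put, with the caller
 * pinned to a CPU for the duration.
 *
 *	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
 *	struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
 *	... queue work on ctx/hctx ...
 *	blk_mq_put_ctx(ctx);
 */
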
/*
 * Check if any of the ctx's have pending work in this hardware queue
 */
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
	unsigned int i;

	for (i = 0; i < hctx->nr_ctx_map; i++)
		if (hctx->ctx_map[i])
			return true;

	return false;
}

/*
 * Mark this ctx as having pending work in this hardware queue
 */
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_ctx *ctx)
{
	if (!test_bit(ctx->index_hw, hctx->ctx_map))
		set_bit(ctx->index_hw, hctx->ctx_map);
}

static struct request *blk_mq_alloc_rq(struct blk_mq_hw_ctx *hctx, gfp_t gfp,
				       bool reserved)
{
	struct request *rq;
	unsigned int tag;

	tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
	if (tag != BLK_MQ_TAG_FAIL) {
		rq = hctx->rqs[tag];
		rq->tag = tag;

		return rq;
	}

	return NULL;
}

static int blk_mq_queue_enter(struct request_queue *q)
{
	int ret;

	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
	smp_wmb();
	/* we have problems freezing the queue if it's initializing */
	if (!blk_queue_bypass(q) || !blk_queue_init_done(q))
		return 0;

	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);

	spin_lock_irq(q->queue_lock);
	ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
		!blk_queue_bypass(q), *q->queue_lock);
	/* inc usage with lock hold to avoid freeze_queue runs here */
	if (!ret)
		__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
	spin_unlock_irq(q->queue_lock);

	return ret;
}

static void blk_mq_queue_exit(struct request_queue *q)
{
	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
}

/*
 * Guarantee no request is in use, so we can change any data structure of
 * the queue afterward.
 */
static void blk_mq_freeze_queue(struct request_queue *q)
{
	bool drain;

	spin_lock_irq(q->queue_lock);
	drain = !q->bypass_depth++;
	queue_flag_set(QUEUE_FLAG_BYPASS, q);
	spin_unlock_irq(q->queue_lock);

	if (!drain)
		return;

	while (true) {
		s64 count;

		spin_lock_irq(q->queue_lock);
		count = percpu_counter_sum(&q->mq_usage_counter);
		spin_unlock_irq(q->queue_lock);

		if (count == 0)
			break;
		blk_mq_run_queues(q, false);
		msleep(10);
	}
}

static void blk_mq_unfreeze_queue(struct request_queue *q)
{
	bool wake = false;

	spin_lock_irq(q->queue_lock);
	if (!--q->bypass_depth) {
		queue_flag_clear(QUEUE_FLAG_BYPASS, q);
		wake = true;
	}
	WARN_ON_ONCE(q->bypass_depth < 0);
	spin_unlock_irq(q->queue_lock);
	if (wake)
		wake_up_all(&q->mq_freeze_wq);
}
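
/*
 * Illustrative sketch (not in the original source): a structural change to
 * the queue is expected to be bracketed by freeze/unfreeze so no request is
 * in flight while the data structures are updated, e.g.
 *
 *	blk_mq_freeze_queue(q);
 *	... modify q->mq_map, hctx mappings, etc. ...
 *	blk_mq_unfreeze_queue(q);
 *
 * blk_mq_queue_reinit() at the bottom of this file follows this pattern.
 */
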
bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
{
	return blk_mq_has_free_tags(hctx->tags);
}
EXPORT_SYMBOL(blk_mq_can_queue);

static void blk_mq_rq_ctx_init(struct blk_mq_ctx *ctx, struct request *rq,
			       unsigned int rw_flags)
{
	rq->mq_ctx = ctx;
	rq->cmd_flags = rw_flags;
	ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
}

static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
					      gfp_t gfp, bool reserved)
{
	return blk_mq_alloc_rq(hctx, gfp, reserved);
}

static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
						   int rw, gfp_t gfp,
						   bool reserved)
{
	struct request *rq;

	do {
		struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
		struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);

		rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
		if (rq) {
			blk_mq_rq_ctx_init(ctx, rq, rw);
			break;
		} else if (!(gfp & __GFP_WAIT))
			break;

		blk_mq_put_ctx(ctx);
		__blk_mq_run_hw_queue(hctx);
		blk_mq_wait_for_tags(hctx->tags);
	} while (1);

	return rq;
}

struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
				     gfp_t gfp, bool reserved)
{
	struct request *rq;

	if (blk_mq_queue_enter(q))
		return NULL;

	rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved);
	blk_mq_put_ctx(rq->mq_ctx);
	return rq;
}

struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,
					      gfp_t gfp)
{
	struct request *rq;

	if (blk_mq_queue_enter(q))
		return NULL;

	rq = blk_mq_alloc_request_pinned(q, rw, gfp, true);
	blk_mq_put_ctx(rq->mq_ctx);
	return rq;
}
EXPORT_SYMBOL(blk_mq_alloc_reserved_request);

/*
 * Re-init and set pdu, if we have it
 */
static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	blk_rq_init(hctx->queue, rq);

	if (hctx->cmd_size)
		rq->special = blk_mq_rq_to_pdu(rq);
}
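
/*
 * Illustrative sketch (not in the original source): a driver that declared a
 * cmd_size in its blk_mq_reg can reach its per-request payload through
 * blk_mq_rq_to_pdu(), which points just past struct request. The struct
 * my_cmd and function my_queue_rq below are hypothetical, for illustration
 * only.
 *
 *	struct my_cmd {
 *		u32 opcode;
 *	};
 *
 *	static int my_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
 *	{
 *		struct my_cmd *cmd = blk_mq_rq_to_pdu(rq);
 *
 *		cmd->opcode = rq_data_dir(rq);
 *		...
 *	}
 */
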
static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
				  struct blk_mq_ctx *ctx, struct request *rq)
{
	const int tag = rq->tag;
	struct request_queue *q = rq->q;

	blk_mq_rq_init(hctx, rq);
	blk_mq_put_tag(hctx->tags, tag);

	blk_mq_queue_exit(q);
}

void blk_mq_free_request(struct request *rq)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx;
	struct request_queue *q = rq->q;

	ctx->rq_completed[rq_is_sync(rq)]++;

	hctx = q->mq_ops->map_queue(q, ctx->cpu);
	__blk_mq_free_request(hctx, ctx, rq);
}

static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error)
{
	if (error)
		clear_bit(BIO_UPTODATE, &bio->bi_flags);
	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		error = -EIO;

	if (unlikely(rq->cmd_flags & REQ_QUIET))
		set_bit(BIO_QUIET, &bio->bi_flags);

	/* don't actually finish bio if it's part of flush sequence */
	if (!(rq->cmd_flags & REQ_FLUSH_SEQ))
		bio_endio(bio, error);
}

void blk_mq_complete_request(struct request *rq, int error)
{
	struct bio *bio = rq->bio;
	unsigned int bytes = 0;

	trace_block_rq_complete(rq->q, rq);

	while (bio) {
		struct bio *next = bio->bi_next;

		bio->bi_next = NULL;
		bytes += bio->bi_size;
		blk_mq_bio_endio(rq, bio, error);
		bio = next;
	}

	blk_account_io_completion(rq, bytes);

	if (rq->end_io)
		rq->end_io(rq, error);
	else
		blk_mq_free_request(rq);

	blk_account_io_done(rq);
}

void __blk_mq_end_io(struct request *rq, int error)
{
	if (!blk_mark_rq_complete(rq))
		blk_mq_complete_request(rq, error);
}

#if defined(CONFIG_SMP)

/*
 * Called with interrupts disabled.
 */
static void ipi_end_io(void *data)
{
	struct llist_head *list = &per_cpu(ipi_lists, smp_processor_id());
	struct llist_node *entry, *next;
	struct request *rq;

	entry = llist_del_all(list);

	while (entry) {
		next = entry->next;
		rq = llist_entry(entry, struct request, ll_list);
		__blk_mq_end_io(rq, rq->errors);
		entry = next;
	}
}

static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
			  struct request *rq, const int error)
{
	struct call_single_data *data = &rq->csd;

	rq->errors = error;
	rq->ll_list.next = NULL;

	/*
	 * If the list is non-empty, an existing IPI must already
	 * be "in flight". If that is the case, we need not schedule
	 * a new one.
	 */
	if (llist_add(&rq->ll_list, &per_cpu(ipi_lists, ctx->cpu))) {
		data->func = ipi_end_io;
		data->flags = 0;
		__smp_call_function_single(ctx->cpu, data, 0);
	}

	return true;
}
#else /* CONFIG_SMP */
static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
			  struct request *rq, const int error)
{
	return false;
}
#endif

/*
 * End IO on this request on a multiqueue enabled driver. We'll either do
 * it directly inline, or punt to a local IPI handler on the matching
 * remote CPU.
 */
void blk_mq_end_io(struct request *rq, int error)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	int cpu;

	if (!ctx->ipi_redirect)
		return __blk_mq_end_io(rq, error);

	cpu = get_cpu();

	if (cpu == ctx->cpu || !cpu_online(ctx->cpu) ||
	    !ipi_remote_cpu(ctx, cpu, rq, error))
		__blk_mq_end_io(rq, error);

	put_cpu();
}
EXPORT_SYMBOL(blk_mq_end_io);
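
/*
 * Illustrative sketch (not in the original source): a driver's completion
 * path (for example an interrupt handler) looks up the request it finished
 * and hands it back through blk_mq_end_io(), which either completes it
 * inline or bounces it to the submitting CPU via the IPI path above.
 * my_irq_handler and read_completed_tag_from_hw are hypothetical.
 *
 *	static irqreturn_t my_irq_handler(int irq, void *data)
 *	{
 *		struct blk_mq_hw_ctx *hctx = data;
 *		unsigned int tag = read_completed_tag_from_hw();
 *
 *		blk_mq_end_io(hctx->rqs[tag], 0);
 *		return IRQ_HANDLED;
 *	}
 */
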
static void blk_mq_start_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	trace_block_rq_issue(q, rq);

	/*
	 * Just mark start time and set the started bit. Due to memory
	 * ordering, we know we'll see the correct deadline as long as
	 * REQ_ATOM_STARTED is seen.
	 */
	rq->deadline = jiffies + q->rq_timeout;
	set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
}

static void blk_mq_requeue_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	trace_block_rq_requeue(q, rq);
	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
}

struct blk_mq_timeout_data {
	struct blk_mq_hw_ctx *hctx;
	unsigned long *next;
	unsigned int *next_set;
};

static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
{
	struct blk_mq_timeout_data *data = __data;
	struct blk_mq_hw_ctx *hctx = data->hctx;
	unsigned int tag;

	/*
	 * It may not be in flight yet (this is where
	 * the REQ_ATOM_STARTED flag comes in). The requests are
	 * statically allocated, so we know it's always safe to access the
	 * memory associated with a bit offset into ->rqs[].
	 */
	tag = 0;
	do {
		struct request *rq;

		tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag);
		if (tag >= hctx->queue_depth)
			break;

		rq = hctx->rqs[tag++];

		if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
			continue;

		blk_rq_check_expired(rq, data->next, data->next_set);
	} while (1);
}

static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
					unsigned long *next,
					unsigned int *next_set)
{
	struct blk_mq_timeout_data data = {
		.hctx		= hctx,
		.next		= next,
		.next_set	= next_set,
	};

	/*
	 * Ask the tagging code to iterate busy requests, so we can
	 * check them for timeout.
	 */
	blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
}

static void blk_mq_rq_timer(unsigned long data)
{
	struct request_queue *q = (struct request_queue *) data;
	struct blk_mq_hw_ctx *hctx;
	unsigned long next = 0;
	int i, next_set = 0;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);

	if (next_set)
		mod_timer(&q->timeout, round_jiffies_up(next));
}

/*
 * Reverse check our software queue for entries that we could potentially
 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
 * too much time checking for merges.
 */
static bool blk_mq_attempt_merge(struct request_queue *q,
				 struct blk_mq_ctx *ctx, struct bio *bio)
{
	struct request *rq;
	int checked = 8;

	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
		int el_ret;

		if (!checked--)
			break;

		if (!blk_rq_merge_ok(rq, bio))
			continue;

		el_ret = blk_try_merge(rq, bio);
		if (el_ret == ELEVATOR_BACK_MERGE) {
			if (bio_attempt_back_merge(q, rq, bio)) {
				ctx->rq_merged++;
				return true;
			}
			break;
		} else if (el_ret == ELEVATOR_FRONT_MERGE) {
			if (bio_attempt_front_merge(q, rq, bio)) {
				ctx->rq_merged++;
				return true;
			}
			break;
		}
	}

	return false;
}

void blk_mq_add_timer(struct request *rq)
{
	__blk_add_timer(rq, NULL);
}

/*
 * Run this hardware queue, pulling any software queues mapped to it in.
 * Note that this function currently has various problems around ordering
 * of IO. In particular, we'd like FIFO behaviour on handling existing
 * items on the hctx->dispatch list. Ignore that for now.
 */
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct blk_mq_ctx *ctx;
	struct request *rq;
	LIST_HEAD(rq_list);
	int bit, queued;

	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
		return;

	hctx->run++;

	/*
	 * Touch any software queue that has pending entries.
	 */
	for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
		clear_bit(bit, hctx->ctx_map);
		ctx = hctx->ctxs[bit];
		BUG_ON(bit != ctx->index_hw);

		spin_lock(&ctx->lock);
		list_splice_tail_init(&ctx->rq_list, &rq_list);
		spin_unlock(&ctx->lock);
	}

	/*
	 * If we have previous entries on our dispatch list, grab them
	 * and stuff them at the front for more fair dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	/*
	 * Delete and return all entries from our dispatch list
	 */
	queued = 0;

	/*
	 * Now process all the entries, sending them to the driver.
	 */
	while (!list_empty(&rq_list)) {
		int ret;

		rq = list_first_entry(&rq_list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		blk_mq_start_request(rq);

		/*
		 * Last request in the series. Flag it as such, this
		 * enables drivers to know when IO should be kicked off,
		 * if they don't do it on a per-request basis.
		 *
		 * Note: the flag isn't the only condition on which drivers
		 * should kick off IO. If the drive is busy, the last
		 * request might not have the bit set.
		 */
		if (list_empty(&rq_list))
			rq->cmd_flags |= REQ_END;

		ret = q->mq_ops->queue_rq(hctx, rq);
		switch (ret) {
		case BLK_MQ_RQ_QUEUE_OK:
			queued++;
			continue;
		case BLK_MQ_RQ_QUEUE_BUSY:
			/*
			 * FIXME: we should have a mechanism to stop the queue
			 * like blk_stop_queue, otherwise we will waste cpu
			 * time
			 */
			list_add(&rq->queuelist, &rq_list);
			blk_mq_requeue_request(rq);
			break;
		default:
			pr_err("blk-mq: bad return on queue: %d\n", ret);
			rq->errors = -EIO;
			/* fall through to error completion */
		case BLK_MQ_RQ_QUEUE_ERROR:
			blk_mq_end_io(rq, rq->errors);
			break;
		}

		if (ret == BLK_MQ_RQ_QUEUE_BUSY)
			break;
	}

	if (!queued)
		hctx->dispatched[0]++;
	else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
		hctx->dispatched[ilog2(queued) + 1]++;

	/*
	 * Any items that need requeuing? Stuff them into hctx->dispatch,
	 * that is where we will continue on next queue run.
	 */
	if (!list_empty(&rq_list)) {
		spin_lock(&hctx->lock);
		list_splice(&rq_list, &hctx->dispatch);
		spin_unlock(&hctx->lock);
	}
}
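
/*
 * Illustrative sketch (not in the original source): the dispatch loop above
 * expects ->queue_rq() to return one of three values. A minimal hypothetical
 * handler could look like this; my_queue_rq, my_device_busy and my_submit
 * are placeholders, not real driver functions.
 *
 *	static int my_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
 *	{
 *		if (my_device_busy(hctx))
 *			return BLK_MQ_RQ_QUEUE_BUSY;	// requeued, retried on next run
 *
 *		if (my_submit(hctx, rq))
 *			return BLK_MQ_RQ_QUEUE_ERROR;	// completed with rq->errors
 *
 *		return BLK_MQ_RQ_QUEUE_OK;
 *	}
 */
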
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
		return;

	if (!async)
		__blk_mq_run_hw_queue(hctx);
	else {
		struct request_queue *q = hctx->queue;

		kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0);
	}
}

void blk_mq_run_queues(struct request_queue *q, bool async)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if ((!blk_mq_hctx_has_pending(hctx) &&
		    list_empty_careful(&hctx->dispatch)) ||
		    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
			continue;

		blk_mq_run_hw_queue(hctx, async);
	}
}
EXPORT_SYMBOL(blk_mq_run_queues);

void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	cancel_delayed_work(&hctx->delayed_work);
	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queue);

void blk_mq_stop_hw_queues(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_stop_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queues);

void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
	__blk_mq_run_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_start_hw_queue);

void blk_mq_start_stopped_hw_queues(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
			continue;

		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
		blk_mq_run_hw_queue(hctx, true);
	}
}
EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);

static void blk_mq_work_fn(struct work_struct *work)
{
	struct blk_mq_hw_ctx *hctx;

	hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work);

	__blk_mq_run_hw_queue(hctx);
}

static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
				    struct request *rq)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;

	list_add_tail(&rq->queuelist, &ctx->rq_list);
	blk_mq_hctx_mark_pending(hctx, ctx);

	/*
	 * We do this early, to ensure we are on the right CPU.
	 */
	blk_mq_add_timer(rq);
}

void blk_mq_insert_request(struct request_queue *q, struct request *rq,
			   bool run_queue)
{
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx, *current_ctx;

	ctx = rq->mq_ctx;
	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
		blk_insert_flush(rq);
	} else {
		current_ctx = blk_mq_get_ctx(q);

		if (!cpu_online(ctx->cpu)) {
			ctx = current_ctx;
			hctx = q->mq_ops->map_queue(q, ctx->cpu);
			rq->mq_ctx = ctx;
		}
		spin_lock(&ctx->lock);
		__blk_mq_insert_request(hctx, rq);
		spin_unlock(&ctx->lock);

		blk_mq_put_ctx(current_ctx);
	}

	if (run_queue)
		__blk_mq_run_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_insert_request);

/*
 * This is a special version of blk_mq_insert_request to bypass the FLUSH
 * request check. Should only be used internally.
 */
void blk_mq_run_request(struct request *rq, bool run_queue, bool async)
{
	struct request_queue *q = rq->q;
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx, *current_ctx;

	current_ctx = blk_mq_get_ctx(q);

	ctx = rq->mq_ctx;
	if (!cpu_online(ctx->cpu)) {
		ctx = current_ctx;
		rq->mq_ctx = ctx;
	}
	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	/* ctx->cpu might be offline */
	spin_lock(&ctx->lock);
	__blk_mq_insert_request(hctx, rq);
	spin_unlock(&ctx->lock);

	blk_mq_put_ctx(current_ctx);

	if (run_queue)
		blk_mq_run_hw_queue(hctx, async);
}

static void blk_mq_insert_requests(struct request_queue *q,
				   struct blk_mq_ctx *ctx,
				   struct list_head *list,
				   int depth,
				   bool from_schedule)
{
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *current_ctx;

	trace_block_unplug(q, depth, !from_schedule);

	current_ctx = blk_mq_get_ctx(q);

	if (!cpu_online(ctx->cpu))
		ctx = current_ctx;
	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	/*
	 * preemption doesn't flush the plug list, so it's possible ctx->cpu
	 * is offline now
	 */
	spin_lock(&ctx->lock);
	while (!list_empty(list)) {
		struct request *rq;

		rq = list_first_entry(list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		rq->mq_ctx = ctx;
		__blk_mq_insert_request(hctx, rq);
	}
	spin_unlock(&ctx->lock);

	blk_mq_put_ctx(current_ctx);

	blk_mq_run_hw_queue(hctx, from_schedule);
}

static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct request *rqa = container_of(a, struct request, queuelist);
	struct request *rqb = container_of(b, struct request, queuelist);

	return !(rqa->mq_ctx < rqb->mq_ctx ||
		 (rqa->mq_ctx == rqb->mq_ctx &&
		  blk_rq_pos(rqa) < blk_rq_pos(rqb)));
}

void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
	struct blk_mq_ctx *this_ctx;
	struct request_queue *this_q;
	struct request *rq;
	LIST_HEAD(list);
	LIST_HEAD(ctx_list);
	unsigned int depth;

	list_splice_init(&plug->mq_list, &list);

	list_sort(NULL, &list, plug_ctx_cmp);

	this_q = NULL;
	this_ctx = NULL;
	depth = 0;

	while (!list_empty(&list)) {
		rq = list_entry_rq(list.next);
		list_del_init(&rq->queuelist);
		BUG_ON(!rq->q);
		if (rq->mq_ctx != this_ctx) {
			if (this_ctx) {
				blk_mq_insert_requests(this_q, this_ctx,
							&ctx_list, depth,
							from_schedule);
			}

			this_ctx = rq->mq_ctx;
			this_q = rq->q;
			depth = 0;
		}

		depth++;
		list_add_tail(&rq->queuelist, &ctx_list);
	}

	/*
	 * If 'this_ctx' is set, we know we have entries to complete
	 * on 'ctx_list'. Do those.
	 */
	if (this_ctx) {
		blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
				       from_schedule);
	}
}

static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
{
	init_request_from_bio(rq, bio);
	blk_account_io_start(rq, 1);
}

static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx;
	const int is_sync = rw_is_sync(bio->bi_rw);
	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
	int rw = bio_data_dir(bio);
	struct request *rq;
	unsigned int use_plug, request_count = 0;

	/*
	 * If we have multiple hardware queues, just go directly to
	 * one of those for sync IO.
	 */
	use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync);

	blk_queue_bounce(q, &bio);

	if (use_plug && blk_attempt_plug_merge(q, bio, &request_count))
		return;

	if (blk_mq_queue_enter(q)) {
		bio_endio(bio, -EIO);
		return;
	}

	ctx = blk_mq_get_ctx(q);
	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	trace_block_getrq(q, bio, rw);
	rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
	if (likely(rq))
		blk_mq_rq_ctx_init(ctx, rq, rw);
	else {
		blk_mq_put_ctx(ctx);
		trace_block_sleeprq(q, bio, rw);
		rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC,
							false);
		ctx = rq->mq_ctx;
		hctx = q->mq_ops->map_queue(q, ctx->cpu);
	}

	hctx->queued++;

	if (unlikely(is_flush_fua)) {
		blk_mq_bio_to_request(rq, bio);
		blk_mq_put_ctx(ctx);
		blk_insert_flush(rq);
		goto run_queue;
	}

	/*
	 * If a task plug currently exists: since this is completely lockless,
	 * utilize it to temporarily store requests until the task is
	 * either done or scheduled away.
	 */
	if (use_plug) {
		struct blk_plug *plug = current->plug;

		if (plug) {
			blk_mq_bio_to_request(rq, bio);
			if (list_empty(&plug->mq_list))
				trace_block_plug(q);
			else if (request_count >= BLK_MAX_REQUEST_COUNT) {
				blk_flush_plug_list(plug, false);
				trace_block_plug(q);
			}
			list_add_tail(&rq->queuelist, &plug->mq_list);
			blk_mq_put_ctx(ctx);
			return;
		}
	}

	spin_lock(&ctx->lock);

	if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
	    blk_mq_attempt_merge(q, ctx, bio))
		__blk_mq_free_request(hctx, ctx, rq);
	else {
		blk_mq_bio_to_request(rq, bio);
		__blk_mq_insert_request(hctx, rq);
	}

	spin_unlock(&ctx->lock);
	blk_mq_put_ctx(ctx);

	/*
	 * For a SYNC request, send it to the hardware immediately. For an
	 * ASYNC request, just ensure that we run it later on. The latter
	 * allows for merging opportunities and more efficient dispatching.
	 */
run_queue:
	blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua);
}

/*
 * Default mapping to a software queue, since we use one per CPU.
 */
struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
{
	return q->queue_hw_ctx[q->mq_map[cpu]];
}
EXPORT_SYMBOL(blk_mq_map_queue);
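
/*
 * Illustrative sketch (not in the original source): a driver with several
 * hardware queues could supply its own ->map_queue instead of the default
 * above, for example a simple modulo spread of CPUs over hardware queues.
 * my_map_queue is purely hypothetical.
 *
 *	static struct blk_mq_hw_ctx *my_map_queue(struct request_queue *q,
 *						  const int cpu)
 *	{
 *		return q->queue_hw_ctx[cpu % q->nr_hw_queues];
 *	}
 */
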
struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg,
						   unsigned int hctx_index)
{
	return kmalloc_node(sizeof(struct blk_mq_hw_ctx),
				GFP_KERNEL | __GFP_ZERO, reg->numa_node);
}
EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);

void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
				 unsigned int hctx_index)
{
	kfree(hctx);
}
EXPORT_SYMBOL(blk_mq_free_single_hw_queue);

static void blk_mq_hctx_notify(void *data, unsigned long action,
			       unsigned int cpu)
{
	struct blk_mq_hw_ctx *hctx = data;
	struct blk_mq_ctx *ctx;
	LIST_HEAD(tmp);

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return;

	/*
	 * Move ctx entries to new CPU, if this one is going away.
	 */
	ctx = __blk_mq_get_ctx(hctx->queue, cpu);

	spin_lock(&ctx->lock);
	if (!list_empty(&ctx->rq_list)) {
		list_splice_init(&ctx->rq_list, &tmp);
		clear_bit(ctx->index_hw, hctx->ctx_map);
	}
	spin_unlock(&ctx->lock);

	if (list_empty(&tmp))
		return;

	ctx = blk_mq_get_ctx(hctx->queue);
	spin_lock(&ctx->lock);

	while (!list_empty(&tmp)) {
		struct request *rq;

		rq = list_first_entry(&tmp, struct request, queuelist);
		rq->mq_ctx = ctx;
		list_move_tail(&rq->queuelist, &ctx->rq_list);
	}

	blk_mq_hctx_mark_pending(hctx, ctx);

	spin_unlock(&ctx->lock);
	blk_mq_put_ctx(ctx);
}

static void blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,
				    void (*init)(void *, struct blk_mq_hw_ctx *,
					struct request *, unsigned int),
				    void *data)
{
	unsigned int i;

	for (i = 0; i < hctx->queue_depth; i++) {
		struct request *rq = hctx->rqs[i];

		init(data, hctx, rq, i);
	}
}

void blk_mq_init_commands(struct request_queue *q,
			  void (*init)(void *, struct blk_mq_hw_ctx *,
					struct request *, unsigned int),
			  void *data)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_init_hw_commands(hctx, init, data);
}
EXPORT_SYMBOL(blk_mq_init_commands);

static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx)
{
	struct page *page;

	while (!list_empty(&hctx->page_list)) {
		page = list_first_entry(&hctx->page_list, struct page, list);
		list_del_init(&page->list);
		__free_pages(page, page->private);
	}

	kfree(hctx->rqs);

	if (hctx->tags)
		blk_mq_free_tags(hctx->tags);
}

static size_t order_to_size(unsigned int order)
{
	size_t ret = PAGE_SIZE;

	while (order--)
		ret *= 2;

	return ret;
}

static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
			      unsigned int reserved_tags, int node)
{
	unsigned int i, j, entries_per_page, max_order = 4;
	size_t rq_size, left;

	INIT_LIST_HEAD(&hctx->page_list);

	hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
					GFP_KERNEL, node);
	if (!hctx->rqs)
		return -ENOMEM;

	/*
	 * rq_size is the size of the request plus driver payload, rounded
	 * to the cacheline size
	 */
	rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
				cache_line_size());
	left = rq_size * hctx->queue_depth;

	for (i = 0; i < hctx->queue_depth;) {
		int this_order = max_order;
		struct page *page;
		int to_do;
		void *p;

		while (left < order_to_size(this_order - 1) && this_order)
			this_order--;

		do {
			page = alloc_pages_node(node, GFP_KERNEL, this_order);
			if (page)
				break;
			if (!this_order--)
				break;
			if (order_to_size(this_order) < rq_size)
				break;
		} while (1);

		if (!page)
			break;

		page->private = this_order;
		list_add_tail(&page->list, &hctx->page_list);

		p = page_address(page);
		entries_per_page = order_to_size(this_order) / rq_size;
		to_do = min(entries_per_page, hctx->queue_depth - i);
		left -= to_do * rq_size;
		for (j = 0; j < to_do; j++) {
			hctx->rqs[i] = p;
			blk_mq_rq_init(hctx, hctx->rqs[i]);
			p += rq_size;
			i++;
		}
	}

	if (i < (reserved_tags + BLK_MQ_TAG_MIN))
		goto err_rq_map;
	else if (i != hctx->queue_depth) {
		hctx->queue_depth = i;
		pr_warn("%s: queue depth set to %u because of low memory\n",
					__func__, i);
	}

	hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node);
	if (!hctx->tags) {
err_rq_map:
		blk_mq_free_rq_map(hctx);
		return -ENOMEM;
	}

	return 0;
}
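
/*
 * Illustrative numbers (not in the original source), assuming a 64-byte
 * cache line, 4K pages, a 384-byte struct request and cmd_size == 64:
 *
 *	rq_size          = round_up(384 + 64, 64) = 448 bytes
 *	order-2 alloc    = 4 * 4096 = 16384 bytes
 *	entries_per_page = 16384 / 448 = 36 requests per allocation
 *
 * so a queue_depth of 64 would typically take two such allocations.
 */
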
static int blk_mq_init_hw_queues(struct request_queue *q,
				 struct blk_mq_reg *reg, void *driver_data)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i, j;

	/*
	 * Initialize hardware queues
	 */
	queue_for_each_hw_ctx(q, hctx, i) {
		unsigned int num_maps;
		int node;

		node = hctx->numa_node;
		if (node == NUMA_NO_NODE)
			node = hctx->numa_node = reg->numa_node;

		INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn);
		spin_lock_init(&hctx->lock);
		INIT_LIST_HEAD(&hctx->dispatch);
		hctx->queue = q;
		hctx->queue_num = i;
		hctx->flags = reg->flags;
		hctx->queue_depth = reg->queue_depth;
		hctx->cmd_size = reg->cmd_size;

		blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
						blk_mq_hctx_notify, hctx);
		blk_mq_register_cpu_notifier(&hctx->cpu_notifier);

		if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node))
			break;

		/*
		 * Allocate space for all possible cpus to avoid allocation at
		 * runtime
		 */
		hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
						GFP_KERNEL, node);
		if (!hctx->ctxs)
			break;

		num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
		hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
						GFP_KERNEL, node);
		if (!hctx->ctx_map)
			break;

		hctx->nr_ctx_map = num_maps;
		hctx->nr_ctx = 0;

		if (reg->ops->init_hctx &&
		    reg->ops->init_hctx(hctx, driver_data, i))
			break;
	}

	if (i == q->nr_hw_queues)
		return 0;

	/*
	 * Init failed
	 */
	queue_for_each_hw_ctx(q, hctx, j) {
		if (i == j)
			break;

		if (reg->ops->exit_hctx)
			reg->ops->exit_hctx(hctx, j);

		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
		blk_mq_free_rq_map(hctx);
		kfree(hctx->ctxs);
	}

	return 1;
}

static void blk_mq_init_cpu_queues(struct request_queue *q,
				   unsigned int nr_hw_queues)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
		struct blk_mq_hw_ctx *hctx;

		memset(__ctx, 0, sizeof(*__ctx));
		__ctx->cpu = i;
		spin_lock_init(&__ctx->lock);
		INIT_LIST_HEAD(&__ctx->rq_list);
		__ctx->queue = q;

		/* If the cpu isn't online, the cpu is mapped to first hctx */
		hctx = q->mq_ops->map_queue(q, i);
		hctx->nr_ctx++;

		if (!cpu_online(i))
			continue;

		/*
		 * Set local node, IFF we have more than one hw queue. If
		 * not, we remain on the home node of the device
		 */
		if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
			hctx->numa_node = cpu_to_node(i);
	}
}

static void blk_mq_map_swqueue(struct request_queue *q)
{
	unsigned int i;
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx;

	queue_for_each_hw_ctx(q, hctx, i) {
		hctx->nr_ctx = 0;
	}

	/*
	 * Map software to hardware queues
	 */
	queue_for_each_ctx(q, ctx, i) {
		/* If the cpu isn't online, the cpu is mapped to first hctx */
		hctx = q->mq_ops->map_queue(q, i);
		ctx->index_hw = hctx->nr_ctx;
		hctx->ctxs[hctx->nr_ctx++] = ctx;
	}
}

struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
					void *driver_data)
{
	struct blk_mq_hw_ctx **hctxs;
	struct blk_mq_ctx *ctx;
	struct request_queue *q;
	int i;

	if (!reg->nr_hw_queues ||
	    !reg->ops->queue_rq || !reg->ops->map_queue ||
	    !reg->ops->alloc_hctx || !reg->ops->free_hctx)
		return ERR_PTR(-EINVAL);

	if (!reg->queue_depth)
		reg->queue_depth = BLK_MQ_MAX_DEPTH;
	else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) {
		pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth);
		reg->queue_depth = BLK_MQ_MAX_DEPTH;
	}

	/*
	 * Set aside a tag for flush requests. It will only be used while
	 * another flush request is in progress but outside the driver.
	 *
	 * TODO: only allocate if flushes are supported
	 */
	reg->queue_depth++;
	reg->reserved_tags++;

	if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
		return ERR_PTR(-EINVAL);

	ctx = alloc_percpu(struct blk_mq_ctx);
	if (!ctx)
		return ERR_PTR(-ENOMEM);

	hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
			reg->numa_node);

	if (!hctxs)
		goto err_percpu;

	for (i = 0; i < reg->nr_hw_queues; i++) {
		hctxs[i] = reg->ops->alloc_hctx(reg, i);
		if (!hctxs[i])
			goto err_hctxs;

		hctxs[i]->numa_node = NUMA_NO_NODE;
		hctxs[i]->queue_num = i;
	}

	q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node);
	if (!q)
		goto err_hctxs;

	q->mq_map = blk_mq_make_queue_map(reg);
	if (!q->mq_map)
		goto err_map;

	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
	blk_queue_rq_timeout(q, 30000);

	q->nr_queues = nr_cpu_ids;
	q->nr_hw_queues = reg->nr_hw_queues;

	q->queue_ctx = ctx;
	q->queue_hw_ctx = hctxs;

	q->mq_ops = reg->ops;

	blk_queue_make_request(q, blk_mq_make_request);
	blk_queue_rq_timed_out(q, reg->ops->timeout);
	if (reg->timeout)
		blk_queue_rq_timeout(q, reg->timeout);

	blk_mq_init_flush(q);
	blk_mq_init_cpu_queues(q, reg->nr_hw_queues);

	if (blk_mq_init_hw_queues(q, reg, driver_data))
		goto err_hw;

	blk_mq_map_swqueue(q);

	mutex_lock(&all_q_mutex);
	list_add_tail(&q->all_q_node, &all_q_list);
	mutex_unlock(&all_q_mutex);

	return q;
err_hw:
	kfree(q->mq_map);
err_map:
	blk_cleanup_queue(q);
err_hctxs:
	for (i = 0; i < reg->nr_hw_queues; i++) {
		if (!hctxs[i])
			break;
		reg->ops->free_hctx(hctxs[i], i);
	}
	kfree(hctxs);
err_percpu:
	free_percpu(ctx);
	return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(blk_mq_init_queue);
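
/*
 * Illustrative sketch (not in the original source): a driver typically fills
 * in a blk_mq_reg plus its blk_mq_ops and then calls blk_mq_init_queue().
 * The my_* names are hypothetical placeholders; the single-queue alloc/free
 * helpers and default map_queue exported above are usable as-is.
 *
 *	static struct blk_mq_ops my_mq_ops = {
 *		.queue_rq	= my_queue_rq,
 *		.map_queue	= blk_mq_map_queue,
 *		.alloc_hctx	= blk_mq_alloc_single_hw_queue,
 *		.free_hctx	= blk_mq_free_single_hw_queue,
 *	};
 *
 *	static struct blk_mq_reg my_mq_reg = {
 *		.ops		= &my_mq_ops,
 *		.nr_hw_queues	= 1,
 *		.queue_depth	= 64,
 *		.cmd_size	= sizeof(struct my_cmd),
 *		.numa_node	= NUMA_NO_NODE,
 *		.flags		= BLK_MQ_F_SHOULD_MERGE,
 *	};
 *
 *	q = blk_mq_init_queue(&my_mq_reg, my_driver_data);
 *	if (IS_ERR(q))
 *		return PTR_ERR(q);
 */
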
void blk_mq_free_queue(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		cancel_delayed_work_sync(&hctx->delayed_work);
		kfree(hctx->ctx_map);
		kfree(hctx->ctxs);
		blk_mq_free_rq_map(hctx);
		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
		if (q->mq_ops->exit_hctx)
			q->mq_ops->exit_hctx(hctx, i);
		q->mq_ops->free_hctx(hctx, i);
	}

	free_percpu(q->queue_ctx);
	kfree(q->queue_hw_ctx);
	kfree(q->mq_map);

	q->queue_ctx = NULL;
	q->queue_hw_ctx = NULL;
	q->mq_map = NULL;

	mutex_lock(&all_q_mutex);
	list_del_init(&q->all_q_node);
	mutex_unlock(&all_q_mutex);
}
EXPORT_SYMBOL(blk_mq_free_queue);

/* Basically redo blk_mq_init_queue with queue frozen */
static void __cpuinit blk_mq_queue_reinit(struct request_queue *q)
{
	blk_mq_freeze_queue(q);

	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);

	/*
	 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
	 * we should change hctx numa_node according to the new topology (this
	 * involves freeing and re-allocating memory, is it worth doing?)
	 */
	blk_mq_map_swqueue(q);

	blk_mq_unfreeze_queue(q);
}

static int __cpuinit blk_mq_queue_reinit_notify(struct notifier_block *nb,
						unsigned long action, void *hcpu)
{
	struct request_queue *q;

	/*
	 * Before a new mapping is established, a hotadded cpu might already
	 * have started handling requests. This doesn't break anything, as we
	 * map offline CPUs to the first hardware queue. We will re-init the
	 * queue below to get optimal settings.
	 */
	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
	    action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
		return NOTIFY_OK;

	mutex_lock(&all_q_mutex);
	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_mq_queue_reinit(q);
	mutex_unlock(&all_q_mutex);
	return NOTIFY_OK;
}

static int __init blk_mq_init(void)
{
	unsigned int i;

	for_each_possible_cpu(i)
		init_llist_head(&per_cpu(ipi_lists, i));

	blk_mq_cpu_init();

	/* Must be called after percpu_counter_hotcpu_callback() */
	hotcpu_notifier(blk_mq_queue_reinit_notify, -10);

	return 0;
}
subsys_initcall(blk_mq_init);