blk-cgroup.c

/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                    Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                    Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include "blk-cgroup.h"
#include "blk.h"

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

/* List of groups pending per cpu stats allocation */
static DEFINE_SPINLOCK(alloc_list_lock);
static LIST_HEAD(alloc_list);

static void blkio_stat_alloc_fn(struct work_struct *);
static DECLARE_DELAYED_WORK(blkio_stat_alloc_work, blkio_stat_alloc_fn);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

static struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
{
	return container_of(task_subsys_state(tsk, blkio_subsys_id),
			    struct blkio_cgroup, css);
}

struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
{
	if (bio && bio->bi_css)
		return container_of(bio->bi_css, struct blkio_cgroup, css);
	return task_blkio_cgroup(current);
}
EXPORT_SYMBOL_GPL(bio_blkio_cgroup);
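/*
 * Illustrative usage sketch (not part of this file): a submission-path
 * caller resolving the blkcg for a bio would typically do so inside an
 * RCU read-side critical section, since the css returned via
 * task_subsys_state() is only guaranteed to stay alive under RCU unless
 * a reference is taken:
 *
 *	rcu_read_lock();
 *	blkcg = bio_blkio_cgroup(bio);
 *	... use blkcg, e.g. blkg_lookup(blkcg, q) ...
 *	rcu_read_unlock();
 */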

/*
 * Worker for allocating per cpu stat for blk groups. This is scheduled on
 * the system_nrt_wq once there are some groups on the alloc_list waiting
 * for allocation.
 */
static void blkio_stat_alloc_fn(struct work_struct *work)
{
	static void *pcpu_stats[BLKIO_NR_POLICIES];
	struct delayed_work *dwork = to_delayed_work(work);
	struct blkio_group *blkg;
	int i;
	bool empty = false;

alloc_stats:
	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		if (pcpu_stats[i] != NULL)
			continue;

		pcpu_stats[i] = alloc_percpu(struct blkio_group_stats_cpu);

		/* Allocation failed. Try again after some time. */
		if (pcpu_stats[i] == NULL) {
			queue_delayed_work(system_nrt_wq, dwork,
					   msecs_to_jiffies(10));
			return;
		}
	}

	spin_lock_irq(&blkio_list_lock);
	spin_lock(&alloc_list_lock);

	/* cgroup got deleted or queue exited. */
	if (!list_empty(&alloc_list)) {
		blkg = list_first_entry(&alloc_list, struct blkio_group,
					alloc_node);
		for (i = 0; i < BLKIO_NR_POLICIES; i++) {
			struct blkg_policy_data *pd = blkg->pd[i];

			if (blkio_policy[i] && pd && !pd->stats_cpu)
				swap(pd->stats_cpu, pcpu_stats[i]);
		}
		list_del_init(&blkg->alloc_node);
	}

	empty = list_empty(&alloc_list);

	spin_unlock(&alloc_list_lock);
	spin_unlock_irq(&blkio_list_lock);

	if (!empty)
		goto alloc_stats;
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free
 *
 * Free @blkg which may be partially allocated.
 */
static void blkg_free(struct blkio_group *blkg)
{
	int i;

	if (!blkg)
		return;

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkg_policy_data *pd = blkg->pd[i];

		if (pd) {
			free_percpu(pd->stats_cpu);
			kfree(pd);
		}
	}

	kfree(blkg);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @q: request_queue the new blkg is associated with
 *
 * Allocate a new blkg associating @blkcg and @q.
 */
static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
				      struct request_queue *q)
{
	struct blkio_group *blkg;
	int i;

	/* alloc and init base part */
	blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
	if (!blkg)
		return NULL;

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
	INIT_LIST_HEAD(&blkg->alloc_node);
	blkg->blkcg = blkcg;
	blkg->refcnt = 1;
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));

	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];
		struct blkg_policy_data *pd;

		if (!pol)
			continue;

		/* alloc per-policy data and attach it to blkg */
		pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC,
				  q->node);
		if (!pd) {
			blkg_free(blkg);
			return NULL;
		}

		blkg->pd[i] = pd;
		pd->blkg = blkg;
	}

	/* invoke per-policy init */
	for (i = 0; i < BLKIO_NR_POLICIES; i++) {
		struct blkio_policy_type *pol = blkio_policy[i];

		if (pol)
			pol->ops.blkio_init_group_fn(blkg);
	}

	return blkg;
}

struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
				       struct request_queue *q,
				       bool for_root)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct blkio_group *blkg;

	WARN_ON_ONCE(!rcu_read_lock_held());
	lockdep_assert_held(q->queue_lock);

	/*
	 * This could be the first entry point of blkcg implementation and
	 * we shouldn't allow anything to go through for a bypassing queue.
	 * The following can be removed if blkg lookup is guaranteed to
	 * fail on a bypassing queue.
	 */
	if (unlikely(blk_queue_bypass(q)) && !for_root)
		return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);

	blkg = blkg_lookup(blkcg, q);
	if (blkg)
		return blkg;

	/* blkg holds a reference to blkcg */
	if (!css_tryget(&blkcg->css))
		return ERR_PTR(-EINVAL);

	/* allocate and initialize */
	blkg = blkg_alloc(blkcg, q);

	/* did alloc fail? */
	if (unlikely(!blkg)) {
		blkg = ERR_PTR(-ENOMEM);
		goto out;
	}

	/* insert */
	spin_lock(&blkcg->lock);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	list_add(&blkg->q_node, &q->blkg_list);
	spin_unlock(&blkcg->lock);

	spin_lock(&alloc_list_lock);
	list_add(&blkg->alloc_node, &alloc_list);
	/* Queue per cpu stat allocation from worker thread. */
	queue_delayed_work(system_nrt_wq, &blkio_stat_alloc_work, 0);
	spin_unlock(&alloc_list_lock);
out:
	return blkg;
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);
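
/*
 * Illustrative caller pattern (a sketch, not part of this file), mirroring
 * what blkg_conf_prep() below does: lookup/create must happen under both
 * rcu_read_lock() and the queue lock, and -EBUSY from a bypassing queue is
 * a retryable condition:
 *
 *	rcu_read_lock();
 *	spin_lock_irq(q->queue_lock);
 *	blkg = blkg_lookup_create(blkcg, q, false);
 *	spin_unlock_irq(q->queue_lock);
 *	rcu_read_unlock();
 *	if (IS_ERR(blkg))
 *		return PTR_ERR(blkg);	(-EBUSY means the caller may retry)
 */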

/* called under rcu_read_lock(). */
struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
				struct request_queue *q)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->q == q)
			return blkg;
	return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup);

static void blkg_destroy(struct blkio_group *blkg)
{
	struct request_queue *q = blkg->q;
	struct blkio_cgroup *blkcg = blkg->blkcg;

	lockdep_assert_held(q->queue_lock);
	lockdep_assert_held(&blkcg->lock);

	/* Something wrong if we are trying to remove same group twice */
	WARN_ON_ONCE(list_empty(&blkg->q_node));
	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
	list_del_init(&blkg->q_node);
	hlist_del_init_rcu(&blkg->blkcg_node);

	spin_lock(&alloc_list_lock);
	list_del_init(&blkg->alloc_node);
	spin_unlock(&alloc_list_lock);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	blkg_put(blkg);
}

/*
 * XXX: This updates blkg policy data in-place for root blkg, which is
 * necessary across elevator switch and policy registration as root blkgs
 * aren't shot down. This broken and racy implementation is temporary.
 * Eventually, blkg shoot down will be replaced by proper in-place update.
 */
void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid)
{
	struct blkio_policy_type *pol = blkio_policy[plid];
	struct blkio_group *blkg = blkg_lookup(&blkio_root_cgroup, q);
	struct blkg_policy_data *pd;

	if (!blkg)
		return;

	kfree(blkg->pd[plid]);
	blkg->pd[plid] = NULL;

	if (!pol)
		return;

	pd = kzalloc(sizeof(*pd) + pol->pdata_size, GFP_KERNEL);
	WARN_ON_ONCE(!pd);

	pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
	WARN_ON_ONCE(!pd->stats_cpu);

	blkg->pd[plid] = pd;
	pd->blkg = blkg;
	pol->ops.blkio_init_group_fn(blkg);
}
EXPORT_SYMBOL_GPL(update_root_blkg_pd);

/**
 * blkg_destroy_all - destroy all blkgs associated with a request_queue
 * @q: request_queue of interest
 * @destroy_root: whether to destroy root blkg or not
 *
 * Destroy blkgs associated with @q. If @destroy_root is %true, all are
 * destroyed; otherwise, root blkg is left alone.
 */
void blkg_destroy_all(struct request_queue *q, bool destroy_root)
{
	struct blkio_group *blkg, *n;

	spin_lock_irq(q->queue_lock);

	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct blkio_cgroup *blkcg = blkg->blkcg;

		/* skip root? */
		if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
			continue;

		spin_lock(&blkcg->lock);
		blkg_destroy(blkg);
		spin_unlock(&blkcg->lock);
	}

	spin_unlock_irq(q->queue_lock);
}
EXPORT_SYMBOL_GPL(blkg_destroy_all);

static void blkg_rcu_free(struct rcu_head *rcu_head)
{
	blkg_free(container_of(rcu_head, struct blkio_group, rcu_head));
}

void __blkg_release(struct blkio_group *blkg)
{
	/* release the extra blkcg reference this blkg has been holding */
	css_put(&blkg->blkcg->css);

	/*
	 * A group is freed in rcu manner. But having an rcu lock does not
	 * mean that one can access all the fields of blkg and assume these
	 * are valid. For example, don't try to follow throtl_data and
	 * request queue links.
	 *
	 * Having a reference to blkg under an rcu allows access only to
	 * values local to groups like group stats and group rate limits.
	 */
	call_rcu(&blkg->rcu_head, blkg_rcu_free);
}
EXPORT_SYMBOL_GPL(__blkg_release);
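
/*
 * Illustrative reader sketch (not part of this file): given the
 * refcnt-based lifetime plus RCU-deferred freeing above, a lockless
 * reader may safely touch only group-local fields such as stats:
 *
 *	rcu_read_lock();
 *	blkg = blkg_lookup(blkcg, q);
 *	if (blkg)
 *		v = blkg_stat_read(&blkg->pd[plid]->stats.time);
 *	rcu_read_unlock();
 *
 * Following blkg->q beyond this window would be unsafe, as noted in
 * __blkg_release() above.
 */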

static void blkio_reset_stats_cpu(struct blkio_group *blkg, int plid)
{
	struct blkg_policy_data *pd = blkg->pd[plid];
	int cpu;

	if (pd->stats_cpu == NULL)
		return;

	for_each_possible_cpu(cpu) {
		struct blkio_group_stats_cpu *sc =
			per_cpu_ptr(pd->stats_cpu, cpu);

		blkg_rwstat_reset(&sc->service_bytes);
		blkg_rwstat_reset(&sc->serviced);
	}
}

static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	struct blkio_group *blkg;
	struct hlist_node *n;

	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);

	/*
	 * Note that stat reset is racy - it doesn't synchronize against
	 * stat updates. This is a debug feature which shouldn't exist
	 * anyway. If you get hit by a race, retry.
	 */
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		struct blkio_policy_type *pol;

		list_for_each_entry(pol, &blkio_list, list) {
			struct blkg_policy_data *pd = blkg->pd[pol->plid];
			struct blkio_group_stats *stats = &pd->stats;

			/* queued stats shouldn't be cleared */
			blkg_rwstat_reset(&stats->service_bytes);
			blkg_rwstat_reset(&stats->serviced);
			blkg_rwstat_reset(&stats->merged);
			blkg_rwstat_reset(&stats->service_time);
			blkg_rwstat_reset(&stats->wait_time);
			blkg_stat_reset(&stats->time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
			blkg_stat_reset(&stats->unaccounted_time);
			blkg_stat_reset(&stats->avg_queue_size_sum);
			blkg_stat_reset(&stats->avg_queue_size_samples);
			blkg_stat_reset(&stats->dequeue);
			blkg_stat_reset(&stats->group_wait_time);
			blkg_stat_reset(&stats->idle_time);
			blkg_stat_reset(&stats->empty_time);
#endif
			blkio_reset_stats_cpu(blkg, pol->plid);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}

static const char *blkg_dev_name(struct blkio_group *blkg)
{
	/* some drivers (floppy) instantiate a queue w/o disk registered */
	if (blkg->q->backing_dev_info.dev)
		return dev_name(blkg->q->backing_dev_info.dev);
	return NULL;
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists. @prfill is invoked with @sf, the
 * policy data and @data. If @show_total is %true, the sum of the return
 * values from @prfill is printed with "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
		       u64 (*prfill)(struct seq_file *,
				     struct blkg_policy_data *, int),
		       int pol, int data, bool show_total)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	u64 total = 0;

	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->pd[pol])
			total += prfill(sf, blkg->pd[pol], data);
	spin_unlock_irq(&blkcg->lock);

	if (show_total)
		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname)
		return 0;

	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
	return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/**
 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 * @sf: seq_file to print to
 * @pd: policy data of interest
 * @rwstat: rwstat to print
 *
 * Print @rwstat to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
			 const struct blkg_rwstat *rwstat)
{
	static const char *rwstr[] = {
		[BLKG_RWSTAT_READ]	= "Read",
		[BLKG_RWSTAT_WRITE]	= "Write",
		[BLKG_RWSTAT_SYNC]	= "Sync",
		[BLKG_RWSTAT_ASYNC]	= "Async",
	};
	const char *dname = blkg_dev_name(pd->blkg);
	u64 v;
	int i;

	if (!dname)
		return 0;

	for (i = 0; i < BLKG_RWSTAT_NR; i++)
		seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
			   (unsigned long long)rwstat->cnt[i]);

	v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
	return v;
}

static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd,
			    int off)
{
	return __blkg_prfill_u64(sf, pd,
				 blkg_stat_read((void *)&pd->stats + off));
}

static u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
			      int off)
{
	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)&pd->stats + off);

	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

/* print blkg_stat specified by BLKCG_STAT_PRIV() */
int blkcg_print_stat(struct cgroup *cgrp, struct cftype *cft,
		     struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat,
			  BLKCG_STAT_POL(cft->private),
			  BLKCG_STAT_OFF(cft->private), false);
	return 0;
}
EXPORT_SYMBOL_GPL(blkcg_print_stat);

/* print blkg_rwstat specified by BLKCG_STAT_PRIV() */
int blkcg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
		       struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat,
			  BLKCG_STAT_POL(cft->private),
			  BLKCG_STAT_OFF(cft->private), true);
	return 0;
}
EXPORT_SYMBOL_GPL(blkcg_print_rwstat);
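
/*
 * Illustrative wiring sketch (an assumption about a policy's cftype
 * table, not code from this file): a policy exposes a stat file by
 * packing its policy id and the stat's offset into cft->private with
 * BLKCG_STAT_PRIV() and pointing read_seq_string at one of the printers
 * above, e.g.:
 *
 *	{
 *		.name = "time",
 *		.private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
 *				offsetof(struct blkio_group_stats, time)),
 *		.read_seq_string = blkcg_print_stat,
 *	},
 */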

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @input: input string
 * @ctx: blkg_conf_ctx to be filled
 *
 * Parse per-blkg config update from @input and initialize @ctx with the
 * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new
 * value. This function returns with RCU read locked and must be paired
 * with blkg_conf_finish().
 */
int blkg_conf_prep(struct blkio_cgroup *blkcg, const char *input,
		   struct blkg_conf_ctx *ctx)
	__acquires(rcu)
{
	struct gendisk *disk;
	struct blkio_group *blkg;
	unsigned int major, minor;
	unsigned long long v;
	int part, ret;

	if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
		return -EINVAL;

	disk = get_gendisk(MKDEV(major, minor), &part);
	if (!disk || part)
		return -EINVAL;

	rcu_read_lock();

	spin_lock_irq(disk->queue->queue_lock);
	blkg = blkg_lookup_create(blkcg, disk->queue, false);
	spin_unlock_irq(disk->queue->queue_lock);

	if (IS_ERR(blkg)) {
		ret = PTR_ERR(blkg);
		rcu_read_unlock();
		put_disk(disk);
		/*
		 * If queue was bypassing, we should retry. Do so after a
		 * short msleep(). It isn't strictly necessary but queue
		 * can be bypassing for some time and it's always nice to
		 * avoid busy looping.
		 */
		if (ret == -EBUSY) {
			msleep(10);
			ret = restart_syscall();
		}
		return ret;
	}

	ctx->disk = disk;
	ctx->blkg = blkg;
	ctx->v = v;
	return 0;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_finish - finish up per-blkg config update
 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
 *
 * Finish up after per-blkg config update. This function must be paired
 * with blkg_conf_prep().
 */
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
	__releases(rcu)
{
	rcu_read_unlock();
	put_disk(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
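
/*
 * Illustrative config-writer sketch (not part of this file): a policy's
 * write handler pairs the two helpers above, with all per-blkg updates
 * done between them while the RCU read lock is held:
 *
 *	struct blkg_conf_ctx ctx;
 *	int ret;
 *
 *	ret = blkg_conf_prep(blkcg, buf, &ctx);
 *	if (ret)
 *		return ret;
 *	... apply ctx.v to ctx.blkg's policy data ...
 *	blkg_conf_finish(&ctx);
 *	return 0;
 */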

struct cftype blkio_files[] = {
	{
		.name = "reset_stats",
		.write_u64 = blkiocg_reset_stats,
	},
	{ }	/* terminate */
};

/**
 * blkiocg_pre_destroy - cgroup pre_destroy callback
 * @cgroup: cgroup of interest
 *
 * This function is called when @cgroup is about to go away and responsible
 * for shooting down all blkgs associated with @cgroup. blkgs should be
 * removed while holding both q and blkcg locks. As blkcg lock is nested
 * inside q lock, this function performs reverse double lock dancing.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static int blkiocg_pre_destroy(struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	spin_lock_irq(&blkcg->lock);

	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
						struct blkio_group, blkcg_node);
		struct request_queue *q = blkg->q;

		if (spin_trylock(q->queue_lock)) {
			blkg_destroy(blkg);
			spin_unlock(q->queue_lock);
		} else {
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock_irq(&blkcg->lock);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	return 0;
}

static void blkiocg_destroy(struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);

	if (blkcg != &blkio_root_cgroup)
		kfree(blkcg);
}

static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup)
{
	static atomic64_t id_seq = ATOMIC64_INIT(0);
	struct blkio_cgroup *blkcg;
	struct cgroup *parent = cgroup->parent;

	if (!parent) {
		blkcg = &blkio_root_cgroup;
		goto done;
	}

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
	blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
done:
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);

	return &blkcg->css;
}

/**
 * blkcg_init_queue - initialize blkcg part of request queue
 * @q: request_queue to initialize
 *
 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
 * part of new request_queue @q.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int blkcg_init_queue(struct request_queue *q)
{
	int ret;

	might_sleep();

	ret = blk_throtl_init(q);
	if (ret)
		return ret;

	mutex_lock(&all_q_mutex);
	INIT_LIST_HEAD(&q->all_q_node);
	list_add_tail(&q->all_q_node, &all_q_list);
	mutex_unlock(&all_q_mutex);

	return 0;
}

/**
 * blkcg_drain_queue - drain blkcg part of request_queue
 * @q: request_queue to drain
 *
 * Called from blk_drain_queue(). Responsible for draining blkcg part.
 */
void blkcg_drain_queue(struct request_queue *q)
{
	lockdep_assert_held(q->queue_lock);

	blk_throtl_drain(q);
}

/**
 * blkcg_exit_queue - exit and release blkcg part of request_queue
 * @q: request_queue being released
 *
 * Called from blk_release_queue(). Responsible for exiting blkcg part.
 */
void blkcg_exit_queue(struct request_queue *q)
{
	mutex_lock(&all_q_mutex);
	list_del_init(&q->all_q_node);
	mutex_unlock(&all_q_mutex);

	blkg_destroy_all(q, true);

	blk_throtl_exit(q);
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures. For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	cgroup_taskset_for_each(task, cgrp, tset) {
		task_lock(task);
		ioc = task->io_context;
		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);
		if (ret)
			break;
	}
	return ret;
}

static void blkcg_bypass_start(void)
	__acquires(&all_q_mutex)
{
	struct request_queue *q;

	mutex_lock(&all_q_mutex);

	list_for_each_entry(q, &all_q_list, all_q_node) {
		blk_queue_bypass_start(q);
		blkg_destroy_all(q, false);
	}
}

static void blkcg_bypass_end(void)
	__releases(&all_q_mutex)
{
	struct request_queue *q;

	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_queue_bypass_end(q);

	mutex_unlock(&all_q_mutex);
}

struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.create = blkiocg_create,
	.can_attach = blkiocg_can_attach,
	.pre_destroy = blkiocg_pre_destroy,
	.destroy = blkiocg_destroy,
	.subsys_id = blkio_subsys_id,
	.base_cftypes = blkio_files,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

void blkio_policy_register(struct blkio_policy_type *blkiop)
{
	struct request_queue *q;

	blkcg_bypass_start();
	spin_lock(&blkio_list_lock);

	BUG_ON(blkio_policy[blkiop->plid]);
	blkio_policy[blkiop->plid] = blkiop;
	list_add_tail(&blkiop->list, &blkio_list);

	spin_unlock(&blkio_list_lock);
	list_for_each_entry(q, &all_q_list, all_q_node)
		update_root_blkg_pd(q, blkiop->plid);
	blkcg_bypass_end();

	if (blkiop->cftypes)
		WARN_ON(cgroup_add_cftypes(&blkio_subsys, blkiop->cftypes));
}
EXPORT_SYMBOL_GPL(blkio_policy_register);

void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
	struct request_queue *q;

	if (blkiop->cftypes)
		cgroup_rm_cftypes(&blkio_subsys, blkiop->cftypes);

	blkcg_bypass_start();
	spin_lock(&blkio_list_lock);

	BUG_ON(blkio_policy[blkiop->plid] != blkiop);
	blkio_policy[blkiop->plid] = NULL;
	list_del_init(&blkiop->list);

	spin_unlock(&blkio_list_lock);
	list_for_each_entry(q, &all_q_list, all_q_node)
		update_root_blkg_pd(q, blkiop->plid);
	blkcg_bypass_end();
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);
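
/*
 * Illustrative registration sketch (field values are assumptions for the
 * sake of example, not code from this file): a policy module fills in a
 * blkio_policy_type with its ops, policy id, per-blkg data size and
 * cftypes, then registers it from module init and unregisters on exit:
 *
 *	static struct blkio_policy_type blkio_policy_foo = {
 *		.ops = {
 *			.blkio_init_group_fn = foo_init_blkio_group,
 *		},
 *		.plid = BLKIO_POLICY_PROP,
 *		.pdata_size = sizeof(struct foo_group_data),
 *		.cftypes = foo_blkcg_files,
 *	};
 *
 *	blkio_policy_register(&blkio_policy_foo);
 *	...
 *	blkio_policy_unregister(&blkio_policy_foo);
 */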