blk-cgroup.c

  1. /*
  2. * Common Block IO controller cgroup interface
  3. *
  4. * Based on ideas and code from CFQ, CFS and BFQ:
  5. * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
  6. *
  7. * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
  8. * Paolo Valente <paolo.valente@unimore.it>
  9. *
  10. * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
  11. * Nauman Rafique <nauman@google.com>
  12. */
  13. #include <linux/ioprio.h>
  14. #include <linux/kdev_t.h>
  15. #include <linux/module.h>
  16. #include <linux/err.h>
  17. #include <linux/blkdev.h>
  18. #include <linux/slab.h>
  19. #include <linux/genhd.h>
  20. #include <linux/delay.h>
  21. #include <linux/atomic.h>
  22. #include "blk-cgroup.h"
  23. #include "blk.h"
  24. #define MAX_KEY_LEN 100
  25. static DEFINE_SPINLOCK(blkio_list_lock);
  26. static LIST_HEAD(blkio_list);
  27. static DEFINE_MUTEX(all_q_mutex);
  28. static LIST_HEAD(all_q_list);
  29. /* List of groups pending per cpu stats allocation */
  30. static DEFINE_SPINLOCK(alloc_list_lock);
  31. static LIST_HEAD(alloc_list);
  32. static void blkio_stat_alloc_fn(struct work_struct *);
  33. static DECLARE_DELAYED_WORK(blkio_stat_alloc_work, blkio_stat_alloc_fn);
  34. struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
  35. EXPORT_SYMBOL_GPL(blkio_root_cgroup);
  36. static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];
  37. struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
  38. {
  39. return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
  40. struct blkio_cgroup, css);
  41. }
  42. EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
  43. static struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
  44. {
  45. return container_of(task_subsys_state(tsk, blkio_subsys_id),
  46. struct blkio_cgroup, css);
  47. }
  48. struct blkio_cgroup *bio_blkio_cgroup(struct bio *bio)
  49. {
  50. if (bio && bio->bi_css)
  51. return container_of(bio->bi_css, struct blkio_cgroup, css);
  52. return task_blkio_cgroup(current);
  53. }
  54. EXPORT_SYMBOL_GPL(bio_blkio_cgroup);
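/*
 * Illustrative sketch (editor addition, not part of the original file):
 * resolving the blkcg that owns a bio on the submission path. The lookup is
 * done under rcu, mirroring how blkg_conf_prep() below holds rcu around its
 * lookups; example_bio_blkcg_weight() is a hypothetical helper.
 */
static unsigned int example_bio_blkcg_weight(struct bio *bio)
{
	struct blkio_cgroup *blkcg;
	unsigned int weight;

	rcu_read_lock();
	blkcg = bio_blkio_cgroup(bio);	/* falls back to current's blkcg */
	weight = blkcg->weight;
	rcu_read_unlock();

	return weight;
}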
  55. static inline void blkio_update_group_weight(struct blkio_group *blkg,
  56. int plid, unsigned int weight)
  57. {
  58. struct blkio_policy_type *blkiop;
  59. list_for_each_entry(blkiop, &blkio_list, list) {
  60. /* If this policy does not own the blkg, do not send updates */
  61. if (blkiop->plid != plid)
  62. continue;
  63. if (blkiop->ops.blkio_update_group_weight_fn)
  64. blkiop->ops.blkio_update_group_weight_fn(blkg->q,
  65. blkg, weight);
  66. }
  67. }
  68. static inline void blkio_update_group_bps(struct blkio_group *blkg, int plid,
  69. u64 bps, int rw)
  70. {
  71. struct blkio_policy_type *blkiop;
  72. list_for_each_entry(blkiop, &blkio_list, list) {
  73. /* If this policy does not own the blkg, do not send updates */
  74. if (blkiop->plid != plid)
  75. continue;
  76. if (rw == READ && blkiop->ops.blkio_update_group_read_bps_fn)
  77. blkiop->ops.blkio_update_group_read_bps_fn(blkg->q,
  78. blkg, bps);
  79. if (rw == WRITE && blkiop->ops.blkio_update_group_write_bps_fn)
  80. blkiop->ops.blkio_update_group_write_bps_fn(blkg->q,
  81. blkg, bps);
  82. }
  83. }
  84. static inline void blkio_update_group_iops(struct blkio_group *blkg, int plid,
  85. u64 iops, int rw)
  86. {
  87. struct blkio_policy_type *blkiop;
  88. list_for_each_entry(blkiop, &blkio_list, list) {
  89. /* If this policy does not own the blkg, do not send updates */
  90. if (blkiop->plid != plid)
  91. continue;
  92. if (rw == READ && blkiop->ops.blkio_update_group_read_iops_fn)
  93. blkiop->ops.blkio_update_group_read_iops_fn(blkg->q,
  94. blkg, iops);
  95. if (rw == WRITE && blkiop->ops.blkio_update_group_write_iops_fn)
  96. blkiop->ops.blkio_update_group_write_iops_fn(blkg->q,
  97. blkg, iops);
  98. }
  99. }
  100. #ifdef CONFIG_DEBUG_BLK_CGROUP
  101. /* This should be called with the queue_lock held. */
  102. static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
  103. struct blkio_policy_type *pol,
  104. struct blkio_group *curr_blkg)
  105. {
  106. struct blkg_policy_data *pd = blkg->pd[pol->plid];
  107. if (blkio_blkg_waiting(&pd->stats))
  108. return;
  109. if (blkg == curr_blkg)
  110. return;
  111. pd->stats.start_group_wait_time = sched_clock();
  112. blkio_mark_blkg_waiting(&pd->stats);
  113. }
  114. /* This should be called with the queue_lock held. */
  115. static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
  116. {
  117. unsigned long long now;
  118. if (!blkio_blkg_waiting(stats))
  119. return;
  120. now = sched_clock();
  121. if (time_after64(now, stats->start_group_wait_time))
  122. blkg_stat_add(&stats->group_wait_time,
  123. now - stats->start_group_wait_time);
  124. blkio_clear_blkg_waiting(stats);
  125. }
  126. /* This should be called with the queue_lock held. */
  127. static void blkio_end_empty_time(struct blkio_group_stats *stats)
  128. {
  129. unsigned long long now;
  130. if (!blkio_blkg_empty(stats))
  131. return;
  132. now = sched_clock();
  133. if (time_after64(now, stats->start_empty_time))
  134. blkg_stat_add(&stats->empty_time,
  135. now - stats->start_empty_time);
  136. blkio_clear_blkg_empty(stats);
  137. }
  138. void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg,
  139. struct blkio_policy_type *pol)
  140. {
  141. struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
  142. lockdep_assert_held(blkg->q->queue_lock);
  143. BUG_ON(blkio_blkg_idling(stats));
  144. stats->start_idle_time = sched_clock();
  145. blkio_mark_blkg_idling(stats);
  146. }
  147. EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
  148. void blkiocg_update_idle_time_stats(struct blkio_group *blkg,
  149. struct blkio_policy_type *pol)
  150. {
  151. struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
  152. lockdep_assert_held(blkg->q->queue_lock);
  153. if (blkio_blkg_idling(stats)) {
  154. unsigned long long now = sched_clock();
  155. if (time_after64(now, stats->start_idle_time))
  156. blkg_stat_add(&stats->idle_time,
  157. now - stats->start_idle_time);
  158. blkio_clear_blkg_idling(stats);
  159. }
  160. }
  161. EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
  162. void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg,
  163. struct blkio_policy_type *pol)
  164. {
  165. struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
  166. lockdep_assert_held(blkg->q->queue_lock);
  167. blkg_stat_add(&stats->avg_queue_size_sum,
  168. blkg_rwstat_sum(&stats->queued));
  169. blkg_stat_add(&stats->avg_queue_size_samples, 1);
  170. blkio_update_group_wait_time(stats);
  171. }
  172. EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
  173. void blkiocg_set_start_empty_time(struct blkio_group *blkg,
  174. struct blkio_policy_type *pol)
  175. {
  176. struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
  177. lockdep_assert_held(blkg->q->queue_lock);
  178. if (blkg_rwstat_sum(&stats->queued))
  179. return;
  180. /*
  181. * The group is already marked empty. This can happen if a cfqq got a new
  182. * request in the parent group and moved to this group while being added
  183. * to service tree. Just ignore the event and move on.
  184. */
  185. if (blkio_blkg_empty(stats))
  186. return;
  187. stats->start_empty_time = sched_clock();
  188. blkio_mark_blkg_empty(stats);
  189. }
  190. EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
  191. void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
  192. struct blkio_policy_type *pol,
  193. unsigned long dequeue)
  194. {
  195. struct blkg_policy_data *pd = blkg->pd[pol->plid];
  196. lockdep_assert_held(blkg->q->queue_lock);
  197. blkg_stat_add(&pd->stats.dequeue, dequeue);
  198. }
  199. EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
  200. #else
  201. static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
  202. struct blkio_policy_type *pol,
  203. struct blkio_group *curr_blkg) { }
  204. static inline void blkio_end_empty_time(struct blkio_group_stats *stats) { }
  205. #endif
  206. void blkiocg_update_io_add_stats(struct blkio_group *blkg,
  207. struct blkio_policy_type *pol,
  208. struct blkio_group *curr_blkg, bool direction,
  209. bool sync)
  210. {
  211. struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
  212. int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
  213. lockdep_assert_held(blkg->q->queue_lock);
  214. blkg_rwstat_add(&stats->queued, rw, 1);
  215. blkio_end_empty_time(stats);
  216. blkio_set_start_group_wait_time(blkg, pol, curr_blkg);
  217. }
  218. EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
  219. void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
  220. struct blkio_policy_type *pol,
  221. bool direction, bool sync)
  222. {
  223. struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
  224. int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
  225. lockdep_assert_held(blkg->q->queue_lock);
  226. blkg_rwstat_add(&stats->queued, rw, -1);
  227. }
  228. EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
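/*
 * Illustrative sketch (editor addition, not part of the original file):
 * bracketing a request's stay on a policy's internal queue with the add and
 * remove helpers above, roughly as CFQ does on insert and dispatch. The
 * example_* wrappers are hypothetical; both must run under the queue lock,
 * which the helpers assert.
 */
static void example_charge_enqueue(struct blkio_group *blkg,
				   struct blkio_policy_type *pol,
				   struct blkio_group *curr_blkg,
				   struct request *rq)
{
	blkiocg_update_io_add_stats(blkg, pol, curr_blkg,
				    rq_data_dir(rq), rq_is_sync(rq));
}

static void example_charge_dequeue(struct blkio_group *blkg,
				   struct blkio_policy_type *pol,
				   struct request *rq)
{
	blkiocg_update_io_remove_stats(blkg, pol,
				       rq_data_dir(rq), rq_is_sync(rq));
}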
  229. void blkiocg_update_timeslice_used(struct blkio_group *blkg,
  230. struct blkio_policy_type *pol,
  231. unsigned long time,
  232. unsigned long unaccounted_time)
  233. {
  234. struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
  235. lockdep_assert_held(blkg->q->queue_lock);
  236. blkg_stat_add(&stats->time, time);
  237. #ifdef CONFIG_DEBUG_BLK_CGROUP
  238. blkg_stat_add(&stats->unaccounted_time, unaccounted_time);
  239. #endif
  240. }
  241. EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
  242. /*
  243. * should be called under rcu read lock or queue lock to make sure blkg pointer
  244. * is valid.
  245. */
  246. void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
  247. struct blkio_policy_type *pol,
  248. uint64_t bytes, bool direction, bool sync)
  249. {
  250. int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
  251. struct blkg_policy_data *pd = blkg->pd[pol->plid];
  252. struct blkio_group_stats_cpu *stats_cpu;
  253. unsigned long flags;
  254. /* If per cpu stats are not allocated yet, don't do any accounting. */
  255. if (pd->stats_cpu == NULL)
  256. return;
  257. /*
  258. * Disabling interrupts to provide mutual exclusion between two
  259. * writes on same cpu. It probably is not needed for 64bit. Not
  260. * optimizing that case yet.
  261. */
  262. local_irq_save(flags);
  263. stats_cpu = this_cpu_ptr(pd->stats_cpu);
  264. blkg_stat_add(&stats_cpu->sectors, bytes >> 9);
  265. blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
  266. blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
  267. local_irq_restore(flags);
  268. }
  269. EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
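/*
 * Illustrative sketch (editor addition, not part of the original file): how a
 * policy might account a dispatched request with the helper above. The
 * example_* wrapper is hypothetical; blk_rq_bytes(), rq_data_dir() and
 * rq_is_sync() are the usual blkdev.h accessors.
 */
static void example_charge_dispatch(struct blkio_group *blkg,
				    struct blkio_policy_type *pol,
				    struct request *rq)
{
	/* caller holds rcu or the queue lock, as required by the comment above */
	blkiocg_update_dispatch_stats(blkg, pol, blk_rq_bytes(rq),
				      rq_data_dir(rq), rq_is_sync(rq));
}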
  270. void blkiocg_update_completion_stats(struct blkio_group *blkg,
  271. struct blkio_policy_type *pol,
  272. uint64_t start_time,
  273. uint64_t io_start_time, bool direction,
  274. bool sync)
  275. {
  276. struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
  277. unsigned long long now = sched_clock();
  278. int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
  279. lockdep_assert_held(blkg->q->queue_lock);
  280. if (time_after64(now, io_start_time))
  281. blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
  282. if (time_after64(io_start_time, start_time))
  283. blkg_rwstat_add(&stats->wait_time, rw,
  284. io_start_time - start_time);
  285. }
  286. EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
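/*
 * Illustrative sketch (editor addition, not part of the original file):
 * charging service and wait time at completion, assuming the request's
 * timestamps are read with the rq_start_time_ns()/rq_io_start_time_ns()
 * accessors from blkdev.h. The example_* wrapper is hypothetical.
 */
static void example_charge_completion(struct blkio_group *blkg,
				      struct blkio_policy_type *pol,
				      struct request *rq)
{
	blkiocg_update_completion_stats(blkg, pol, rq_start_time_ns(rq),
					rq_io_start_time_ns(rq),
					rq_data_dir(rq), rq_is_sync(rq));
}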
  287. /* Merged stats are not per cpu; updates are serialized by the queue lock. */
  288. void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
  289. struct blkio_policy_type *pol,
  290. bool direction, bool sync)
  291. {
  292. struct blkio_group_stats *stats = &blkg->pd[pol->plid]->stats;
  293. int rw = (direction ? REQ_WRITE : 0) | (sync ? REQ_SYNC : 0);
  294. lockdep_assert_held(blkg->q->queue_lock);
  295. blkg_rwstat_add(&stats->merged, rw, 1);
  296. }
  297. EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
  298. /*
  299. * Worker for allocating per cpu stat for blk groups. This is scheduled on
  300. * the system_nrt_wq once there are some groups on the alloc_list waiting
  301. * for allocation.
  302. */
  303. static void blkio_stat_alloc_fn(struct work_struct *work)
  304. {
  305. static void *pcpu_stats[BLKIO_NR_POLICIES];
  306. struct delayed_work *dwork = to_delayed_work(work);
  307. struct blkio_group *blkg;
  308. int i;
  309. bool empty = false;
  310. alloc_stats:
  311. for (i = 0; i < BLKIO_NR_POLICIES; i++) {
  312. if (pcpu_stats[i] != NULL)
  313. continue;
  314. pcpu_stats[i] = alloc_percpu(struct blkio_group_stats_cpu);
  315. /* Allocation failed. Try again after some time. */
  316. if (pcpu_stats[i] == NULL) {
  317. queue_delayed_work(system_nrt_wq, dwork,
  318. msecs_to_jiffies(10));
  319. return;
  320. }
  321. }
  322. spin_lock_irq(&blkio_list_lock);
  323. spin_lock(&alloc_list_lock);
  324. /* cgroup got deleted or queue exited. */
  325. if (!list_empty(&alloc_list)) {
  326. blkg = list_first_entry(&alloc_list, struct blkio_group,
  327. alloc_node);
  328. for (i = 0; i < BLKIO_NR_POLICIES; i++) {
  329. struct blkg_policy_data *pd = blkg->pd[i];
  330. if (blkio_policy[i] && pd && !pd->stats_cpu)
  331. swap(pd->stats_cpu, pcpu_stats[i]);
  332. }
  333. list_del_init(&blkg->alloc_node);
  334. }
  335. empty = list_empty(&alloc_list);
  336. spin_unlock(&alloc_list_lock);
  337. spin_unlock_irq(&blkio_list_lock);
  338. if (!empty)
  339. goto alloc_stats;
  340. }
  341. /**
  342. * blkg_free - free a blkg
  343. * @blkg: blkg to free
  344. *
  345. * Free @blkg which may be partially allocated.
  346. */
  347. static void blkg_free(struct blkio_group *blkg)
  348. {
  349. int i;
  350. if (!blkg)
  351. return;
  352. for (i = 0; i < BLKIO_NR_POLICIES; i++) {
  353. struct blkg_policy_data *pd = blkg->pd[i];
  354. if (pd) {
  355. free_percpu(pd->stats_cpu);
  356. kfree(pd);
  357. }
  358. }
  359. kfree(blkg);
  360. }
  361. /**
  362. * blkg_alloc - allocate a blkg
  363. * @blkcg: block cgroup the new blkg is associated with
  364. * @q: request_queue the new blkg is associated with
  365. *
  366. * Allocate a new blkg associating @blkcg and @q.
  367. */
  368. static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
  369. struct request_queue *q)
  370. {
  371. struct blkio_group *blkg;
  372. int i;
  373. /* alloc and init base part */
  374. blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
  375. if (!blkg)
  376. return NULL;
  377. blkg->q = q;
  378. INIT_LIST_HEAD(&blkg->q_node);
  379. INIT_LIST_HEAD(&blkg->alloc_node);
  380. blkg->blkcg = blkcg;
  381. blkg->refcnt = 1;
  382. cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
  383. for (i = 0; i < BLKIO_NR_POLICIES; i++) {
  384. struct blkio_policy_type *pol = blkio_policy[i];
  385. struct blkg_policy_data *pd;
  386. if (!pol)
  387. continue;
  388. /* alloc per-policy data and attach it to blkg */
  389. pd = kzalloc_node(sizeof(*pd) + pol->pdata_size, GFP_ATOMIC,
  390. q->node);
  391. if (!pd) {
  392. blkg_free(blkg);
  393. return NULL;
  394. }
  395. blkg->pd[i] = pd;
  396. pd->blkg = blkg;
  397. }
  398. /* invoke per-policy init */
  399. for (i = 0; i < BLKIO_NR_POLICIES; i++) {
  400. struct blkio_policy_type *pol = blkio_policy[i];
  401. if (pol)
  402. pol->ops.blkio_init_group_fn(blkg);
  403. }
  404. return blkg;
  405. }
  406. struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
  407. struct request_queue *q,
  408. bool for_root)
  409. __releases(q->queue_lock) __acquires(q->queue_lock)
  410. {
  411. struct blkio_group *blkg;
  412. WARN_ON_ONCE(!rcu_read_lock_held());
  413. lockdep_assert_held(q->queue_lock);
  414. /*
  415. * This could be the first entry point of blkcg implementation and
  416. * we shouldn't allow anything to go through for a bypassing queue.
  417. * The following can be removed if blkg lookup is guaranteed to
  418. * fail on a bypassing queue.
  419. */
  420. if (unlikely(blk_queue_bypass(q)) && !for_root)
  421. return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
  422. blkg = blkg_lookup(blkcg, q);
  423. if (blkg)
  424. return blkg;
  425. /* blkg holds a reference to blkcg */
  426. if (!css_tryget(&blkcg->css))
  427. return ERR_PTR(-EINVAL);
  428. /*
  429. * Allocate and initialize.
  430. */
  431. blkg = blkg_alloc(blkcg, q);
  432. /* did alloc fail? */
  433. if (unlikely(!blkg)) {
  434. blkg = ERR_PTR(-ENOMEM);
  435. goto out;
  436. }
  437. /* insert */
  438. spin_lock(&blkcg->lock);
  439. hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
  440. list_add(&blkg->q_node, &q->blkg_list);
  441. spin_unlock(&blkcg->lock);
  442. spin_lock(&alloc_list_lock);
  443. list_add(&blkg->alloc_node, &alloc_list);
  444. /* Queue per cpu stat allocation from worker thread. */
  445. queue_delayed_work(system_nrt_wq, &blkio_stat_alloc_work, 0);
  446. spin_unlock(&alloc_list_lock);
  447. out:
  448. return blkg;
  449. }
  450. EXPORT_SYMBOL_GPL(blkg_lookup_create);
  451. /* called under rcu_read_lock(). */
  452. struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
  453. struct request_queue *q)
  454. {
  455. struct blkio_group *blkg;
  456. struct hlist_node *n;
  457. hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
  458. if (blkg->q == q)
  459. return blkg;
  460. return NULL;
  461. }
  462. EXPORT_SYMBOL_GPL(blkg_lookup);
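/*
 * Illustrative sketch (editor addition, not part of the original file): the
 * locking callers need around blkg_lookup()/blkg_lookup_create(). The blkg
 * is only guaranteed to stay valid while rcu or the queue lock is held,
 * which is why blkg_conf_prep() below returns with rcu still read-locked.
 * example_blkg_exists() is a hypothetical helper.
 */
static bool example_blkg_exists(struct blkio_cgroup *blkcg,
				struct request_queue *q)
{
	struct blkio_group *blkg;
	bool ret;

	rcu_read_lock();
	spin_lock_irq(q->queue_lock);
	blkg = blkg_lookup_create(blkcg, q, false);
	ret = !IS_ERR(blkg);
	spin_unlock_irq(q->queue_lock);
	rcu_read_unlock();

	return ret;
}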
  463. static void blkg_destroy(struct blkio_group *blkg)
  464. {
  465. struct request_queue *q = blkg->q;
  466. struct blkio_cgroup *blkcg = blkg->blkcg;
  467. lockdep_assert_held(q->queue_lock);
  468. lockdep_assert_held(&blkcg->lock);
  469. /* Something is wrong if we are trying to remove the same group twice */
  470. WARN_ON_ONCE(list_empty(&blkg->q_node));
  471. WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
  472. list_del_init(&blkg->q_node);
  473. hlist_del_init_rcu(&blkg->blkcg_node);
  474. spin_lock(&alloc_list_lock);
  475. list_del_init(&blkg->alloc_node);
  476. spin_unlock(&alloc_list_lock);
  477. /*
  478. * Put the reference taken at the time of creation so that when all
  479. * queues are gone, group can be destroyed.
  480. */
  481. blkg_put(blkg);
  482. }
  483. /*
  484. * XXX: This updates blkg policy data in-place for root blkg, which is
  485. * necessary across elevator switch and policy registration as root blkgs
  486. * aren't shot down. This broken and racy implementation is temporary.
  487. * Eventually, blkg shoot down will be replaced by proper in-place update.
  488. */
  489. void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid)
  490. {
  491. struct blkio_policy_type *pol = blkio_policy[plid];
  492. struct blkio_group *blkg = blkg_lookup(&blkio_root_cgroup, q);
  493. struct blkg_policy_data *pd;
  494. if (!blkg)
  495. return;
  496. kfree(blkg->pd[plid]);
  497. blkg->pd[plid] = NULL;
  498. if (!pol)
  499. return;
  500. pd = kzalloc(sizeof(*pd) + pol->pdata_size, GFP_KERNEL);
  501. WARN_ON_ONCE(!pd);
  502. pd->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
  503. WARN_ON_ONCE(!pd->stats_cpu);
  504. blkg->pd[plid] = pd;
  505. pd->blkg = blkg;
  506. pol->ops.blkio_init_group_fn(blkg);
  507. }
  508. EXPORT_SYMBOL_GPL(update_root_blkg_pd);
  509. /**
  510. * blkg_destroy_all - destroy all blkgs associated with a request_queue
  511. * @q: request_queue of interest
  512. * @destroy_root: whether to destroy root blkg or not
  513. *
  514. * Destroy blkgs associated with @q. If @destroy_root is %true, all are
  515. * destroyed; otherwise, root blkg is left alone.
  516. */
  517. void blkg_destroy_all(struct request_queue *q, bool destroy_root)
  518. {
  519. struct blkio_group *blkg, *n;
  520. spin_lock_irq(q->queue_lock);
  521. list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
  522. struct blkio_cgroup *blkcg = blkg->blkcg;
  523. /* skip root? */
  524. if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
  525. continue;
  526. spin_lock(&blkcg->lock);
  527. blkg_destroy(blkg);
  528. spin_unlock(&blkcg->lock);
  529. }
  530. spin_unlock_irq(q->queue_lock);
  531. }
  532. EXPORT_SYMBOL_GPL(blkg_destroy_all);
  533. static void blkg_rcu_free(struct rcu_head *rcu_head)
  534. {
  535. blkg_free(container_of(rcu_head, struct blkio_group, rcu_head));
  536. }
  537. void __blkg_release(struct blkio_group *blkg)
  538. {
  539. /* release the extra blkcg reference this blkg has been holding */
  540. css_put(&blkg->blkcg->css);
  541. /*
  542. * A group is freed in an rcu manner. But holding the rcu read lock does
  543. * not mean that one can access all the fields of blkg and assume they
  544. * are valid. For example, don't try to follow throtl_data and
  545. * request queue links.
  546. *
  547. * Having a reference to blkg under rcu allows access only to values
  548. * local to the group, like group stats and group rate limits.
  549. */
  550. call_rcu(&blkg->rcu_head, blkg_rcu_free);
  551. }
  552. EXPORT_SYMBOL_GPL(__blkg_release);
  553. static void blkio_reset_stats_cpu(struct blkio_group *blkg, int plid)
  554. {
  555. struct blkg_policy_data *pd = blkg->pd[plid];
  556. int cpu;
  557. if (pd->stats_cpu == NULL)
  558. return;
  559. for_each_possible_cpu(cpu) {
  560. struct blkio_group_stats_cpu *sc =
  561. per_cpu_ptr(pd->stats_cpu, cpu);
  562. blkg_rwstat_reset(&sc->service_bytes);
  563. blkg_rwstat_reset(&sc->serviced);
  564. blkg_stat_reset(&sc->sectors);
  565. }
  566. }
  567. static int
  568. blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
  569. {
  570. struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
  571. struct blkio_group *blkg;
  572. struct hlist_node *n;
  573. spin_lock(&blkio_list_lock);
  574. spin_lock_irq(&blkcg->lock);
  575. /*
  576. * Note that stat reset is racy - it doesn't synchronize against
  577. * stat updates. This is a debug feature which shouldn't exist
  578. * anyway. If you get hit by a race, retry.
  579. */
  580. hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
  581. struct blkio_policy_type *pol;
  582. list_for_each_entry(pol, &blkio_list, list) {
  583. struct blkg_policy_data *pd = blkg->pd[pol->plid];
  584. struct blkio_group_stats *stats = &pd->stats;
  585. /* queued stats shouldn't be cleared */
  586. blkg_rwstat_reset(&stats->merged);
  587. blkg_rwstat_reset(&stats->service_time);
  588. blkg_rwstat_reset(&stats->wait_time);
  589. blkg_stat_reset(&stats->time);
  590. #ifdef CONFIG_DEBUG_BLK_CGROUP
  591. blkg_stat_reset(&stats->unaccounted_time);
  592. blkg_stat_reset(&stats->avg_queue_size_sum);
  593. blkg_stat_reset(&stats->avg_queue_size_samples);
  594. blkg_stat_reset(&stats->dequeue);
  595. blkg_stat_reset(&stats->group_wait_time);
  596. blkg_stat_reset(&stats->idle_time);
  597. blkg_stat_reset(&stats->empty_time);
  598. #endif
  599. blkio_reset_stats_cpu(blkg, pol->plid);
  600. }
  601. }
  602. spin_unlock_irq(&blkcg->lock);
  603. spin_unlock(&blkio_list_lock);
  604. return 0;
  605. }
  606. static const char *blkg_dev_name(struct blkio_group *blkg)
  607. {
  608. /* some drivers (floppy) instantiate a queue w/o disk registered */
  609. if (blkg->q->backing_dev_info.dev)
  610. return dev_name(blkg->q->backing_dev_info.dev);
  611. return NULL;
  612. }
  613. /**
  614. * blkcg_print_blkgs - helper for printing per-blkg data
  615. * @sf: seq_file to print to
  616. * @blkcg: blkcg of interest
  617. * @prfill: fill function to print out a blkg
  618. * @pol: policy in question
  619. * @data: data to be passed to @prfill
  620. * @show_total: to print out sum of prfill return values or not
  621. *
  622. * This function invokes @prfill on each blkg of @blkcg if pd for the
  623. * policy specified by @pol exists. @prfill is invoked with @sf, the
  624. * policy data and @data. If @show_total is %true, the sum of the return
  625. * values from @prfill is printed with "Total" label at the end.
  626. *
  627. * This is to be used to construct print functions for
  628. * cftype->read_seq_string method.
  629. */
  630. void blkcg_print_blkgs(struct seq_file *sf, struct blkio_cgroup *blkcg,
  631. u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int),
  632. int pol, int data, bool show_total)
  633. {
  634. struct blkio_group *blkg;
  635. struct hlist_node *n;
  636. u64 total = 0;
  637. spin_lock_irq(&blkcg->lock);
  638. hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
  639. if (blkg->pd[pol])
  640. total += prfill(sf, blkg->pd[pol], data);
  641. spin_unlock_irq(&blkcg->lock);
  642. if (show_total)
  643. seq_printf(sf, "Total %llu\n", (unsigned long long)total);
  644. }
  645. EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
  646. /**
  647. * __blkg_prfill_u64 - prfill helper for a single u64 value
  648. * @sf: seq_file to print to
  649. * @pd: policy data of interest
  650. * @v: value to print
  651. *
  652. * Print @v to @sf for the device associated with @pd.
  653. */
  654. u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
  655. {
  656. const char *dname = blkg_dev_name(pd->blkg);
  657. if (!dname)
  658. return 0;
  659. seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
  660. return v;
  661. }
  662. EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
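/*
 * Illustrative sketch (editor addition, not part of the original file): a
 * trivial prfill callback and the read_seq_string method that feeds it to
 * blkcg_print_blkgs(). The example_* names are hypothetical; the real
 * callbacks used by this file follow below.
 */
static u64 example_prfill_one(struct seq_file *sf,
			      struct blkg_policy_data *pd, int data)
{
	/* prints "<device> 1" for every blkg that has PROP policy data */
	return __blkg_prfill_u64(sf, pd, 1);
}

static int example_print_ones(struct cgroup *cgrp, struct cftype *cft,
			      struct seq_file *sf)
{
	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp), example_prfill_one,
			  BLKIO_POLICY_PROP, 0, true);
	return 0;
}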
  663. /**
  664. * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
  665. * @sf: seq_file to print to
  666. * @pd: policy data of interest
  667. * @rwstat: rwstat to print
  668. *
  669. * Print @rwstat to @sf for the device associated with @pd.
  670. */
  671. u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
  672. const struct blkg_rwstat *rwstat)
  673. {
  674. static const char *rwstr[] = {
  675. [BLKG_RWSTAT_READ] = "Read",
  676. [BLKG_RWSTAT_WRITE] = "Write",
  677. [BLKG_RWSTAT_SYNC] = "Sync",
  678. [BLKG_RWSTAT_ASYNC] = "Async",
  679. };
  680. const char *dname = blkg_dev_name(pd->blkg);
  681. u64 v;
  682. int i;
  683. if (!dname)
  684. return 0;
  685. for (i = 0; i < BLKG_RWSTAT_NR; i++)
  686. seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
  687. (unsigned long long)rwstat->cnt[i]);
  688. v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
  689. seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
  690. return v;
  691. }
  692. static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd,
  693. int off)
  694. {
  695. return __blkg_prfill_u64(sf, pd,
  696. blkg_stat_read((void *)&pd->stats + off));
  697. }
  698. static u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
  699. int off)
  700. {
  701. struct blkg_rwstat rwstat = blkg_rwstat_read((void *)&pd->stats + off);
  702. return __blkg_prfill_rwstat(sf, pd, &rwstat);
  703. }
  704. /* print blkg_stat specified by BLKCG_STAT_PRIV() */
  705. int blkcg_print_stat(struct cgroup *cgrp, struct cftype *cft,
  706. struct seq_file *sf)
  707. {
  708. struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
  709. blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat,
  710. BLKCG_STAT_POL(cft->private),
  711. BLKCG_STAT_OFF(cft->private), false);
  712. return 0;
  713. }
  714. EXPORT_SYMBOL_GPL(blkcg_print_stat);
  715. /* print blkg_rwstat specified by BLKCG_STAT_PRIV() */
  716. int blkcg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
  717. struct seq_file *sf)
  718. {
  719. struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
  720. blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat,
  721. BLKCG_STAT_POL(cft->private),
  722. BLKCG_STAT_OFF(cft->private), true);
  723. return 0;
  724. }
  725. EXPORT_SYMBOL_GPL(blkcg_print_rwstat);
  726. static u64 blkg_prfill_cpu_stat(struct seq_file *sf,
  727. struct blkg_policy_data *pd, int off)
  728. {
  729. u64 v = 0;
  730. int cpu;
  731. for_each_possible_cpu(cpu) {
  732. struct blkio_group_stats_cpu *sc =
  733. per_cpu_ptr(pd->stats_cpu, cpu);
  734. v += blkg_stat_read((void *)sc + off);
  735. }
  736. return __blkg_prfill_u64(sf, pd, v);
  737. }
  738. static u64 blkg_prfill_cpu_rwstat(struct seq_file *sf,
  739. struct blkg_policy_data *pd, int off)
  740. {
  741. struct blkg_rwstat rwstat = { }, tmp;
  742. int i, cpu;
  743. for_each_possible_cpu(cpu) {
  744. struct blkio_group_stats_cpu *sc =
  745. per_cpu_ptr(pd->stats_cpu, cpu);
  746. tmp = blkg_rwstat_read((void *)sc + off);
  747. for (i = 0; i < BLKG_RWSTAT_NR; i++)
  748. rwstat.cnt[i] += tmp.cnt[i];
  749. }
  750. return __blkg_prfill_rwstat(sf, pd, &rwstat);
  751. }
  752. /* print per-cpu blkg_stat specified by BLKCG_STAT_PRIV() */
  753. int blkcg_print_cpu_stat(struct cgroup *cgrp, struct cftype *cft,
  754. struct seq_file *sf)
  755. {
  756. struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
  757. blkcg_print_blkgs(sf, blkcg, blkg_prfill_cpu_stat,
  758. BLKCG_STAT_POL(cft->private),
  759. BLKCG_STAT_OFF(cft->private), false);
  760. return 0;
  761. }
  762. EXPORT_SYMBOL_GPL(blkcg_print_cpu_stat);
  763. /* print per-cpu blkg_rwstat specified by BLKCG_STAT_PRIV() */
  764. int blkcg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
  765. struct seq_file *sf)
  766. {
  767. struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
  768. blkcg_print_blkgs(sf, blkcg, blkg_prfill_cpu_rwstat,
  769. BLKCG_STAT_POL(cft->private),
  770. BLKCG_STAT_OFF(cft->private), true);
  771. return 0;
  772. }
  773. EXPORT_SYMBOL_GPL(blkcg_print_cpu_rwstat);
  774. #ifdef CONFIG_DEBUG_BLK_CGROUP
  775. static u64 blkg_prfill_avg_queue_size(struct seq_file *sf,
  776. struct blkg_policy_data *pd, int off)
  777. {
  778. u64 samples = blkg_stat_read(&pd->stats.avg_queue_size_samples);
  779. u64 v = 0;
  780. if (samples) {
  781. v = blkg_stat_read(&pd->stats.avg_queue_size_sum);
  782. do_div(v, samples);
  783. }
  784. __blkg_prfill_u64(sf, pd, v);
  785. return 0;
  786. }
  787. /* print avg_queue_size */
  788. static int blkcg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
  789. struct seq_file *sf)
  790. {
  791. struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
  792. blkcg_print_blkgs(sf, blkcg, blkg_prfill_avg_queue_size,
  793. BLKIO_POLICY_PROP, 0, false);
  794. return 0;
  795. }
  796. #endif /* CONFIG_DEBUG_BLK_CGROUP */
  797. /**
  798. * blkg_conf_prep - parse and prepare for per-blkg config update
  799. * @blkcg: target block cgroup
  800. * @input: input string
  801. * @ctx: blkg_conf_ctx to be filled
  802. *
  803. * Parse per-blkg config update from @input and initialize @ctx with the
  804. * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new
  805. * value. This function returns with RCU read locked and must be paired
  806. * with blkg_conf_finish().
  807. */
  808. int blkg_conf_prep(struct blkio_cgroup *blkcg, const char *input,
  809. struct blkg_conf_ctx *ctx)
  810. __acquires(rcu)
  811. {
  812. struct gendisk *disk;
  813. struct blkio_group *blkg;
  814. unsigned int major, minor;
  815. unsigned long long v;
  816. int part, ret;
  817. if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
  818. return -EINVAL;
  819. disk = get_gendisk(MKDEV(major, minor), &part);
  820. if (!disk || part)
  821. return -EINVAL;
  822. rcu_read_lock();
  823. spin_lock_irq(disk->queue->queue_lock);
  824. blkg = blkg_lookup_create(blkcg, disk->queue, false);
  825. spin_unlock_irq(disk->queue->queue_lock);
  826. if (IS_ERR(blkg)) {
  827. ret = PTR_ERR(blkg);
  828. rcu_read_unlock();
  829. put_disk(disk);
  830. /*
  831. * If queue was bypassing, we should retry. Do so after a
  832. * short msleep(). It isn't strictly necessary but queue
  833. * can be bypassing for some time and it's always nice to
  834. * avoid busy looping.
  835. */
  836. if (ret == -EBUSY) {
  837. msleep(10);
  838. ret = restart_syscall();
  839. }
  840. return ret;
  841. }
  842. ctx->disk = disk;
  843. ctx->blkg = blkg;
  844. ctx->v = v;
  845. return 0;
  846. }
  847. EXPORT_SYMBOL_GPL(blkg_conf_prep);
  848. /**
  849. * blkg_conf_finish - finish up per-blkg config update
  850. * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
  851. *
  852. * Finish up after per-blkg config update. This function must be paired
  853. * with blkg_conf_prep().
  854. */
  855. void blkg_conf_finish(struct blkg_conf_ctx *ctx)
  856. __releases(rcu)
  857. {
  858. rcu_read_unlock();
  859. put_disk(ctx->disk);
  860. }
  861. EXPORT_SYMBOL_GPL(blkg_conf_finish);
  862. /* for propio conf */
  863. static u64 blkg_prfill_weight_device(struct seq_file *sf,
  864. struct blkg_policy_data *pd, int off)
  865. {
  866. if (!pd->conf.weight)
  867. return 0;
  868. return __blkg_prfill_u64(sf, pd, pd->conf.weight);
  869. }
  870. static int blkcg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
  871. struct seq_file *sf)
  872. {
  873. blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp),
  874. blkg_prfill_weight_device, BLKIO_POLICY_PROP, 0,
  875. false);
  876. return 0;
  877. }
  878. static int blkcg_print_weight(struct cgroup *cgrp, struct cftype *cft,
  879. struct seq_file *sf)
  880. {
  881. seq_printf(sf, "%u\n", cgroup_to_blkio_cgroup(cgrp)->weight);
  882. return 0;
  883. }
  884. static int blkcg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
  885. const char *buf)
  886. {
  887. struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
  888. struct blkg_policy_data *pd;
  889. struct blkg_conf_ctx ctx;
  890. int ret;
  891. ret = blkg_conf_prep(blkcg, buf, &ctx);
  892. if (ret)
  893. return ret;
  894. ret = -EINVAL;
  895. pd = ctx.blkg->pd[BLKIO_POLICY_PROP];
  896. if (pd && (!ctx.v || (ctx.v >= BLKIO_WEIGHT_MIN &&
  897. ctx.v <= BLKIO_WEIGHT_MAX))) {
  898. pd->conf.weight = ctx.v;
  899. blkio_update_group_weight(ctx.blkg, BLKIO_POLICY_PROP,
  900. ctx.v ?: blkcg->weight);
  901. ret = 0;
  902. }
  903. blkg_conf_finish(&ctx);
  904. return ret;
  905. }
  906. static int blkcg_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
  907. {
  908. struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
  909. struct blkio_group *blkg;
  910. struct hlist_node *n;
  911. if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
  912. return -EINVAL;
  913. spin_lock(&blkio_list_lock);
  914. spin_lock_irq(&blkcg->lock);
  915. blkcg->weight = (unsigned int)val;
  916. hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
  917. struct blkg_policy_data *pd = blkg->pd[BLKIO_POLICY_PROP];
  918. if (pd && !pd->conf.weight)
  919. blkio_update_group_weight(blkg, BLKIO_POLICY_PROP,
  920. blkcg->weight);
  921. }
  922. spin_unlock_irq(&blkcg->lock);
  923. spin_unlock(&blkio_list_lock);
  924. return 0;
  925. }
  926. /* for blk-throttle conf */
  927. #ifdef CONFIG_BLK_DEV_THROTTLING
  928. static u64 blkg_prfill_conf_u64(struct seq_file *sf,
  929. struct blkg_policy_data *pd, int off)
  930. {
  931. u64 v = *(u64 *)((void *)&pd->conf + off);
  932. if (!v)
  933. return 0;
  934. return __blkg_prfill_u64(sf, pd, v);
  935. }
  936. static int blkcg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
  937. struct seq_file *sf)
  938. {
  939. blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp),
  940. blkg_prfill_conf_u64, BLKIO_POLICY_THROTL,
  941. cft->private, false);
  942. return 0;
  943. }
  944. static int blkcg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
  945. const char *buf, int rw,
  946. void (*update)(struct blkio_group *, int, u64, int))
  947. {
  948. struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
  949. struct blkg_policy_data *pd;
  950. struct blkg_conf_ctx ctx;
  951. int ret;
  952. ret = blkg_conf_prep(blkcg, buf, &ctx);
  953. if (ret)
  954. return ret;
  955. ret = -EINVAL;
  956. pd = ctx.blkg->pd[BLKIO_POLICY_THROTL];
  957. if (pd) {
  958. *(u64 *)((void *)&pd->conf + cft->private) = ctx.v;
  959. update(ctx.blkg, BLKIO_POLICY_THROTL, ctx.v ?: -1, rw);
  960. ret = 0;
  961. }
  962. blkg_conf_finish(&ctx);
  963. return ret;
  964. }
  965. static int blkcg_set_conf_bps_r(struct cgroup *cgrp, struct cftype *cft,
  966. const char *buf)
  967. {
  968. return blkcg_set_conf_u64(cgrp, cft, buf, READ, blkio_update_group_bps);
  969. }
  970. static int blkcg_set_conf_bps_w(struct cgroup *cgrp, struct cftype *cft,
  971. const char *buf)
  972. {
  973. return blkcg_set_conf_u64(cgrp, cft, buf, WRITE, blkio_update_group_bps);
  974. }
  975. static int blkcg_set_conf_iops_r(struct cgroup *cgrp, struct cftype *cft,
  976. const char *buf)
  977. {
  978. return blkcg_set_conf_u64(cgrp, cft, buf, READ, blkio_update_group_iops);
  979. }
  980. static int blkcg_set_conf_iops_w(struct cgroup *cgrp, struct cftype *cft,
  981. const char *buf)
  982. {
  983. return blkcg_set_conf_u64(cgrp, cft, buf, WRITE, blkio_update_group_iops);
  984. }
  985. #endif
  986. struct cftype blkio_files[] = {
  987. {
  988. .name = "weight_device",
  989. .read_seq_string = blkcg_print_weight_device,
  990. .write_string = blkcg_set_weight_device,
  991. .max_write_len = 256,
  992. },
  993. {
  994. .name = "weight",
  995. .read_seq_string = blkcg_print_weight,
  996. .write_u64 = blkcg_set_weight,
  997. },
  998. {
  999. .name = "time",
  1000. .private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
  1001. offsetof(struct blkio_group_stats, time)),
  1002. .read_seq_string = blkcg_print_stat,
  1003. },
  1004. {
  1005. .name = "sectors",
  1006. .private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
  1007. offsetof(struct blkio_group_stats_cpu, sectors)),
  1008. .read_seq_string = blkcg_print_cpu_stat,
  1009. },
  1010. {
  1011. .name = "io_service_bytes",
  1012. .private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
  1013. offsetof(struct blkio_group_stats_cpu, service_bytes)),
  1014. .read_seq_string = blkcg_print_cpu_rwstat,
  1015. },
  1016. {
  1017. .name = "io_serviced",
  1018. .private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
  1019. offsetof(struct blkio_group_stats_cpu, serviced)),
  1020. .read_seq_string = blkcg_print_cpu_rwstat,
  1021. },
  1022. {
  1023. .name = "io_service_time",
  1024. .private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
  1025. offsetof(struct blkio_group_stats, service_time)),
  1026. .read_seq_string = blkcg_print_rwstat,
  1027. },
  1028. {
  1029. .name = "io_wait_time",
  1030. .private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
  1031. offsetof(struct blkio_group_stats, wait_time)),
  1032. .read_seq_string = blkcg_print_rwstat,
  1033. },
  1034. {
  1035. .name = "io_merged",
  1036. .private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
  1037. offsetof(struct blkio_group_stats, merged)),
  1038. .read_seq_string = blkcg_print_rwstat,
  1039. },
  1040. {
  1041. .name = "io_queued",
  1042. .private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
  1043. offsetof(struct blkio_group_stats, queued)),
  1044. .read_seq_string = blkcg_print_rwstat,
  1045. },
  1046. {
  1047. .name = "reset_stats",
  1048. .write_u64 = blkiocg_reset_stats,
  1049. },
  1050. #ifdef CONFIG_BLK_DEV_THROTTLING
  1051. {
  1052. .name = "throttle.read_bps_device",
  1053. .private = offsetof(struct blkio_group_conf, bps[READ]),
  1054. .read_seq_string = blkcg_print_conf_u64,
  1055. .write_string = blkcg_set_conf_bps_r,
  1056. .max_write_len = 256,
  1057. },
  1058. {
  1059. .name = "throttle.write_bps_device",
  1060. .private = offsetof(struct blkio_group_conf, bps[WRITE]),
  1061. .read_seq_string = blkcg_print_conf_u64,
  1062. .write_string = blkcg_set_conf_bps_w,
  1063. .max_write_len = 256,
  1064. },
  1065. {
  1066. .name = "throttle.read_iops_device",
  1067. .private = offsetof(struct blkio_group_conf, iops[READ]),
  1068. .read_seq_string = blkcg_print_conf_u64,
  1069. .write_string = blkcg_set_conf_iops_r,
  1070. .max_write_len = 256,
  1071. },
  1072. {
  1073. .name = "throttle.write_iops_device",
  1074. .private = offsetof(struct blkio_group_conf, iops[WRITE]),
  1075. .read_seq_string = blkcg_print_conf_u64,
  1076. .write_string = blkcg_set_conf_iops_w,
  1077. .max_write_len = 256,
  1078. },
  1079. {
  1080. .name = "throttle.io_service_bytes",
  1081. .private = BLKCG_STAT_PRIV(BLKIO_POLICY_THROTL,
  1082. offsetof(struct blkio_group_stats_cpu, service_bytes)),
  1083. .read_seq_string = blkcg_print_cpu_rwstat,
  1084. },
  1085. {
  1086. .name = "throttle.io_serviced",
  1087. .private = BLKCG_STAT_PRIV(BLKIO_POLICY_THROTL,
  1088. offsetof(struct blkio_group_stats_cpu, serviced)),
  1089. .read_seq_string = blkcg_print_cpu_rwstat,
  1090. },
  1091. #endif /* CONFIG_BLK_DEV_THROTTLING */
  1092. #ifdef CONFIG_DEBUG_BLK_CGROUP
  1093. {
  1094. .name = "avg_queue_size",
  1095. .read_seq_string = blkcg_print_avg_queue_size,
  1096. },
  1097. {
  1098. .name = "group_wait_time",
  1099. .private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
  1100. offsetof(struct blkio_group_stats, group_wait_time)),
  1101. .read_seq_string = blkcg_print_stat,
  1102. },
  1103. {
  1104. .name = "idle_time",
  1105. .private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
  1106. offsetof(struct blkio_group_stats, idle_time)),
  1107. .read_seq_string = blkcg_print_stat,
  1108. },
  1109. {
  1110. .name = "empty_time",
  1111. .private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
  1112. offsetof(struct blkio_group_stats, empty_time)),
  1113. .read_seq_string = blkcg_print_stat,
  1114. },
  1115. {
  1116. .name = "dequeue",
  1117. .private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
  1118. offsetof(struct blkio_group_stats, dequeue)),
  1119. .read_seq_string = blkcg_print_stat,
  1120. },
  1121. {
  1122. .name = "unaccounted_time",
  1123. .private = BLKCG_STAT_PRIV(BLKIO_POLICY_PROP,
  1124. offsetof(struct blkio_group_stats, unaccounted_time)),
  1125. .read_seq_string = blkcg_print_stat,
  1126. },
  1127. #endif
  1128. { } /* terminate */
  1129. };
  1130. /**
  1131. * blkiocg_pre_destroy - cgroup pre_destroy callback
  1132. * @cgroup: cgroup of interest
  1133. *
  1134. * This function is called when @cgroup is about to go away and is responsible
  1135. * for shooting down all blkgs associated with @cgroup. blkgs should be
  1136. * removed while holding both q and blkcg locks. As blkcg lock is nested
  1137. * inside q lock, this function performs reverse double lock dancing.
  1138. *
  1139. * This is the blkcg counterpart of ioc_release_fn().
  1140. */
  1141. static int blkiocg_pre_destroy(struct cgroup *cgroup)
  1142. {
  1143. struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
  1144. spin_lock_irq(&blkcg->lock);
  1145. while (!hlist_empty(&blkcg->blkg_list)) {
  1146. struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
  1147. struct blkio_group, blkcg_node);
  1148. struct request_queue *q = blkg->q;
  1149. if (spin_trylock(q->queue_lock)) {
  1150. blkg_destroy(blkg);
  1151. spin_unlock(q->queue_lock);
  1152. } else {
  1153. spin_unlock_irq(&blkcg->lock);
  1154. cpu_relax();
  1155. spin_lock_irq(&blkcg->lock);
  1156. }
  1157. }
  1158. spin_unlock_irq(&blkcg->lock);
  1159. return 0;
  1160. }
  1161. static void blkiocg_destroy(struct cgroup *cgroup)
  1162. {
  1163. struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
  1164. if (blkcg != &blkio_root_cgroup)
  1165. kfree(blkcg);
  1166. }
  1167. static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup)
  1168. {
  1169. static atomic64_t id_seq = ATOMIC64_INIT(0);
  1170. struct blkio_cgroup *blkcg;
  1171. struct cgroup *parent = cgroup->parent;
  1172. if (!parent) {
  1173. blkcg = &blkio_root_cgroup;
  1174. goto done;
  1175. }
  1176. blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
  1177. if (!blkcg)
  1178. return ERR_PTR(-ENOMEM);
  1179. blkcg->weight = BLKIO_WEIGHT_DEFAULT;
  1180. blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
  1181. done:
  1182. spin_lock_init(&blkcg->lock);
  1183. INIT_HLIST_HEAD(&blkcg->blkg_list);
  1184. return &blkcg->css;
  1185. }
  1186. /**
  1187. * blkcg_init_queue - initialize blkcg part of request queue
  1188. * @q: request_queue to initialize
  1189. *
  1190. * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
  1191. * part of new request_queue @q.
  1192. *
  1193. * RETURNS:
  1194. * 0 on success, -errno on failure.
  1195. */
  1196. int blkcg_init_queue(struct request_queue *q)
  1197. {
  1198. int ret;
  1199. might_sleep();
  1200. ret = blk_throtl_init(q);
  1201. if (ret)
  1202. return ret;
  1203. mutex_lock(&all_q_mutex);
  1204. INIT_LIST_HEAD(&q->all_q_node);
  1205. list_add_tail(&q->all_q_node, &all_q_list);
  1206. mutex_unlock(&all_q_mutex);
  1207. return 0;
  1208. }
  1209. /**
  1210. * blkcg_drain_queue - drain blkcg part of request_queue
  1211. * @q: request_queue to drain
  1212. *
  1213. * Called from blk_drain_queue(). Responsible for draining blkcg part.
  1214. */
  1215. void blkcg_drain_queue(struct request_queue *q)
  1216. {
  1217. lockdep_assert_held(q->queue_lock);
  1218. blk_throtl_drain(q);
  1219. }
  1220. /**
  1221. * blkcg_exit_queue - exit and release blkcg part of request_queue
  1222. * @q: request_queue being released
  1223. *
  1224. * Called from blk_release_queue(). Responsible for exiting blkcg part.
  1225. */
  1226. void blkcg_exit_queue(struct request_queue *q)
  1227. {
  1228. mutex_lock(&all_q_mutex);
  1229. list_del_init(&q->all_q_node);
  1230. mutex_unlock(&all_q_mutex);
  1231. blkg_destroy_all(q, true);
  1232. blk_throtl_exit(q);
  1233. }
  1234. /*
  1235. * We cannot support shared io contexts, as we have no means to support
  1236. * two tasks with the same ioc in two different groups without major rework
  1237. * of the main cic data structures. For now we allow a task to change
  1238. * its cgroup only if it's the only owner of its ioc.
  1239. */
  1240. static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
  1241. {
  1242. struct task_struct *task;
  1243. struct io_context *ioc;
  1244. int ret = 0;
  1245. /* task_lock() is needed to avoid races with exit_io_context() */
  1246. cgroup_taskset_for_each(task, cgrp, tset) {
  1247. task_lock(task);
  1248. ioc = task->io_context;
  1249. if (ioc && atomic_read(&ioc->nr_tasks) > 1)
  1250. ret = -EINVAL;
  1251. task_unlock(task);
  1252. if (ret)
  1253. break;
  1254. }
  1255. return ret;
  1256. }
  1257. static void blkcg_bypass_start(void)
  1258. __acquires(&all_q_mutex)
  1259. {
  1260. struct request_queue *q;
  1261. mutex_lock(&all_q_mutex);
  1262. list_for_each_entry(q, &all_q_list, all_q_node) {
  1263. blk_queue_bypass_start(q);
  1264. blkg_destroy_all(q, false);
  1265. }
  1266. }
  1267. static void blkcg_bypass_end(void)
  1268. __releases(&all_q_mutex)
  1269. {
  1270. struct request_queue *q;
  1271. list_for_each_entry(q, &all_q_list, all_q_node)
  1272. blk_queue_bypass_end(q);
  1273. mutex_unlock(&all_q_mutex);
  1274. }
  1275. struct cgroup_subsys blkio_subsys = {
  1276. .name = "blkio",
  1277. .create = blkiocg_create,
  1278. .can_attach = blkiocg_can_attach,
  1279. .pre_destroy = blkiocg_pre_destroy,
  1280. .destroy = blkiocg_destroy,
  1281. .subsys_id = blkio_subsys_id,
  1282. .base_cftypes = blkio_files,
  1283. .module = THIS_MODULE,
  1284. };
  1285. EXPORT_SYMBOL_GPL(blkio_subsys);
  1286. void blkio_policy_register(struct blkio_policy_type *blkiop)
  1287. {
  1288. struct request_queue *q;
  1289. blkcg_bypass_start();
  1290. spin_lock(&blkio_list_lock);
  1291. BUG_ON(blkio_policy[blkiop->plid]);
  1292. blkio_policy[blkiop->plid] = blkiop;
  1293. list_add_tail(&blkiop->list, &blkio_list);
  1294. spin_unlock(&blkio_list_lock);
  1295. list_for_each_entry(q, &all_q_list, all_q_node)
  1296. update_root_blkg_pd(q, blkiop->plid);
  1297. blkcg_bypass_end();
  1298. if (blkiop->cftypes)
  1299. WARN_ON(cgroup_add_cftypes(&blkio_subsys, blkiop->cftypes));
  1300. }
  1301. EXPORT_SYMBOL_GPL(blkio_policy_register);
  1302. void blkio_policy_unregister(struct blkio_policy_type *blkiop)
  1303. {
  1304. struct request_queue *q;
  1305. if (blkiop->cftypes)
  1306. cgroup_rm_cftypes(&blkio_subsys, blkiop->cftypes);
  1307. blkcg_bypass_start();
  1308. spin_lock(&blkio_list_lock);
  1309. BUG_ON(blkio_policy[blkiop->plid] != blkiop);
  1310. blkio_policy[blkiop->plid] = NULL;
  1311. list_del_init(&blkiop->list);
  1312. spin_unlock(&blkio_list_lock);
  1313. list_for_each_entry(q, &all_q_list, all_q_node)
  1314. update_root_blkg_pd(q, blkiop->plid);
  1315. blkcg_bypass_end();
  1316. }
  1317. EXPORT_SYMBOL_GPL(blkio_policy_unregister);
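/*
 * Illustrative sketch (editor addition, not part of the original file): the
 * minimal shape of a policy built on the registration API above. Everything
 * named example_*, the reuse of BLKIO_POLICY_THROTL as the plid, and the
 * dummy pdata_size are placeholders, not a real policy; a real module would
 * also wire the init/exit functions up with module_init()/module_exit().
 */
static void example_init_group(struct blkio_group *blkg)
{
	/* a real policy would initialize its per-blkg data here */
}

static struct blkio_policy_type example_policy = {
	.ops = {
		.blkio_init_group_fn	= example_init_group,
	},
	.plid		= BLKIO_POLICY_THROTL,		/* placeholder slot */
	.pdata_size	= sizeof(unsigned long),	/* per-blkg payload */
};

static int __init example_policy_init(void)
{
	blkio_policy_register(&example_policy);
	return 0;
}

static void __exit example_policy_exit(void)
{
	blkio_policy_unregister(&example_policy);
}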