blk-cgroup.c

/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                    Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                    Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/seq_file.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include "blk-cgroup.h"
#include <linux/genhd.h>

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
						  struct cgroup *);
static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
			      struct task_struct *, bool);
static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
			   struct cgroup *, struct task_struct *, bool);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);

struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.create = blkiocg_create,
	.can_attach = blkiocg_can_attach,
	.attach = blkiocg_attach,
	.destroy = blkiocg_destroy,
	.populate = blkiocg_populate,
#ifdef CONFIG_BLK_CGROUP
	/* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
	.subsys_id = blkio_subsys_id,
#endif
	.use_id = 1,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);

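/* Must be called with blkcg->lock held */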
static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
					    struct blkio_policy_node *pn)
{
	list_add(&pn->node, &blkcg->policy_list);
}

/* Must be called with blkcg->lock held */
static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
{
	list_del(&pn->node);
}

/* Must be called with blkcg->lock held */
static struct blkio_policy_node *
blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev)
{
	struct blkio_policy_node *pn;

	list_for_each_entry(pn, &blkcg->policy_list, node) {
		if (pn->dev == dev)
			return pn;
	}

	return NULL;
}

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

void blkio_group_init(struct blkio_group *blkg)
{
	spin_lock_init(&blkg->stats_lock);
}
EXPORT_SYMBOL_GPL(blkio_group_init);

/*
 * Add to the appropriate stat variable depending on the request type.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
			   bool sync)
{
	if (direction)
		stat[BLKIO_STAT_WRITE] += add;
	else
		stat[BLKIO_STAT_READ] += add;
	if (sync)
		stat[BLKIO_STAT_SYNC] += add;
	else
		stat[BLKIO_STAT_ASYNC] += add;
}

/*
 * Decrements the appropriate stat variable if non-zero depending on the
 * request type. Panics on value being zero.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
{
	if (direction) {
		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
		stat[BLKIO_STAT_WRITE]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_READ] == 0);
		stat[BLKIO_STAT_READ]--;
	}
	if (sync) {
		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
		stat[BLKIO_STAT_SYNC]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
		stat[BLKIO_STAT_ASYNC]--;
	}
}

#ifdef CONFIG_DEBUG_BLK_CGROUP
/* This should be called with the blkg->stats_lock held. */
static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					    struct blkio_group *curr_blkg)
{
	if (blkio_blkg_waiting(&blkg->stats))
		return;
	if (blkg == curr_blkg)
		return;
	blkg->stats.start_group_wait_time = sched_clock();
	blkio_mark_blkg_waiting(&blkg->stats);
}

/* This should be called with the blkg->stats_lock held. */
static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		stats->group_wait_time += now - stats->start_group_wait_time;
	blkio_clear_blkg_waiting(stats);
}

/* This should be called with the blkg->stats_lock held. */
static void blkio_end_empty_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		stats->empty_time += now - stats->start_empty_time;
	blkio_clear_blkg_empty(stats);
}

void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	BUG_ON(blkio_blkg_idling(&blkg->stats));
	blkg->stats.start_idle_time = sched_clock();
	blkio_mark_blkg_idling(&blkg->stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);

void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
{
	unsigned long flags;
	unsigned long long now;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	if (blkio_blkg_idling(stats)) {
		now = sched_clock();
		if (time_after64(now, stats->start_idle_time))
			stats->idle_time += now - stats->start_idle_time;
		blkio_clear_blkg_idling(stats);
	}
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);

void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
{
	unsigned long flags;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	stats->avg_queue_size_sum +=
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
	stats->avg_queue_size_samples++;
	blkio_update_group_wait_time(stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);

void blkiocg_set_start_empty_time(struct blkio_group *blkg, bool ignore)
{
	unsigned long flags;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;

	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
		spin_unlock_irqrestore(&blkg->stats_lock, flags);
		return;
	}

	/*
	 * If ignore is set, we do not panic on the empty flag being set
	 * already. This is to avoid cases where there are superfluous
	 * timeslice complete events (e.g., forced_dispatch in CFQ) when no
	 * IOs are served, which could result in triggering the empty check
	 * incorrectly.
	 */
	BUG_ON(!ignore && blkio_blkg_empty(stats));
	stats->start_empty_time = sched_clock();
	blkio_mark_blkg_empty(stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);

void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
				  unsigned long dequeue)
{
	blkg->stats.dequeue += dequeue;
}
EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
#else
static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					struct blkio_group *curr_blkg) {}
static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
#endif

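/*
 * A request has been added to the queue of this group: bump the QUEUED
 * counters and update the empty/group-wait bookkeeping. Takes
 * blkg->stats_lock itself, so callers must not hold it.
 */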
void blkiocg_update_io_add_stats(struct blkio_group *blkg,
				 struct blkio_group *curr_blkg, bool direction,
				 bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
			sync);
	blkio_end_empty_time(&blkg->stats);
	blkio_set_start_group_wait_time(blkg, curr_blkg);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);

void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
				    bool direction, bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
					direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);

void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkg->stats.time += time;
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);

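/*
 * A request has been dispatched to the device: account the sectors and
 * bump the SERVICED / SERVICE_BYTES counters for this group.
 */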
void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
				   uint64_t bytes, bool direction, bool sync)
{
	struct blkio_group_stats *stats;
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	stats->sectors += bytes >> 9;
	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
			sync);
	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
			direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);

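/*
 * A request has completed: add the device service time (completion minus
 * dispatch) to SERVICE_TIME and the queue wait time (dispatch minus
 * submission) to WAIT_TIME for this group.
 */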
void blkiocg_update_completion_stats(struct blkio_group *blkg,
	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
{
	struct blkio_group_stats *stats;
	unsigned long flags;
	unsigned long long now = sched_clock();

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	if (time_after64(now, io_start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
				now - io_start_time, direction, sync);
	if (time_after64(io_start_time, start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
				io_start_time - start_time, direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);

void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
				    bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
			sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);

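/*
 * Insert @blkg into @blkcg's list of groups, keyed by @key, and record the
 * device number it belongs to.
 */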
void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
			     struct blkio_group *blkg, void *key, dev_t dev)
{
	unsigned long flags;

	spin_lock_irqsave(&blkcg->lock, flags);
	rcu_assign_pointer(blkg->key, key);
	blkg->blkcg_id = css_id(&blkcg->css);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	spin_unlock_irqrestore(&blkcg->lock, flags);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	/* Need to take css reference ? */
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
#endif
	blkg->dev = dev;
}
EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);

static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
{
	hlist_del_init_rcu(&blkg->blkcg_node);
	blkg->blkcg_id = 0;
}

/*
 * Returns 0 if the blkio_group was still on the cgroup list. Otherwise
 * returns 1, indicating that the blkio_group was unhashed by the time we
 * got to it.
 */
int blkiocg_del_blkio_group(struct blkio_group *blkg)
{
	struct blkio_cgroup *blkcg;
	unsigned long flags;
	struct cgroup_subsys_state *css;
	int ret = 1;

	rcu_read_lock();
	css = css_lookup(&blkio_subsys, blkg->blkcg_id);
	if (!css)
		goto out;

	blkcg = container_of(css, struct blkio_cgroup, css);
	spin_lock_irqsave(&blkcg->lock, flags);
	if (!hlist_unhashed(&blkg->blkcg_node)) {
		__blkiocg_del_blkio_group(blkg);
		ret = 0;
	}
	spin_unlock_irqrestore(&blkcg->lock, flags);
out:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);

/* called under rcu_read_lock(). */
struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	void *__key;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
		__key = blkg->key;
		if (__key == key)
			return blkg;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(blkiocg_lookup_group);

#define SHOW_FUNCTION(__VAR)						\
static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
				  struct cftype *cftype)		\
{									\
	struct blkio_cgroup *blkcg;					\
									\
	blkcg = cgroup_to_blkio_cgroup(cgroup);				\
	return (u64)blkcg->__VAR;					\
}

SHOW_FUNCTION(weight);
#undef SHOW_FUNCTION

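/*
 * Update the cgroup-wide default weight and push it to every group that
 * does not have a per-device override in the policy list.
 */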
static int
blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	struct hlist_node *n;
	struct blkio_policy_type *blkiop;
	struct blkio_policy_node *pn;

	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
		return -EINVAL;

	blkcg = cgroup_to_blkio_cgroup(cgroup);

	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);
	blkcg->weight = (unsigned int)val;

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		pn = blkio_policy_search_node(blkcg, blkg->dev);
		if (pn)
			continue;

		list_for_each_entry(blkiop, &blkio_list, list)
			blkiop->ops.blkio_update_group_weight_fn(blkg,
					blkcg->weight);
	}
	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}

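/*
 * Writing to the reset_stats file clears all accumulated statistics for
 * every group in the cgroup, while preserving the currently queued request
 * counts and (in debug builds) the idling/waiting/empty state flags.
 */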
static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	struct blkio_group_stats *stats;
	struct hlist_node *n;
	uint64_t queued[BLKIO_STAT_TOTAL];
	int i;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	bool idling, waiting, empty;
	unsigned long long now = sched_clock();
#endif

	blkcg = cgroup_to_blkio_cgroup(cgroup);
	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		spin_lock(&blkg->stats_lock);
		stats = &blkg->stats;
#ifdef CONFIG_DEBUG_BLK_CGROUP
		idling = blkio_blkg_idling(stats);
		waiting = blkio_blkg_waiting(stats);
		empty = blkio_blkg_empty(stats);
#endif
		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
			queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
		memset(stats, 0, sizeof(struct blkio_group_stats));
		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
			stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
#ifdef CONFIG_DEBUG_BLK_CGROUP
		if (idling) {
			blkio_mark_blkg_idling(stats);
			stats->start_idle_time = now;
		}
		if (waiting) {
			blkio_mark_blkg_waiting(stats);
			stats->start_group_wait_time = now;
		}
		if (empty) {
			blkio_mark_blkg_empty(stats);
			stats->start_empty_time = now;
		}
#endif
		spin_unlock(&blkg->stats_lock);
	}
	spin_unlock_irq(&blkcg->lock);
	return 0;
}

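/*
 * Format the "major:minor [Read|Write|Sync|Async|Total]" key used when
 * reporting per-device statistics. With @diskname_only, only the device
 * number is emitted.
 */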
static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
			       int chars_left, bool diskname_only)
{
	snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
	chars_left -= strlen(str);
	if (chars_left <= 0) {
		printk(KERN_WARNING
			"Possibly incorrect cgroup stat display format");
		return;
	}
	if (diskname_only)
		return;
	switch (type) {
	case BLKIO_STAT_READ:
		strlcat(str, " Read", chars_left);
		break;
	case BLKIO_STAT_WRITE:
		strlcat(str, " Write", chars_left);
		break;
	case BLKIO_STAT_SYNC:
		strlcat(str, " Sync", chars_left);
		break;
	case BLKIO_STAT_ASYNC:
		strlcat(str, " Async", chars_left);
		break;
	case BLKIO_STAT_TOTAL:
		strlcat(str, " Total", chars_left);
		break;
	default:
		strlcat(str, " Invalid", chars_left);
	}
}

static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
				struct cgroup_map_cb *cb, dev_t dev)
{
	blkio_get_key_name(0, dev, str, chars_left, true);
	cb->fill(cb, str, val);
	return val;
}

/* This should be called with blkg->stats_lock held */
static uint64_t blkio_get_stat(struct blkio_group *blkg,
		struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
{
	uint64_t disk_total;
	char key_str[MAX_KEY_LEN];
	enum stat_sub_type sub_type;

	if (type == BLKIO_STAT_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.time, cb, dev);
	if (type == BLKIO_STAT_SECTORS)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.sectors, cb, dev);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
		uint64_t sum = blkg->stats.avg_queue_size_sum;
		uint64_t samples = blkg->stats.avg_queue_size_samples;
		if (samples)
			do_div(sum, samples);
		else
			sum = 0;
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
	}
	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.group_wait_time, cb, dev);
	if (type == BLKIO_STAT_IDLE_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.idle_time, cb, dev);
	if (type == BLKIO_STAT_EMPTY_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.empty_time, cb, dev);
	if (type == BLKIO_STAT_DEQUEUE)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.dequeue, cb, dev);
#endif

	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
			sub_type++) {
		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
		cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
	}
	disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
			blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
	cb->fill(cb, key_str, disk_total);
	return disk_total;
}

#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total)		\
static int blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
		struct cftype *cftype, struct cgroup_map_cb *cb)	\
{									\
	struct blkio_cgroup *blkcg;					\
	struct blkio_group *blkg;					\
	struct hlist_node *n;						\
	uint64_t cgroup_total = 0;					\
									\
	if (!cgroup_lock_live_group(cgroup))				\
		return -ENODEV;						\
									\
	blkcg = cgroup_to_blkio_cgroup(cgroup);				\
	rcu_read_lock();						\
	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
		if (blkg->dev) {					\
			spin_lock_irq(&blkg->stats_lock);		\
			cgroup_total += blkio_get_stat(blkg, cb,	\
						blkg->dev, type);	\
			spin_unlock_irq(&blkg->stats_lock);		\
		}							\
	}								\
	if (show_total)							\
		cb->fill(cb, "Total", cgroup_total);			\
	rcu_read_unlock();						\
	cgroup_unlock();						\
	return 0;							\
}

SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
#ifdef CONFIG_DEBUG_BLK_CGROUP
SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
#endif
#undef SHOW_FUNCTION_PER_GROUP

static int blkio_check_dev_num(dev_t dev)
{
	int part = 0;
	struct gendisk *disk;

	disk = get_gendisk(dev, &part);
	if (!disk || part)
		return -ENODEV;

	return 0;
}

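/*
 * Parse a "major:minor weight" rule written to the weight_device file and
 * fill in @newpn. A weight of 0 is accepted and means "remove the
 * per-device override".
 */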
static int blkio_policy_parse_and_set(char *buf,
				      struct blkio_policy_node *newpn)
{
	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
	int ret;
	unsigned long major, minor, temp;
	int i = 0;
	dev_t dev;

	memset(s, 0, sizeof(s));

	while ((p = strsep(&buf, " ")) != NULL) {
		if (!*p)
			continue;

		s[i++] = p;

		/* Prevent the input from having too many fields */
		if (i == 3)
			break;
	}

	if (i != 2)
		return -EINVAL;

	p = strsep(&s[0], ":");
	if (p != NULL)
		major_s = p;
	else
		return -EINVAL;

	minor_s = s[0];
	if (!minor_s)
		return -EINVAL;

	ret = strict_strtoul(major_s, 10, &major);
	if (ret)
		return -EINVAL;

	ret = strict_strtoul(minor_s, 10, &minor);
	if (ret)
		return -EINVAL;

	dev = MKDEV(major, minor);

	ret = blkio_check_dev_num(dev);
	if (ret)
		return ret;

	newpn->dev = dev;

	if (s[1] == NULL)
		return -EINVAL;

	ret = strict_strtoul(s[1], 10, &temp);
	if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
	    temp > BLKIO_WEIGHT_MAX)
		return -EINVAL;

	newpn->weight = temp;

	return 0;
}

unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
			      dev_t dev)
{
	struct blkio_policy_node *pn;

	pn = blkio_policy_search_node(blkcg, dev);
	if (pn)
		return pn->weight;
	else
		return blkcg->weight;
}
EXPORT_SYMBOL_GPL(blkcg_get_weight);

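/*
 * Handle writes to the weight_device file: add, update or (for weight 0)
 * delete a per-device weight override, then propagate the effective weight
 * to the matching groups of every registered IO policy.
 */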
static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
				       const char *buffer)
{
	int ret = 0;
	char *buf;
	struct blkio_policy_node *newpn, *pn;
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	int keep_newpn = 0;
	struct hlist_node *n;
	struct blkio_policy_type *blkiop;

	buf = kstrdup(buffer, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
	if (!newpn) {
		ret = -ENOMEM;
		goto free_buf;
	}

	ret = blkio_policy_parse_and_set(buf, newpn);
	if (ret)
		goto free_newpn;

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	spin_lock_irq(&blkcg->lock);

	pn = blkio_policy_search_node(blkcg, newpn->dev);
	if (!pn) {
		if (newpn->weight != 0) {
			blkio_policy_insert_node(blkcg, newpn);
			keep_newpn = 1;
		}
		spin_unlock_irq(&blkcg->lock);
		goto update_io_group;
	}

	if (newpn->weight == 0) {
		/* weight == 0 means deleting a specific weight */
		blkio_policy_delete_node(pn);
		spin_unlock_irq(&blkcg->lock);
		goto update_io_group;
	}
	spin_unlock_irq(&blkcg->lock);

	pn->weight = newpn->weight;

update_io_group:
	/* update weight for each cfqg */
	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		if (newpn->dev == blkg->dev) {
			list_for_each_entry(blkiop, &blkio_list, list)
				blkiop->ops.blkio_update_group_weight_fn(blkg,
					 newpn->weight ?
					 newpn->weight :
					 blkcg->weight);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);

free_newpn:
	if (!keep_newpn)
		kfree(newpn);
free_buf:
	kfree(buf);
	return ret;
}

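/*
 * Show the per-device weight overrides currently configured for this
 * cgroup, one "major:minor weight" line per policy node.
 */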
static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
				      struct seq_file *m)
{
	struct blkio_cgroup *blkcg;
	struct blkio_policy_node *pn;

	seq_printf(m, "dev\tweight\n");

	blkcg = cgroup_to_blkio_cgroup(cgrp);
	if (list_empty(&blkcg->policy_list))
		goto out;

	spin_lock_irq(&blkcg->lock);
	list_for_each_entry(pn, &blkcg->policy_list, node) {
		seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
			   MINOR(pn->dev), pn->weight);
	}
	spin_unlock_irq(&blkcg->lock);
out:
	return 0;
}

struct cftype blkio_files[] = {
	{
		.name = "weight_device",
		.read_seq_string = blkiocg_weight_device_read,
		.write_string = blkiocg_weight_device_write,
		.max_write_len = 256,
	},
	{
		.name = "weight",
		.read_u64 = blkiocg_weight_read,
		.write_u64 = blkiocg_weight_write,
	},
	{
		.name = "time",
		.read_map = blkiocg_time_read,
	},
	{
		.name = "sectors",
		.read_map = blkiocg_sectors_read,
	},
	{
		.name = "io_service_bytes",
		.read_map = blkiocg_io_service_bytes_read,
	},
	{
		.name = "io_serviced",
		.read_map = blkiocg_io_serviced_read,
	},
	{
		.name = "io_service_time",
		.read_map = blkiocg_io_service_time_read,
	},
	{
		.name = "io_wait_time",
		.read_map = blkiocg_io_wait_time_read,
	},
	{
		.name = "io_merged",
		.read_map = blkiocg_io_merged_read,
	},
	{
		.name = "io_queued",
		.read_map = blkiocg_io_queued_read,
	},
	{
		.name = "reset_stats",
		.write_u64 = blkiocg_reset_stats,
	},
#ifdef CONFIG_DEBUG_BLK_CGROUP
	{
		.name = "avg_queue_size",
		.read_map = blkiocg_avg_queue_size_read,
	},
	{
		.name = "group_wait_time",
		.read_map = blkiocg_group_wait_time_read,
	},
	{
		.name = "idle_time",
		.read_map = blkiocg_idle_time_read,
	},
	{
		.name = "empty_time",
		.read_map = blkiocg_empty_time_read,
	},
	{
		.name = "dequeue",
		.read_map = blkiocg_dequeue_read,
	},
#endif
};

static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	return cgroup_add_files(cgroup, subsys, blkio_files,
				ARRAY_SIZE(blkio_files));
}

static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	unsigned long flags;
	struct blkio_group *blkg;
	void *key;
	struct blkio_policy_type *blkiop;
	struct blkio_policy_node *pn, *pntmp;

	rcu_read_lock();
remove_entry:
	spin_lock_irqsave(&blkcg->lock, flags);

	if (hlist_empty(&blkcg->blkg_list)) {
		spin_unlock_irqrestore(&blkcg->lock, flags);
		goto done;
	}

	blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
				blkcg_node);
	key = rcu_dereference(blkg->key);
	__blkiocg_del_blkio_group(blkg);

	spin_unlock_irqrestore(&blkcg->lock, flags);

	/*
	 * This blkio_group is being unlinked as the associated cgroup is
	 * going away. Let all the IO controlling policies know about this
	 * event.
	 *
	 * Currently this is a static call to one io controlling policy. Once
	 * we have more policies in place, we need some dynamic registration
	 * of callback functions.
	 */
	spin_lock(&blkio_list_lock);
	list_for_each_entry(blkiop, &blkio_list, list)
		blkiop->ops.blkio_unlink_group_fn(key, blkg);
	spin_unlock(&blkio_list_lock);
	goto remove_entry;

done:
	list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
		blkio_policy_delete_node(pn);
		kfree(pn);
	}

	free_css_id(&blkio_subsys, &blkcg->css);
	rcu_read_unlock();
	if (blkcg != &blkio_root_cgroup)
		kfree(blkcg);
}

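/*
 * Allocate and initialise a blkio_cgroup for a newly created cgroup. The
 * root cgroup uses the statically allocated blkio_root_cgroup.
 */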
static struct cgroup_subsys_state *
blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg, *parent_blkcg;

	if (!cgroup->parent) {
		blkcg = &blkio_root_cgroup;
		goto done;
	}

	/* Currently we do not support a hierarchy deeper than two levels (0,1) */
	parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
	if (css_depth(&parent_blkcg->css) > 0)
		return ERR_PTR(-EINVAL);

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
done:
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);
	INIT_LIST_HEAD(&blkcg->policy_list);
	return &blkcg->css;
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures. For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup_subsys *subsys,
			      struct cgroup *cgroup, struct task_struct *tsk,
			      bool threadgroup)
{
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	task_lock(tsk);
	ioc = tsk->io_context;
	if (ioc && atomic_read(&ioc->nr_tasks) > 1)
		ret = -EINVAL;
	task_unlock(tsk);

	return ret;
}

static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
			   struct cgroup *prev, struct task_struct *tsk,
			   bool threadgroup)
{
	struct io_context *ioc;

	task_lock(tsk);
	ioc = tsk->io_context;
	if (ioc)
		ioc->cgroup_changed = 1;
	task_unlock(tsk);
}

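/*
 * IO policies (e.g. the CFQ group scheduler) register themselves on
 * blkio_list so that weight updates and group removals can be propagated
 * to them.
 */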
void blkio_policy_register(struct blkio_policy_type *blkiop)
{
	spin_lock(&blkio_list_lock);
	list_add_tail(&blkiop->list, &blkio_list);
	spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_register);

void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
	spin_lock(&blkio_list_lock);
	list_del_init(&blkiop->list);
	spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);

static int __init init_cgroup_blkio(void)
{
	return cgroup_load_subsys(&blkio_subsys);
}

static void __exit exit_cgroup_blkio(void)
{
	cgroup_unload_subsys(&blkio_subsys);
}

module_init(init_cgroup_blkio);
module_exit(exit_cgroup_blkio);
MODULE_LICENSE("GPL");