blk-cgroup.c

/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *		      Nauman Rafique <nauman@google.com>
 */
#include <linux/ioprio.h>
#include <linux/seq_file.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include "blk-cgroup.h"
#include <linux/genhd.h>

#define MAX_KEY_LEN 100

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
						  struct cgroup *);
static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
			      struct task_struct *, bool);
static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
			   struct cgroup *, struct task_struct *, bool);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);

struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.create = blkiocg_create,
	.can_attach = blkiocg_can_attach,
	.attach = blkiocg_attach,
	.destroy = blkiocg_destroy,
	.populate = blkiocg_populate,
#ifdef CONFIG_BLK_CGROUP
	/* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
	.subsys_id = blkio_subsys_id,
#endif
	.use_id = 1,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);
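/*
 * Per-device weight overrides are kept on blkcg->policy_list as
 * struct blkio_policy_node entries; the helpers below add, delete and
 * search that list.  Callers manipulate the list under blkcg->lock.
 */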
static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
					    struct blkio_policy_node *pn)
{
	list_add(&pn->node, &blkcg->policy_list);
}

/* Must be called with blkcg->lock held */
static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
{
	list_del(&pn->node);
}

/* Must be called with blkcg->lock held */
static struct blkio_policy_node *
blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev)
{
	struct blkio_policy_node *pn;

	list_for_each_entry(pn, &blkcg->policy_list, node) {
		if (pn->dev == dev)
			return pn;
	}

	return NULL;
}

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);

/*
 * Add to the appropriate stat variable depending on the request type.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
			   bool sync)
{
	if (direction)
		stat[BLKIO_STAT_WRITE] += add;
	else
		stat[BLKIO_STAT_READ] += add;
	if (sync)
		stat[BLKIO_STAT_SYNC] += add;
	else
		stat[BLKIO_STAT_ASYNC] += add;
}

/*
 * Decrements the appropriate stat variable if non-zero depending on the
 * request type. Panics on value being zero.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
{
	if (direction) {
		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
		stat[BLKIO_STAT_WRITE]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_READ] == 0);
		stat[BLKIO_STAT_READ]--;
	}
	if (sync) {
		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
		stat[BLKIO_STAT_SYNC]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
		stat[BLKIO_STAT_ASYNC]--;
	}
}
#ifdef CONFIG_DEBUG_BLK_CGROUP
/* This should be called with the blkg->stats_lock held. */
static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					    struct blkio_group *curr_blkg)
{
	if (blkio_blkg_waiting(&blkg->stats))
		return;
	if (blkg == curr_blkg)
		return;
	blkg->stats.start_group_wait_time = sched_clock();
	blkio_mark_blkg_waiting(&blkg->stats);
}

/* This should be called with the blkg->stats_lock held. */
static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		stats->group_wait_time += now - stats->start_group_wait_time;
	blkio_clear_blkg_waiting(stats);
}

/* This should be called with the blkg->stats_lock held. */
static void blkio_end_empty_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		stats->empty_time += now - stats->start_empty_time;
	blkio_clear_blkg_empty(stats);
}

void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	BUG_ON(blkio_blkg_idling(&blkg->stats));
	blkg->stats.start_idle_time = sched_clock();
	blkio_mark_blkg_idling(&blkg->stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);

void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
{
	unsigned long flags;
	unsigned long long now;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	if (blkio_blkg_idling(stats)) {
		now = sched_clock();
		if (time_after64(now, stats->start_idle_time))
			stats->idle_time += now - stats->start_idle_time;
		blkio_clear_blkg_idling(stats);
	}
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);

void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
{
	unsigned long flags;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	stats->avg_queue_size_sum +=
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
			stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
	stats->avg_queue_size_samples++;
	blkio_update_group_wait_time(stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);

void blkiocg_set_start_empty_time(struct blkio_group *blkg)
{
	unsigned long flags;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;

	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
	    stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
		spin_unlock_irqrestore(&blkg->stats_lock, flags);
		return;
	}

	/*
	 * group is already marked empty. This can happen if cfqq got new
	 * request in parent group and moved to this group while being added
	 * to service tree. Just ignore the event and move on.
	 */
	if (blkio_blkg_empty(stats)) {
		spin_unlock_irqrestore(&blkg->stats_lock, flags);
		return;
	}

	stats->start_empty_time = sched_clock();
	blkio_mark_blkg_empty(stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);

void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
				  unsigned long dequeue)
{
	blkg->stats.dequeue += dequeue;
}
EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
#else
static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					struct blkio_group *curr_blkg) {}
static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
#endif
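/*
 * Accounting hooks called by the IO schedulers when requests are added
 * to or removed from a group's queue.  Adding a request also ends any
 * "empty" period and starts group-wait tracking; both are no-ops unless
 * CONFIG_DEBUG_BLK_CGROUP is enabled.
 */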
void blkiocg_update_io_add_stats(struct blkio_group *blkg,
				 struct blkio_group *curr_blkg, bool direction,
				 bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
		       sync);
	blkio_end_empty_time(&blkg->stats);
	blkio_set_start_group_wait_time(blkg, curr_blkg);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);

void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
				    bool direction, bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
				 direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);

void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkg->stats.time += time;
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
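/*
 * Called when a request is dispatched to the device: account the number
 * of sectors (bytes >> 9) and bump the SERVICED and SERVICE_BYTES
 * counters for the request's direction and sync/async class.
 */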
void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
				   uint64_t bytes, bool direction, bool sync)
{
	struct blkio_group_stats *stats;
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	stats->sectors += bytes >> 9;
	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
		       sync);
	blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
		       direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
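/*
 * On completion, "service time" is the interval from dispatch
 * (io_start_time) to now, and "wait time" is the interval from queueing
 * (start_time) to dispatch; both are attributed per direction and
 * sync/async class.
 */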
void blkiocg_update_completion_stats(struct blkio_group *blkg,
	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
{
	struct blkio_group_stats *stats;
	unsigned long flags;
	unsigned long long now = sched_clock();

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	if (time_after64(now, io_start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
			       now - io_start_time, direction, sync);
	if (time_after64(io_start_time, start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
			       io_start_time - start_time, direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);

void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
				    bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
		       sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
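/*
 * Link a blkio_group into a cgroup under blkcg->lock.  The opaque 'key'
 * is supplied by the IO policy (e.g. its per-queue data) and is what
 * blkiocg_lookup_group() later matches on.
 */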
void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
			     struct blkio_group *blkg, void *key, dev_t dev)
{
	unsigned long flags;

	spin_lock_irqsave(&blkcg->lock, flags);
	spin_lock_init(&blkg->stats_lock);
	rcu_assign_pointer(blkg->key, key);
	blkg->blkcg_id = css_id(&blkcg->css);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	spin_unlock_irqrestore(&blkcg->lock, flags);
	/* Need to take css reference ? */
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
	blkg->dev = dev;
}
EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);

static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
{
	hlist_del_init_rcu(&blkg->blkcg_node);
	blkg->blkcg_id = 0;
}

/*
 * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
 * indicating that blk_group was unhashed by the time we got to it.
 */
int blkiocg_del_blkio_group(struct blkio_group *blkg)
{
	struct blkio_cgroup *blkcg;
	unsigned long flags;
	struct cgroup_subsys_state *css;
	int ret = 1;

	rcu_read_lock();
	css = css_lookup(&blkio_subsys, blkg->blkcg_id);
	if (!css)
		goto out;

	blkcg = container_of(css, struct blkio_cgroup, css);
	spin_lock_irqsave(&blkcg->lock, flags);
	if (!hlist_unhashed(&blkg->blkcg_node)) {
		__blkiocg_del_blkio_group(blkg);
		ret = 0;
	}
	spin_unlock_irqrestore(&blkcg->lock, flags);
out:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);

/* called under rcu_read_lock(). */
struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	void *__key;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
		__key = blkg->key;
		if (__key == key)
			return blkg;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
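/*
 * cgroup file handlers.  SHOW_FUNCTION generates the read side of simple
 * per-cgroup u64 values (currently only "weight"); the corresponding
 * write handler and the per-group map readers follow.
 */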
#define SHOW_FUNCTION(__VAR)						\
static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
				  struct cftype *cftype)		\
{									\
	struct blkio_cgroup *blkcg;					\
									\
	blkcg = cgroup_to_blkio_cgroup(cgroup);				\
	return (u64)blkcg->__VAR;					\
}

SHOW_FUNCTION(weight);
#undef SHOW_FUNCTION

static int
blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	struct hlist_node *n;
	struct blkio_policy_type *blkiop;
	struct blkio_policy_node *pn;

	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
		return -EINVAL;

	blkcg = cgroup_to_blkio_cgroup(cgroup);
	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);
	blkcg->weight = (unsigned int)val;

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		pn = blkio_policy_search_node(blkcg, blkg->dev);
		if (pn)
			continue;

		list_for_each_entry(blkiop, &blkio_list, list)
			blkiop->ops.blkio_update_group_weight_fn(blkg,
								 blkcg->weight);
	}
	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}
static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	struct blkio_group_stats *stats;
	struct hlist_node *n;
	uint64_t queued[BLKIO_STAT_TOTAL];
	int i;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	bool idling, waiting, empty;
	unsigned long long now = sched_clock();
#endif

	blkcg = cgroup_to_blkio_cgroup(cgroup);
	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		spin_lock(&blkg->stats_lock);
		stats = &blkg->stats;
#ifdef CONFIG_DEBUG_BLK_CGROUP
		idling = blkio_blkg_idling(stats);
		waiting = blkio_blkg_waiting(stats);
		empty = blkio_blkg_empty(stats);
#endif
		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
			queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
		memset(stats, 0, sizeof(struct blkio_group_stats));
		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
			stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
#ifdef CONFIG_DEBUG_BLK_CGROUP
		if (idling) {
			blkio_mark_blkg_idling(stats);
			stats->start_idle_time = now;
		}
		if (waiting) {
			blkio_mark_blkg_waiting(stats);
			stats->start_group_wait_time = now;
		}
		if (empty) {
			blkio_mark_blkg_empty(stats);
			stats->start_empty_time = now;
		}
#endif
		spin_unlock(&blkg->stats_lock);
	}
	spin_unlock_irq(&blkcg->lock);
	return 0;
}
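/*
 * Stats are reported as "MAJOR:MINOR <Read|Write|Sync|Async|Total> value"
 * key/value pairs.  blkio_get_key_name() builds the key string for a
 * given sub-type; blkio_fill_stat() emits a single per-device value.
 */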
static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
			       int chars_left, bool diskname_only)
{
	snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
	chars_left -= strlen(str);
	if (chars_left <= 0) {
		printk(KERN_WARNING
			"Possibly incorrect cgroup stat display format");
		return;
	}
	if (diskname_only)
		return;
	switch (type) {
	case BLKIO_STAT_READ:
		strlcat(str, " Read", chars_left);
		break;
	case BLKIO_STAT_WRITE:
		strlcat(str, " Write", chars_left);
		break;
	case BLKIO_STAT_SYNC:
		strlcat(str, " Sync", chars_left);
		break;
	case BLKIO_STAT_ASYNC:
		strlcat(str, " Async", chars_left);
		break;
	case BLKIO_STAT_TOTAL:
		strlcat(str, " Total", chars_left);
		break;
	default:
		strlcat(str, " Invalid", chars_left);
	}
}

static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
				struct cgroup_map_cb *cb, dev_t dev)
{
	blkio_get_key_name(0, dev, str, chars_left, true);
	cb->fill(cb, str, val);
	return val;
}

/* This should be called with blkg->stats_lock held */
static uint64_t blkio_get_stat(struct blkio_group *blkg,
		struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
{
	uint64_t disk_total;
	char key_str[MAX_KEY_LEN];
	enum stat_sub_type sub_type;

	if (type == BLKIO_STAT_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
				       blkg->stats.time, cb, dev);
	if (type == BLKIO_STAT_SECTORS)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
				       blkg->stats.sectors, cb, dev);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
		uint64_t sum = blkg->stats.avg_queue_size_sum;
		uint64_t samples = blkg->stats.avg_queue_size_samples;
		if (samples)
			do_div(sum, samples);
		else
			sum = 0;
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
	}
	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
				       blkg->stats.group_wait_time, cb, dev);
	if (type == BLKIO_STAT_IDLE_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
				       blkg->stats.idle_time, cb, dev);
	if (type == BLKIO_STAT_EMPTY_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
				       blkg->stats.empty_time, cb, dev);
	if (type == BLKIO_STAT_DEQUEUE)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
				       blkg->stats.dequeue, cb, dev);
#endif

	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
	     sub_type++) {
		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
		cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
	}
	disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
			blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
	cb->fill(cb, key_str, disk_total);
	return disk_total;
}
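/*
 * SHOW_FUNCTION_PER_GROUP generates the read_map handler for each
 * exported per-group statistic: it walks the cgroup's blkio_groups under
 * RCU, emits each group's values via blkio_get_stat() and, where asked,
 * a cgroup-wide "Total" line.
 */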
#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total)		\
static int blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
		struct cftype *cftype, struct cgroup_map_cb *cb)	\
{									\
	struct blkio_cgroup *blkcg;					\
	struct blkio_group *blkg;					\
	struct hlist_node *n;						\
	uint64_t cgroup_total = 0;					\
									\
	if (!cgroup_lock_live_group(cgroup))				\
		return -ENODEV;						\
									\
	blkcg = cgroup_to_blkio_cgroup(cgroup);				\
	rcu_read_lock();						\
	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
		if (blkg->dev) {					\
			spin_lock_irq(&blkg->stats_lock);		\
			cgroup_total += blkio_get_stat(blkg, cb,	\
						blkg->dev, type);	\
			spin_unlock_irq(&blkg->stats_lock);		\
		}							\
	}								\
	if (show_total)							\
		cb->fill(cb, "Total", cgroup_total);			\
	rcu_read_unlock();						\
	cgroup_unlock();						\
	return 0;							\
}

SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
#ifdef CONFIG_DEBUG_BLK_CGROUP
SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
#endif
#undef SHOW_FUNCTION_PER_GROUP
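/* Check that major:minor refers to an existing whole disk, not a partition. */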
static int blkio_check_dev_num(dev_t dev)
{
	int part = 0;
	struct gendisk *disk;

	disk = get_gendisk(dev, &part);
	if (!disk || part)
		return -ENODEV;

	return 0;
}
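/*
 * Parse a "blkio.weight_device" write of the form "major:minor weight",
 * e.g. "8:16 300".  The device must exist and the weight must be 0
 * (delete the per-device rule) or within [BLKIO_WEIGHT_MIN, BLKIO_WEIGHT_MAX].
 */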
static int blkio_policy_parse_and_set(char *buf,
				      struct blkio_policy_node *newpn)
{
	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
	int ret;
	unsigned long major, minor, temp;
	int i = 0;
	dev_t dev;

	memset(s, 0, sizeof(s));

	while ((p = strsep(&buf, " ")) != NULL) {
		if (!*p)
			continue;

		s[i++] = p;

		/* Guard against too many fields being passed in */
		if (i == 3)
			break;
	}

	if (i != 2)
		return -EINVAL;

	p = strsep(&s[0], ":");
	if (p != NULL)
		major_s = p;
	else
		return -EINVAL;

	minor_s = s[0];
	if (!minor_s)
		return -EINVAL;

	ret = strict_strtoul(major_s, 10, &major);
	if (ret)
		return -EINVAL;

	ret = strict_strtoul(minor_s, 10, &minor);
	if (ret)
		return -EINVAL;

	dev = MKDEV(major, minor);

	ret = blkio_check_dev_num(dev);
	if (ret)
		return ret;

	newpn->dev = dev;

	if (s[1] == NULL)
		return -EINVAL;

	ret = strict_strtoul(s[1], 10, &temp);
	if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
	    temp > BLKIO_WEIGHT_MAX)
		return -EINVAL;

	newpn->weight = temp;

	return 0;
}

unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
			      dev_t dev)
{
	struct blkio_policy_node *pn;

	pn = blkio_policy_search_node(blkcg, dev);
	if (pn)
		return pn->weight;
	else
		return blkcg->weight;
}
EXPORT_SYMBOL_GPL(blkcg_get_weight);
static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
				       const char *buffer)
{
	int ret = 0;
	char *buf;
	struct blkio_policy_node *newpn, *pn;
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	int keep_newpn = 0;
	struct hlist_node *n;
	struct blkio_policy_type *blkiop;

	buf = kstrdup(buffer, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
	if (!newpn) {
		ret = -ENOMEM;
		goto free_buf;
	}

	ret = blkio_policy_parse_and_set(buf, newpn);
	if (ret)
		goto free_newpn;

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	spin_lock_irq(&blkcg->lock);

	pn = blkio_policy_search_node(blkcg, newpn->dev);
	if (!pn) {
		if (newpn->weight != 0) {
			blkio_policy_insert_node(blkcg, newpn);
			keep_newpn = 1;
		}
		spin_unlock_irq(&blkcg->lock);
		goto update_io_group;
	}

	if (newpn->weight == 0) {
		/* weight == 0 means deleting a specific weight */
		blkio_policy_delete_node(pn);
		spin_unlock_irq(&blkcg->lock);
		goto update_io_group;
	}
	spin_unlock_irq(&blkcg->lock);

	pn->weight = newpn->weight;

update_io_group:
	/* update weight for each cfqg */
	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		if (newpn->dev == blkg->dev) {
			list_for_each_entry(blkiop, &blkio_list, list)
				blkiop->ops.blkio_update_group_weight_fn(blkg,
							 newpn->weight ?
							 newpn->weight :
							 blkcg->weight);
		}
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);

free_newpn:
	if (!keep_newpn)
		kfree(newpn);
free_buf:
	kfree(buf);
	return ret;
}

static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
				      struct seq_file *m)
{
	struct blkio_cgroup *blkcg;
	struct blkio_policy_node *pn;

	seq_printf(m, "dev\tweight\n");

	blkcg = cgroup_to_blkio_cgroup(cgrp);
	if (list_empty(&blkcg->policy_list))
		goto out;

	spin_lock_irq(&blkcg->lock);
	list_for_each_entry(pn, &blkcg->policy_list, node) {
		seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
			   MINOR(pn->dev), pn->weight);
	}
	spin_unlock_irq(&blkcg->lock);
out:
	return 0;
}
struct cftype blkio_files[] = {
	{
		.name = "weight_device",
		.read_seq_string = blkiocg_weight_device_read,
		.write_string = blkiocg_weight_device_write,
		.max_write_len = 256,
	},
	{
		.name = "weight",
		.read_u64 = blkiocg_weight_read,
		.write_u64 = blkiocg_weight_write,
	},
	{
		.name = "time",
		.read_map = blkiocg_time_read,
	},
	{
		.name = "sectors",
		.read_map = blkiocg_sectors_read,
	},
	{
		.name = "io_service_bytes",
		.read_map = blkiocg_io_service_bytes_read,
	},
	{
		.name = "io_serviced",
		.read_map = blkiocg_io_serviced_read,
	},
	{
		.name = "io_service_time",
		.read_map = blkiocg_io_service_time_read,
	},
	{
		.name = "io_wait_time",
		.read_map = blkiocg_io_wait_time_read,
	},
	{
		.name = "io_merged",
		.read_map = blkiocg_io_merged_read,
	},
	{
		.name = "io_queued",
		.read_map = blkiocg_io_queued_read,
	},
	{
		.name = "reset_stats",
		.write_u64 = blkiocg_reset_stats,
	},
#ifdef CONFIG_DEBUG_BLK_CGROUP
	{
		.name = "avg_queue_size",
		.read_map = blkiocg_avg_queue_size_read,
	},
	{
		.name = "group_wait_time",
		.read_map = blkiocg_group_wait_time_read,
	},
	{
		.name = "idle_time",
		.read_map = blkiocg_idle_time_read,
	},
	{
		.name = "empty_time",
		.read_map = blkiocg_empty_time_read,
	},
	{
		.name = "dequeue",
		.read_map = blkiocg_dequeue_read,
	},
#endif
};
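/*
 * The files above appear as "blkio.<name>" in each cgroup directory.
 * Illustrative usage from the shell (assuming the blkio hierarchy is
 * mounted at /cgroup, which depends on the local setup):
 *
 *	echo 500 > /cgroup/grp1/blkio.weight
 *	echo "8:16 300" > /cgroup/grp1/blkio.weight_device
 *	cat /cgroup/grp1/blkio.io_service_bytes
 */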
static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	return cgroup_add_files(cgroup, subsys, blkio_files,
				ARRAY_SIZE(blkio_files));
}

static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	unsigned long flags;
	struct blkio_group *blkg;
	void *key;
	struct blkio_policy_type *blkiop;
	struct blkio_policy_node *pn, *pntmp;

	rcu_read_lock();
remove_entry:
	spin_lock_irqsave(&blkcg->lock, flags);

	if (hlist_empty(&blkcg->blkg_list)) {
		spin_unlock_irqrestore(&blkcg->lock, flags);
		goto done;
	}

	blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
			   blkcg_node);
	key = rcu_dereference(blkg->key);
	__blkiocg_del_blkio_group(blkg);

	spin_unlock_irqrestore(&blkcg->lock, flags);

	/*
	 * This blkio_group is being unlinked as the associated cgroup is
	 * going away. Let all the IO controlling policies know about this
	 * event.
	 *
	 * Currently this is a static call to one io controlling policy. Once
	 * we have more policies in place, we need some dynamic registration
	 * of callback functions.
	 */
	spin_lock(&blkio_list_lock);
	list_for_each_entry(blkiop, &blkio_list, list)
		blkiop->ops.blkio_unlink_group_fn(key, blkg);
	spin_unlock(&blkio_list_lock);
	goto remove_entry;

done:
	list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
		blkio_policy_delete_node(pn);
		kfree(pn);
	}

	free_css_id(&blkio_subsys, &blkcg->css);
	rcu_read_unlock();
	if (blkcg != &blkio_root_cgroup)
		kfree(blkcg);
}
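/*
 * Allocate and initialise the blkio_cgroup for a newly created cgroup.
 * The root cgroup uses the statically allocated blkio_root_cgroup.
 */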
static struct cgroup_subsys_state *
blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg, *parent_blkcg;

	if (!cgroup->parent) {
		blkcg = &blkio_root_cgroup;
		goto done;
	}

	/* Currently we do not support a hierarchy deeper than two levels (0,1) */
	parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
	if (css_depth(&parent_blkcg->css) > 0)
		return ERR_PTR(-EINVAL);

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
done:
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);
	INIT_LIST_HEAD(&blkcg->policy_list);
	return &blkcg->css;
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main cic data structures. For now we allow a task to change
 * its cgroup only if it's the only owner of its ioc.
 */
static int blkiocg_can_attach(struct cgroup_subsys *subsys,
			      struct cgroup *cgroup, struct task_struct *tsk,
			      bool threadgroup)
{
	struct io_context *ioc;
	int ret = 0;

	/* task_lock() is needed to avoid races with exit_io_context() */
	task_lock(tsk);
	ioc = tsk->io_context;
	if (ioc && atomic_read(&ioc->nr_tasks) > 1)
		ret = -EINVAL;
	task_unlock(tsk);

	return ret;
}

static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
			   struct cgroup *prev, struct task_struct *tsk,
			   bool threadgroup)
{
	struct io_context *ioc;

	task_lock(tsk);
	ioc = tsk->io_context;
	if (ioc)
		ioc->cgroup_changed = 1;
	task_unlock(tsk);
}

void blkio_policy_register(struct blkio_policy_type *blkiop)
{
	spin_lock(&blkio_list_lock);
	list_add_tail(&blkiop->list, &blkio_list);
	spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_register);

void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
	spin_lock(&blkio_list_lock);
	list_del_init(&blkiop->list);
	spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);
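/*
 * Register the blkio subsystem with the cgroup core at load time and
 * remove it again on unload.
 */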
static int __init init_cgroup_blkio(void)
{
	return cgroup_load_subsys(&blkio_subsys);
}

static void __exit exit_cgroup_blkio(void)
{
	cgroup_unload_subsys(&blkio_subsys);
}

module_init(init_cgroup_blkio);
module_exit(exit_cgroup_blkio);
MODULE_LICENSE("GPL");