blk-cgroup.c 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718
  1. /*
  2. * Common Block IO controller cgroup interface
  3. *
  4. * Based on ideas and code from CFQ, CFS and BFQ:
  5. * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
  6. *
  7. * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
  8. * Paolo Valente <paolo.valente@unimore.it>
  9. *
  10. * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
  11. * Nauman Rafique <nauman@google.com>
  12. */
  13. #include <linux/ioprio.h>
  14. #include <linux/seq_file.h>
  15. #include <linux/kdev_t.h>
  16. #include <linux/module.h>
  17. #include <linux/err.h>
  18. #include <linux/blkdev.h>
  19. #include <linux/slab.h>
  20. #include <linux/genhd.h>
  21. #include <linux/delay.h>
  22. #include "blk-cgroup.h"
/* Longest key string emitted into a cgroup stats map ("maj:min <type>") */
#define MAX_KEY_LEN 100

/* Protects blkio_list; nests outside queue_lock in blkg_destroy_all() */
static DEFINE_SPINLOCK(blkio_list_lock);
/* All registered blkio policies (e.g. proportional weight, throttle) */
static LIST_HEAD(blkio_list);

/* Root cgroup gets double the default weight */
struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

/* Registered policy types indexed by policy id */
static struct blkio_policy_type *blkio_policy[BLKIO_NR_POLICIES];

/* cgroup subsystem callbacks, defined later in this file */
static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
						  struct cgroup *);
static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
			      struct cgroup_taskset *);
static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
			   struct cgroup_taskset *);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);

/* for encoding cft->private value on file */
#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
/* What policy owns the file, proportional or throttle */
#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
#define BLKIOFILE_ATTR(val)		((val) & 0xffff)
/*
 * The blkio cgroup subsystem descriptor registered with the cgroup core.
 * use_id enables css_id()/css_lookup(), which blkiocg_add_blkio_group() and
 * blkiocg_del_blkio_group() use to map groups back to their cgroup state.
 * Exported so policy modules can reference the subsystem.
 */
struct cgroup_subsys blkio_subsys = {
	.name = "blkio",
	.create = blkiocg_create,
	.can_attach = blkiocg_can_attach,
	.attach = blkiocg_attach,
	.destroy = blkiocg_destroy,
	.populate = blkiocg_populate,
	.subsys_id = blkio_subsys_id,
	.use_id = 1,
	.module = THIS_MODULE,
};
EXPORT_SYMBOL_GPL(blkio_subsys);
/*
 * Link a per-device policy rule onto the cgroup's rule list.
 * NOTE(review): the matching delete/search helpers require blkcg->lock;
 * presumably callers hold it here too — confirm at call sites.
 */
static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
					    struct blkio_policy_node *pn)
{
	list_add(&pn->node, &blkcg->policy_list);
}
  59. static inline bool cftype_blkg_same_policy(struct cftype *cft,
  60. struct blkio_group *blkg)
  61. {
  62. enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
  63. if (blkg->plid == plid)
  64. return 1;
  65. return 0;
  66. }
  67. /* Determines if policy node matches cgroup file being accessed */
  68. static inline bool pn_matches_cftype(struct cftype *cft,
  69. struct blkio_policy_node *pn)
  70. {
  71. enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
  72. int fileid = BLKIOFILE_ATTR(cft->private);
  73. return (plid == pn->plid && fileid == pn->fileid);
  74. }
/*
 * Unlink a per-device policy rule from its cgroup's rule list.
 * Must be called with blkcg->lock held.
 */
static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
{
	list_del(&pn->node);
}
  80. /* Must be called with blkcg->lock held */
  81. static struct blkio_policy_node *
  82. blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev,
  83. enum blkio_policy_id plid, int fileid)
  84. {
  85. struct blkio_policy_node *pn;
  86. list_for_each_entry(pn, &blkcg->policy_list, node) {
  87. if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
  88. return pn;
  89. }
  90. return NULL;
  91. }
/* Return the blkio_cgroup embedding the given cgroup's blkio subsys state. */
struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
/* Return the blkio_cgroup the given task currently belongs to. */
struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
{
	return container_of(task_subsys_state(tsk, blkio_subsys_id),
			    struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(task_blkio_cgroup);
/*
 * Propagate a weight change for @blkg to the policy that owns it.
 * Walks all registered policies but only notifies the owner.
 * NOTE(review): blkio_list is walked without blkio_list_lock here —
 * presumably callers guarantee the list is stable; confirm.
 */
static inline void
blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {
		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != blkg->plid)
			continue;
		if (blkiop->ops.blkio_update_group_weight_fn)
			blkiop->ops.blkio_update_group_weight_fn(blkg->q,
								 blkg, weight);
	}
}
/*
 * Propagate a read or write bytes-per-second limit change to the owning
 * policy.  @fileid selects which direction's callback is invoked.
 */
static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
					  int fileid)
{
	struct blkio_policy_type *blkiop;

	list_for_each_entry(blkiop, &blkio_list, list) {
		/* If this policy does not own the blkg, do not send updates */
		if (blkiop->plid != blkg->plid)
			continue;
		if (fileid == BLKIO_THROTL_read_bps_device
		    && blkiop->ops.blkio_update_group_read_bps_fn)
			blkiop->ops.blkio_update_group_read_bps_fn(blkg->q,
								   blkg, bps);
		if (fileid == BLKIO_THROTL_write_bps_device
		    && blkiop->ops.blkio_update_group_write_bps_fn)
			blkiop->ops.blkio_update_group_write_bps_fn(blkg->q,
								    blkg, bps);
	}
}
  135. static inline void blkio_update_group_iops(struct blkio_group *blkg,
  136. unsigned int iops, int fileid)
  137. {
  138. struct blkio_policy_type *blkiop;
  139. list_for_each_entry(blkiop, &blkio_list, list) {
  140. /* If this policy does not own the blkg, do not send updates */
  141. if (blkiop->plid != blkg->plid)
  142. continue;
  143. if (fileid == BLKIO_THROTL_read_iops_device
  144. && blkiop->ops.blkio_update_group_read_iops_fn)
  145. blkiop->ops.blkio_update_group_read_iops_fn(blkg->q,
  146. blkg, iops);
  147. if (fileid == BLKIO_THROTL_write_iops_device
  148. && blkiop->ops.blkio_update_group_write_iops_fn)
  149. blkiop->ops.blkio_update_group_write_iops_fn(blkg->q,
  150. blkg,iops);
  151. }
  152. }
  153. /*
  154. * Add to the appropriate stat variable depending on the request type.
  155. * This should be called with the blkg->stats_lock held.
  156. */
  157. static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
  158. bool sync)
  159. {
  160. if (direction)
  161. stat[BLKIO_STAT_WRITE] += add;
  162. else
  163. stat[BLKIO_STAT_READ] += add;
  164. if (sync)
  165. stat[BLKIO_STAT_SYNC] += add;
  166. else
  167. stat[BLKIO_STAT_ASYNC] += add;
  168. }
/*
 * Decrements the appropriate stat variable if non-zero depending on the
 * request type.  Panics (BUG) on the value being zero, since an underflow
 * would indicate unbalanced add/dec accounting.
 * This should be called with the blkg->stats_lock held.
 */
static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
{
	if (direction) {
		BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
		stat[BLKIO_STAT_WRITE]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_READ] == 0);
		stat[BLKIO_STAT_READ]--;
	}
	if (sync) {
		BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
		stat[BLKIO_STAT_SYNC]--;
	} else {
		BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
		stat[BLKIO_STAT_ASYNC]--;
	}
}
  191. #ifdef CONFIG_DEBUG_BLK_CGROUP
  192. /* This should be called with the blkg->stats_lock held. */
/*
 * Start the group-wait clock for @blkg unless it is already waiting or is
 * the currently active group.  This should be called with the
 * blkg->stats_lock held.
 */
static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
					    struct blkio_group *curr_blkg)
{
	if (blkio_blkg_waiting(&blkg->stats))
		return;
	if (blkg == curr_blkg)
		return;
	blkg->stats.start_group_wait_time = sched_clock();
	blkio_mark_blkg_waiting(&blkg->stats);
}
/*
 * Fold the elapsed group-wait interval into stats and stop the wait clock.
 * The time_after64() check guards against sched_clock() going backwards
 * across CPUs.  This should be called with the blkg->stats_lock held.
 */
static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		stats->group_wait_time += now - stats->start_group_wait_time;
	blkio_clear_blkg_waiting(stats);
}
/*
 * Fold the elapsed empty interval into stats and clear the empty flag.
 * No-op if the group was not marked empty.  This should be called with
 * the blkg->stats_lock held.
 */
static void blkio_end_empty_time(struct blkio_group_stats *stats)
{
	unsigned long long now;

	if (!blkio_blkg_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		stats->empty_time += now - stats->start_empty_time;
	blkio_clear_blkg_empty(stats);
}
/*
 * Start the idle-time clock for @blkg.  BUGs if the group is already
 * marked idling, since nested idle starts would corrupt the accounting.
 */
void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	BUG_ON(blkio_blkg_idling(&blkg->stats));
	blkg->stats.start_idle_time = sched_clock();
	blkio_mark_blkg_idling(&blkg->stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
/*
 * Stop the idle-time clock for @blkg, folding the elapsed interval into
 * stats.idle_time.  Safe to call when the group is not idling (no-op).
 */
void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
{
	unsigned long flags;
	unsigned long long now;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	if (blkio_blkg_idling(stats)) {
		now = sched_clock();
		/* guard against sched_clock() skew across CPUs */
		if (time_after64(now, stats->start_idle_time))
			stats->idle_time += now - stats->start_idle_time;
		blkio_clear_blkg_idling(stats);
	}
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
/*
 * Take one sample of the current queue depth (queued reads + writes) for
 * average-queue-size reporting, and close out any pending group-wait
 * interval at the same instant.
 */
void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
{
	unsigned long flags;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	stats->avg_queue_size_sum +=
		stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
		stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
	stats->avg_queue_size_samples++;
	blkio_update_group_wait_time(stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
/*
 * Mark @blkg as having gone empty and start the empty-time clock.
 * No-op if requests are still queued or the group is already marked empty.
 */
void blkiocg_set_start_empty_time(struct blkio_group *blkg)
{
	unsigned long flags;
	struct blkio_group_stats *stats;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;

	/* still has queued IO in either direction -> not empty */
	if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
	    stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
		spin_unlock_irqrestore(&blkg->stats_lock, flags);
		return;
	}

	/*
	 * group is already marked empty. This can happen if cfqq got new
	 * request in parent group and moved to this group while being added
	 * to service tree. Just ignore the event and move on.
	 */
	if (blkio_blkg_empty(stats)) {
		spin_unlock_irqrestore(&blkg->stats_lock, flags);
		return;
	}

	stats->start_empty_time = sched_clock();
	blkio_mark_blkg_empty(stats);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
/*
 * Account a group dequeue event (debug statistic).
 * NOTE(review): updates stats.dequeue without taking stats_lock —
 * presumably callers serialize this; confirm before relying on it.
 */
void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
				  unsigned long dequeue)
{
	blkg->stats.dequeue += dequeue;
}
EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
#else
/* !CONFIG_DEBUG_BLK_CGROUP: group-wait and empty-time tracking compile away */
static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
						   struct blkio_group *curr_blkg) {}
static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
#endif
/*
 * Account one request being queued to @blkg: bump the QUEUED counters,
 * end any empty interval, and (debug builds) start the group-wait clock
 * unless @blkg is the currently active group.
 */
void blkiocg_update_io_add_stats(struct blkio_group *blkg,
				 struct blkio_group *curr_blkg, bool direction,
				 bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
		       sync);
	blkio_end_empty_time(&blkg->stats);
	blkio_set_start_group_wait_time(blkg, curr_blkg);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
/*
 * Account one request leaving @blkg's queue: decrement the QUEUED
 * counters (BUGs on underflow via blkio_check_and_dec_stat).
 */
void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
				    bool direction, bool sync)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
				 direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
/*
 * Charge @time of used timeslice to @blkg.  @unaccounted_time is only
 * recorded on debug builds; it is ignored otherwise.
 */
void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
				   unsigned long unaccounted_time)
{
	unsigned long flags;

	spin_lock_irqsave(&blkg->stats_lock, flags);
	blkg->stats.time += time;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	blkg->stats.unaccounted_time += unaccounted_time;
#endif
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
/*
 * Account a dispatched request against the per-cpu counters: sector count
 * plus SERVICED (ios) and SERVICE_BYTES stats.
 * should be called under rcu read lock or queue lock to make sure blkg
 * pointer is valid.
 */
void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
				   uint64_t bytes, bool direction, bool sync)
{
	struct blkio_group_stats_cpu *stats_cpu;
	unsigned long flags;

	/*
	 * Disabling interrupts to provide mutual exclusion between two
	 * writes on same cpu. It probably is not needed for 64bit. Not
	 * optimizing that case yet.
	 */
	local_irq_save(flags);

	stats_cpu = this_cpu_ptr(blkg->stats_cpu);

	u64_stats_update_begin(&stats_cpu->syncp);
	stats_cpu->sectors += bytes >> 9;	/* bytes -> 512B sectors */
	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
		       1, direction, sync);
	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
		       bytes, direction, sync);
	u64_stats_update_end(&stats_cpu->syncp);

	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
/*
 * Account a completed request: SERVICE_TIME is completion minus dispatch
 * (@io_start_time), WAIT_TIME is dispatch minus queueing (@start_time).
 * The time_after64() checks skip intervals that appear negative due to
 * sched_clock() skew.
 */
void blkiocg_update_completion_stats(struct blkio_group *blkg,
	uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
{
	struct blkio_group_stats *stats;
	unsigned long flags;
	unsigned long long now = sched_clock();

	spin_lock_irqsave(&blkg->stats_lock, flags);
	stats = &blkg->stats;
	if (time_after64(now, io_start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
			       now - io_start_time, direction, sync);
	if (time_after64(io_start_time, start_time))
		blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
			       io_start_time - start_time, direction, sync);
	spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
/* Account one merged request.  Merged stats are per cpu. */
void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
				    bool sync)
{
	struct blkio_group_stats_cpu *stats_cpu;
	unsigned long flags;

	/*
	 * Disabling interrupts to provide mutual exclusion between two
	 * writes on same cpu. It probably is not needed for 64bit. Not
	 * optimizing that case yet.
	 */
	local_irq_save(flags);

	stats_cpu = this_cpu_ptr(blkg->stats_cpu);

	u64_stats_update_begin(&stats_cpu->syncp);
	blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1,
		       direction, sync);
	u64_stats_update_end(&stats_cpu->syncp);

	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
/*
 * This function allocates the per cpu stats for blkio_group. Should be
 * called from sleepable context as alloc_percpu() requires that.
 * Returns 0 on success, -ENOMEM on allocation failure.
 */
int blkio_alloc_blkg_stats(struct blkio_group *blkg)
{
	/* Allocate memory for per cpu stats */
	blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
	if (!blkg->stats_cpu)
		return -ENOMEM;
	return 0;
}
EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats);
/*
 * Initialize @blkg and hash it onto @blkcg's group list under blkcg->lock.
 * The cgroup path and device number are filled in outside the lock;
 * blkg->q is published with rcu_assign_pointer for RCU readers.
 */
void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
	struct blkio_group *blkg, struct request_queue *q, dev_t dev,
	enum blkio_policy_id plid)
{
	unsigned long flags;

	spin_lock_irqsave(&blkcg->lock, flags);
	spin_lock_init(&blkg->stats_lock);
	rcu_assign_pointer(blkg->q, q);
	blkg->blkcg_id = css_id(&blkcg->css);
	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
	blkg->plid = plid;
	spin_unlock_irqrestore(&blkcg->lock, flags);
	/* Need to take css reference ? */
	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
	blkg->dev = dev;
}
EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
/* Unhash @blkg from its cgroup's list.  Caller holds blkcg->lock. */
static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
{
	hlist_del_init_rcu(&blkg->blkcg_node);
	blkg->blkcg_id = 0;
}
  434. /*
  435. * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
  436. * indicating that blk_group was unhashed by the time we got to it.
  437. */
  438. int blkiocg_del_blkio_group(struct blkio_group *blkg)
  439. {
  440. struct blkio_cgroup *blkcg;
  441. unsigned long flags;
  442. struct cgroup_subsys_state *css;
  443. int ret = 1;
  444. rcu_read_lock();
  445. css = css_lookup(&blkio_subsys, blkg->blkcg_id);
  446. if (css) {
  447. blkcg = container_of(css, struct blkio_cgroup, css);
  448. spin_lock_irqsave(&blkcg->lock, flags);
  449. if (!hlist_unhashed(&blkg->blkcg_node)) {
  450. __blkiocg_del_blkio_group(blkg);
  451. ret = 0;
  452. }
  453. spin_unlock_irqrestore(&blkcg->lock, flags);
  454. }
  455. rcu_read_unlock();
  456. return ret;
  457. }
  458. EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
/*
 * Find the group for (request_queue, policy) on a cgroup's group list,
 * or NULL.  called under rcu_read_lock().
 */
struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
					 struct request_queue *q,
					 enum blkio_policy_id plid)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node)
		if (blkg->q == q && blkg->plid == plid)
			return blkg;
	return NULL;
}
EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
/*
 * Ask every registered policy to clear its groups on @q, retrying until
 * all policies report their group lists empty.
 */
void blkg_destroy_all(struct request_queue *q)
{
	struct blkio_policy_type *pol;

	while (true) {
		bool done = true;

		spin_lock(&blkio_list_lock);
		spin_lock_irq(q->queue_lock);

		/*
		 * clear_queue_fn() might return with non-empty group list
		 * if it raced cgroup removal and lost. cgroup removal is
		 * guaranteed to make forward progress and retrying after a
		 * while is enough. This ugliness is scheduled to be
		 * removed after locking update.
		 */
		list_for_each_entry(pol, &blkio_list, list)
			if (!pol->ops.blkio_clear_queue_fn(q))
				done = false;

		spin_unlock_irq(q->queue_lock);
		spin_unlock(&blkio_list_lock);

		if (done)
			break;

		/* back off briefly to let cgroup removal make progress */
		msleep(10);
	}
}
  496. static void blkio_reset_stats_cpu(struct blkio_group *blkg)
  497. {
  498. struct blkio_group_stats_cpu *stats_cpu;
  499. int i, j, k;
  500. /*
  501. * Note: On 64 bit arch this should not be an issue. This has the
  502. * possibility of returning some inconsistent value on 32bit arch
  503. * as 64bit update on 32bit is non atomic. Taking care of this
  504. * corner case makes code very complicated, like sending IPIs to
  505. * cpus, taking care of stats of offline cpus etc.
  506. *
  507. * reset stats is anyway more of a debug feature and this sounds a
  508. * corner case. So I am not complicating the code yet until and
  509. * unless this becomes a real issue.
  510. */
  511. for_each_possible_cpu(i) {
  512. stats_cpu = per_cpu_ptr(blkg->stats_cpu, i);
  513. stats_cpu->sectors = 0;
  514. for(j = 0; j < BLKIO_STAT_CPU_NR; j++)
  515. for (k = 0; k < BLKIO_STAT_TOTAL; k++)
  516. stats_cpu->stat_arr_cpu[j][k] = 0;
  517. }
  518. }
/*
 * cgroup file write handler that resets all statistics of every group in
 * the cgroup.  QUEUED counts are preserved across the memset (requests
 * are still in flight), and on debug builds the idling/waiting/empty
 * state flags are re-established with fresh start timestamps so that
 * in-progress intervals keep accumulating from "now".
 */
static int
blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
	struct blkio_cgroup *blkcg;
	struct blkio_group *blkg;
	struct blkio_group_stats *stats;
	struct hlist_node *n;
	uint64_t queued[BLKIO_STAT_TOTAL];
	int i;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	bool idling, waiting, empty;
	unsigned long long now = sched_clock();
#endif

	blkcg = cgroup_to_blkio_cgroup(cgroup);
	spin_lock_irq(&blkcg->lock);
	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		spin_lock(&blkg->stats_lock);
		stats = &blkg->stats;
#ifdef CONFIG_DEBUG_BLK_CGROUP
		/* remember in-progress state flags before wiping */
		idling = blkio_blkg_idling(stats);
		waiting = blkio_blkg_waiting(stats);
		empty = blkio_blkg_empty(stats);
#endif
		/* save QUEUED: those requests have not completed yet */
		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
			queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
		memset(stats, 0, sizeof(struct blkio_group_stats));
		for (i = 0; i < BLKIO_STAT_TOTAL; i++)
			stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
#ifdef CONFIG_DEBUG_BLK_CGROUP
		/* restart any interval that was being measured */
		if (idling) {
			blkio_mark_blkg_idling(stats);
			stats->start_idle_time = now;
		}
		if (waiting) {
			blkio_mark_blkg_waiting(stats);
			stats->start_group_wait_time = now;
		}
		if (empty) {
			blkio_mark_blkg_empty(stats);
			stats->start_empty_time = now;
		}
#endif
		spin_unlock(&blkg->stats_lock);

		/* Reset Per cpu stats which don't take blkg->stats_lock */
		blkio_reset_stats_cpu(blkg);
	}
	spin_unlock_irq(&blkcg->lock);
	return 0;
}
  568. static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
  569. int chars_left, bool diskname_only)
  570. {
  571. snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
  572. chars_left -= strlen(str);
  573. if (chars_left <= 0) {
  574. printk(KERN_WARNING
  575. "Possibly incorrect cgroup stat display format");
  576. return;
  577. }
  578. if (diskname_only)
  579. return;
  580. switch (type) {
  581. case BLKIO_STAT_READ:
  582. strlcat(str, " Read", chars_left);
  583. break;
  584. case BLKIO_STAT_WRITE:
  585. strlcat(str, " Write", chars_left);
  586. break;
  587. case BLKIO_STAT_SYNC:
  588. strlcat(str, " Sync", chars_left);
  589. break;
  590. case BLKIO_STAT_ASYNC:
  591. strlcat(str, " Async", chars_left);
  592. break;
  593. case BLKIO_STAT_TOTAL:
  594. strlcat(str, " Total", chars_left);
  595. break;
  596. default:
  597. strlcat(str, " Invalid", chars_left);
  598. }
  599. }
/*
 * Emit a single "maj:min = val" entry into the cgroup map callback and
 * return the value for the caller's bookkeeping.
 */
static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
				struct cgroup_map_cb *cb, dev_t dev)
{
	blkio_get_key_name(0, dev, str, chars_left, true);
	cb->fill(cb, str, val);
	return val;
}
/*
 * Sum a per-cpu statistic over all possible cpus.  Each cpu's value is
 * read under the u64_stats fetch/retry sequence so a torn 64-bit read on
 * 32-bit architectures is retried rather than returned.
 */
static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
		enum stat_type_cpu type, enum stat_sub_type sub_type)
{
	int cpu;
	struct blkio_group_stats_cpu *stats_cpu;
	u64 val = 0, tval;

	for_each_possible_cpu(cpu) {
		unsigned int start;
		stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu);

		do {
			start = u64_stats_fetch_begin(&stats_cpu->syncp);
			if (type == BLKIO_STAT_CPU_SECTORS)
				tval = stats_cpu->sectors;
			else
				tval = stats_cpu->stat_arr_cpu[type][sub_type];
		} while (u64_stats_fetch_retry(&stats_cpu->syncp, start));

		val += tval;
	}

	return val;
}
/*
 * Fill the map callback with all sub-type entries of a per-cpu stat,
 * plus a "Total" (read + write) line; SECTORS is a single scalar entry.
 * Returns the total emitted.
 */
static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
		struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type)
{
	uint64_t disk_total, val;
	char key_str[MAX_KEY_LEN];
	enum stat_sub_type sub_type;

	if (type == BLKIO_STAT_CPU_SECTORS) {
		val = blkio_read_stat_cpu(blkg, type, 0);
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev);
	}

	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
			sub_type++) {
		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
		val = blkio_read_stat_cpu(blkg, type, sub_type);
		cb->fill(cb, key_str, val);
	}

	disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) +
			blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE);

	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
	cb->fill(cb, key_str, disk_total);
	return disk_total;
}
/*
 * Fill the map callback for a lock-protected stat.  Scalar types (time,
 * and the debug-only types) produce one entry; array types produce one
 * entry per sub-type plus a read+write "Total".  Returns the value (or
 * total) emitted.  This should be called with blkg->stats_lock held.
 */
static uint64_t blkio_get_stat(struct blkio_group *blkg,
		struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
{
	uint64_t disk_total;
	char key_str[MAX_KEY_LEN];
	enum stat_sub_type sub_type;

	if (type == BLKIO_STAT_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.time, cb, dev);
#ifdef CONFIG_DEBUG_BLK_CGROUP
	if (type == BLKIO_STAT_UNACCOUNTED_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.unaccounted_time, cb, dev);
	if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
		/* average = sum of samples / number of samples */
		uint64_t sum = blkg->stats.avg_queue_size_sum;
		uint64_t samples = blkg->stats.avg_queue_size_samples;
		if (samples)
			do_div(sum, samples);
		else
			sum = 0;
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
	}
	if (type == BLKIO_STAT_GROUP_WAIT_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.group_wait_time, cb, dev);
	if (type == BLKIO_STAT_IDLE_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.idle_time, cb, dev);
	if (type == BLKIO_STAT_EMPTY_TIME)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.empty_time, cb, dev);
	if (type == BLKIO_STAT_DEQUEUE)
		return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
					blkg->stats.dequeue, cb, dev);
#endif

	for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
			sub_type++) {
		blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
		cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
	}
	disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
			blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
	blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
	cb->fill(cb, key_str, disk_total);
	return disk_total;
}
/*
 * Parse a "MAJOR:MINOR VALUE" rule from @buf into @newpn.  @buf is
 * consumed (strsep() modifies it in place).  Returns 0 on success,
 * -EINVAL on malformed input or an out-of-range value, and -ENODEV when
 * a non-zero (i.e. non-removal) rule names a device that doesn't exist.
 */
static int blkio_policy_parse_and_set(char *buf,
	struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
{
	struct gendisk *disk = NULL;
	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
	unsigned long major, minor;
	int i = 0, ret = -EINVAL;
	int part;
	dev_t dev;
	u64 temp;

	memset(s, 0, sizeof(s));

	/* Tokenize on spaces, skipping empty tokens from repeated spaces. */
	while ((p = strsep(&buf, " ")) != NULL) {
		if (!*p)
			continue;

		s[i++] = p;

		/* Prevent from inputing too many things */
		if (i == 3)
			break;
	}

	/* Exactly two tokens expected: the device and the value. */
	if (i != 2)
		goto out;

	/* Split "MAJOR:MINOR" into its two halves. */
	p = strsep(&s[0], ":");
	if (p != NULL)
		major_s = p;
	else
		goto out;

	minor_s = s[0];
	if (!minor_s)
		goto out;

	if (strict_strtoul(major_s, 10, &major))
		goto out;

	if (strict_strtoul(minor_s, 10, &minor))
		goto out;

	/*
	 * NOTE(review): the major/minor values are not range-checked against
	 * what dev_t can encode, so oversized numbers would wrap silently in
	 * MKDEV() -- confirm whether that matters for this interface.
	 */
	dev = MKDEV(major, minor);

	if (strict_strtoull(s[1], 10, &temp))
		goto out;

	/* For rule removal, do not check for device presence. */
	if (temp) {
		disk = get_gendisk(dev, &part);
		if (!disk || part) {
			ret = -ENODEV;
			goto out;
		}
	}

	newpn->dev = dev;

	switch (plid) {
	case BLKIO_POLICY_PROP:
		/* 0 means "delete rule"; otherwise enforce weight bounds. */
		if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
		     temp > BLKIO_WEIGHT_MAX)
			goto out;

		newpn->plid = plid;
		newpn->fileid = fileid;
		newpn->val.weight = temp;
		break;
	case BLKIO_POLICY_THROTL:
		switch(fileid) {
		case BLKIO_THROTL_read_bps_device:
		case BLKIO_THROTL_write_bps_device:
			newpn->plid = plid;
			newpn->fileid = fileid;
			newpn->val.bps = temp;
			break;
		case BLKIO_THROTL_read_iops_device:
		case BLKIO_THROTL_write_iops_device:
			if (temp > THROTL_IOPS_MAX)
				goto out;

			newpn->plid = plid;
			newpn->fileid = fileid;
			newpn->val.iops = (unsigned int)temp;
			break;
		}
		break;
	default:
		BUG();
	}

	ret = 0;
out:
	/* disk is NULL unless the lookup above succeeded. */
	put_disk(disk);
	return ret;
}
  776. unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
  777. dev_t dev)
  778. {
  779. struct blkio_policy_node *pn;
  780. unsigned long flags;
  781. unsigned int weight;
  782. spin_lock_irqsave(&blkcg->lock, flags);
  783. pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
  784. BLKIO_PROP_weight_device);
  785. if (pn)
  786. weight = pn->val.weight;
  787. else
  788. weight = blkcg->weight;
  789. spin_unlock_irqrestore(&blkcg->lock, flags);
  790. return weight;
  791. }
  792. EXPORT_SYMBOL_GPL(blkcg_get_weight);
  793. uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
  794. {
  795. struct blkio_policy_node *pn;
  796. unsigned long flags;
  797. uint64_t bps = -1;
  798. spin_lock_irqsave(&blkcg->lock, flags);
  799. pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
  800. BLKIO_THROTL_read_bps_device);
  801. if (pn)
  802. bps = pn->val.bps;
  803. spin_unlock_irqrestore(&blkcg->lock, flags);
  804. return bps;
  805. }
  806. uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
  807. {
  808. struct blkio_policy_node *pn;
  809. unsigned long flags;
  810. uint64_t bps = -1;
  811. spin_lock_irqsave(&blkcg->lock, flags);
  812. pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
  813. BLKIO_THROTL_write_bps_device);
  814. if (pn)
  815. bps = pn->val.bps;
  816. spin_unlock_irqrestore(&blkcg->lock, flags);
  817. return bps;
  818. }
  819. unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
  820. {
  821. struct blkio_policy_node *pn;
  822. unsigned long flags;
  823. unsigned int iops = -1;
  824. spin_lock_irqsave(&blkcg->lock, flags);
  825. pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
  826. BLKIO_THROTL_read_iops_device);
  827. if (pn)
  828. iops = pn->val.iops;
  829. spin_unlock_irqrestore(&blkcg->lock, flags);
  830. return iops;
  831. }
  832. unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
  833. {
  834. struct blkio_policy_node *pn;
  835. unsigned long flags;
  836. unsigned int iops = -1;
  837. spin_lock_irqsave(&blkcg->lock, flags);
  838. pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
  839. BLKIO_THROTL_write_iops_device);
  840. if (pn)
  841. iops = pn->val.iops;
  842. spin_unlock_irqrestore(&blkcg->lock, flags);
  843. return iops;
  844. }
  845. /* Checks whether user asked for deleting a policy rule */
  846. static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
  847. {
  848. switch(pn->plid) {
  849. case BLKIO_POLICY_PROP:
  850. if (pn->val.weight == 0)
  851. return 1;
  852. break;
  853. case BLKIO_POLICY_THROTL:
  854. switch(pn->fileid) {
  855. case BLKIO_THROTL_read_bps_device:
  856. case BLKIO_THROTL_write_bps_device:
  857. if (pn->val.bps == 0)
  858. return 1;
  859. break;
  860. case BLKIO_THROTL_read_iops_device:
  861. case BLKIO_THROTL_write_iops_device:
  862. if (pn->val.iops == 0)
  863. return 1;
  864. }
  865. break;
  866. default:
  867. BUG();
  868. }
  869. return 0;
  870. }
  871. static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
  872. struct blkio_policy_node *newpn)
  873. {
  874. switch(oldpn->plid) {
  875. case BLKIO_POLICY_PROP:
  876. oldpn->val.weight = newpn->val.weight;
  877. break;
  878. case BLKIO_POLICY_THROTL:
  879. switch(newpn->fileid) {
  880. case BLKIO_THROTL_read_bps_device:
  881. case BLKIO_THROTL_write_bps_device:
  882. oldpn->val.bps = newpn->val.bps;
  883. break;
  884. case BLKIO_THROTL_read_iops_device:
  885. case BLKIO_THROTL_write_iops_device:
  886. oldpn->val.iops = newpn->val.iops;
  887. }
  888. break;
  889. default:
  890. BUG();
  891. }
  892. }
  893. /*
  894. * Some rules/values in blkg have changed. Propagate those to respective
  895. * policies.
  896. */
  897. static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
  898. struct blkio_group *blkg, struct blkio_policy_node *pn)
  899. {
  900. unsigned int weight, iops;
  901. u64 bps;
  902. switch(pn->plid) {
  903. case BLKIO_POLICY_PROP:
  904. weight = pn->val.weight ? pn->val.weight :
  905. blkcg->weight;
  906. blkio_update_group_weight(blkg, weight);
  907. break;
  908. case BLKIO_POLICY_THROTL:
  909. switch(pn->fileid) {
  910. case BLKIO_THROTL_read_bps_device:
  911. case BLKIO_THROTL_write_bps_device:
  912. bps = pn->val.bps ? pn->val.bps : (-1);
  913. blkio_update_group_bps(blkg, bps, pn->fileid);
  914. break;
  915. case BLKIO_THROTL_read_iops_device:
  916. case BLKIO_THROTL_write_iops_device:
  917. iops = pn->val.iops ? pn->val.iops : (-1);
  918. blkio_update_group_iops(blkg, iops, pn->fileid);
  919. break;
  920. }
  921. break;
  922. default:
  923. BUG();
  924. }
  925. }
/*
 * A policy node rule has been updated. Propagate this update to all the
 * block groups which might be affected by this update.
 */
static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
				struct blkio_policy_node *pn)
{
	struct blkio_group *blkg;
	struct hlist_node *n;

	/*
	 * Lock order: blkio_list_lock outside blkcg->lock, matching
	 * blkio_weight_write().
	 */
	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		/* Only groups on the rule's device and policy are affected. */
		if (pn->dev != blkg->dev || pn->plid != blkg->plid)
			continue;
		blkio_update_blkg_policy(blkcg, blkg, pn);
	}

	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
}
/*
 * Write handler for the per-device rule files (weight_device and the
 * throttle.*_device files).  Parses the rule, then inserts, updates or
 * deletes the matching policy node and propagates the change to blkgs.
 */
static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
				const char *buffer)
{
	int ret = 0;
	char *buf;
	struct blkio_policy_node *newpn, *pn;
	struct blkio_cgroup *blkcg;
	int keep_newpn = 0;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int fileid = BLKIOFILE_ATTR(cft->private);

	/* The parser's strsep() modifies the string, so work on a copy. */
	buf = kstrdup(buffer, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
	if (!newpn) {
		ret = -ENOMEM;
		goto free_buf;
	}

	ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid);
	if (ret)
		goto free_newpn;

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	spin_lock_irq(&blkcg->lock);

	pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
	if (!pn) {
		/* No existing rule: insert unless this write is a deletion. */
		if (!blkio_delete_rule_command(newpn)) {
			blkio_policy_insert_node(blkcg, newpn);
			/* newpn is now owned by the list; don't free it. */
			keep_newpn = 1;
		}
		spin_unlock_irq(&blkcg->lock);
		goto update_io_group;
	}

	if (blkio_delete_rule_command(newpn)) {
		/* Existing rule removed; newpn itself is freed below. */
		blkio_policy_delete_node(pn);
		kfree(pn);
		spin_unlock_irq(&blkcg->lock);
		goto update_io_group;
	}
	spin_unlock_irq(&blkcg->lock);

	/* Existing rule: update its value in place. */
	blkio_update_policy_rule(pn, newpn);

update_io_group:
	/* Push the new/changed/removed rule down to affected blkgs. */
	blkio_update_policy_node_blkg(blkcg, newpn);

free_newpn:
	if (!keep_newpn)
		kfree(newpn);
free_buf:
	kfree(buf);
	return ret;
}
  994. static void
  995. blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
  996. {
  997. switch(pn->plid) {
  998. case BLKIO_POLICY_PROP:
  999. if (pn->fileid == BLKIO_PROP_weight_device)
  1000. seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
  1001. MINOR(pn->dev), pn->val.weight);
  1002. break;
  1003. case BLKIO_POLICY_THROTL:
  1004. switch(pn->fileid) {
  1005. case BLKIO_THROTL_read_bps_device:
  1006. case BLKIO_THROTL_write_bps_device:
  1007. seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
  1008. MINOR(pn->dev), pn->val.bps);
  1009. break;
  1010. case BLKIO_THROTL_read_iops_device:
  1011. case BLKIO_THROTL_write_iops_device:
  1012. seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
  1013. MINOR(pn->dev), pn->val.iops);
  1014. break;
  1015. }
  1016. break;
  1017. default:
  1018. BUG();
  1019. }
  1020. }
  1021. /* cgroup files which read their data from policy nodes end up here */
  1022. static void blkio_read_policy_node_files(struct cftype *cft,
  1023. struct blkio_cgroup *blkcg, struct seq_file *m)
  1024. {
  1025. struct blkio_policy_node *pn;
  1026. if (!list_empty(&blkcg->policy_list)) {
  1027. spin_lock_irq(&blkcg->lock);
  1028. list_for_each_entry(pn, &blkcg->policy_list, node) {
  1029. if (!pn_matches_cftype(cft, pn))
  1030. continue;
  1031. blkio_print_policy_node(m, pn);
  1032. }
  1033. spin_unlock_irq(&blkcg->lock);
  1034. }
  1035. }
  1036. static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
  1037. struct seq_file *m)
  1038. {
  1039. struct blkio_cgroup *blkcg;
  1040. enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
  1041. int name = BLKIOFILE_ATTR(cft->private);
  1042. blkcg = cgroup_to_blkio_cgroup(cgrp);
  1043. switch(plid) {
  1044. case BLKIO_POLICY_PROP:
  1045. switch(name) {
  1046. case BLKIO_PROP_weight_device:
  1047. blkio_read_policy_node_files(cft, blkcg, m);
  1048. return 0;
  1049. default:
  1050. BUG();
  1051. }
  1052. break;
  1053. case BLKIO_POLICY_THROTL:
  1054. switch(name){
  1055. case BLKIO_THROTL_read_bps_device:
  1056. case BLKIO_THROTL_write_bps_device:
  1057. case BLKIO_THROTL_read_iops_device:
  1058. case BLKIO_THROTL_write_iops_device:
  1059. blkio_read_policy_node_files(cft, blkcg, m);
  1060. return 0;
  1061. default:
  1062. BUG();
  1063. }
  1064. break;
  1065. default:
  1066. BUG();
  1067. }
  1068. return 0;
  1069. }
/*
 * Emit stats of @type for every blkg in @blkcg via @cb.  Per-cpu stat
 * types (@pcpu set) are read without stats_lock; the rest are read under
 * blkg->stats_lock.  When @show_total is set a "Total" entry summing all
 * groups is appended.
 */
static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
		struct cftype *cft, struct cgroup_map_cb *cb,
		enum stat_type type, bool show_total, bool pcpu)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	uint64_t cgroup_total = 0;

	rcu_read_lock();
	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
		/* Skip groups whose device number is not set. */
		if (blkg->dev) {
			if (!cftype_blkg_same_policy(cft, blkg))
				continue;
			if (pcpu)
				cgroup_total += blkio_get_stat_cpu(blkg, cb,
						blkg->dev, type);
			else {
				/* blkio_get_stat() requires stats_lock held. */
				spin_lock_irq(&blkg->stats_lock);
				cgroup_total += blkio_get_stat(blkg, cb,
						blkg->dev, type);
				spin_unlock_irq(&blkg->stats_lock);
			}
		}
	}
	if (show_total)
		cb->fill(cb, "Total", cgroup_total);
	rcu_read_unlock();
	return 0;
}
/* All map kind of cgroup file get serviced by this function */
static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
				struct cgroup_map_cb *cb)
{
	struct blkio_cgroup *blkcg;
	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
	int name = BLKIOFILE_ATTR(cft->private);

	blkcg = cgroup_to_blkio_cgroup(cgrp);

	/*
	 * The two trailing arguments to blkio_read_blkg_stats() are
	 * show_total and pcpu (see that function).
	 */
	switch (plid) {
	case BLKIO_POLICY_PROP:
		switch (name) {
		case BLKIO_PROP_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_TIME, 0, 0);
		case BLKIO_PROP_sectors:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SECTORS, 0, 1);
		case BLKIO_PROP_io_service_bytes:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
		case BLKIO_PROP_io_serviced:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SERVICED, 1, 1);
		case BLKIO_PROP_io_service_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_SERVICE_TIME, 1, 0);
		case BLKIO_PROP_io_wait_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_WAIT_TIME, 1, 0);
		case BLKIO_PROP_io_merged:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_MERGED, 1, 1);
		case BLKIO_PROP_io_queued:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_QUEUED, 1, 0);
#ifdef CONFIG_DEBUG_BLK_CGROUP
		/* Debug-only stats; all scalar, no totals. */
		case BLKIO_PROP_unaccounted_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
		case BLKIO_PROP_dequeue:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_DEQUEUE, 0, 0);
		case BLKIO_PROP_avg_queue_size:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
		case BLKIO_PROP_group_wait_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
		case BLKIO_PROP_idle_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_IDLE_TIME, 0, 0);
		case BLKIO_PROP_empty_time:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_EMPTY_TIME, 0, 0);
#endif
		default:
			BUG();
		}
		break;
	case BLKIO_POLICY_THROTL:
		switch (name) {
		case BLKIO_THROTL_io_service_bytes:
			return blkio_read_blkg_stats(blkcg, cft, cb,
					BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
		case BLKIO_THROTL_io_serviced:
			return blkio_read_blkg_stats(blkcg, cft, cb,
						BLKIO_STAT_CPU_SERVICED, 1, 1);
		default:
			BUG();
		}
		break;
	default:
		BUG();
	}

	return 0;
}
/*
 * Update the cgroup default weight and propagate it to every blkg that
 * does not have a per-device weight rule overriding it.
 */
static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val)
{
	struct blkio_group *blkg;
	struct hlist_node *n;
	struct blkio_policy_node *pn;

	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
		return -EINVAL;

	/* Same lock order as blkio_update_policy_node_blkg(). */
	spin_lock(&blkio_list_lock);
	spin_lock_irq(&blkcg->lock);
	blkcg->weight = (unsigned int)val;

	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
		pn = blkio_policy_search_node(blkcg, blkg->dev,
				BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
		/* A per-device rule takes precedence; leave that blkg alone. */
		if (pn)
			continue;

		blkio_update_group_weight(blkg, blkcg->weight);
	}
	spin_unlock_irq(&blkcg->lock);
	spin_unlock(&blkio_list_lock);
	return 0;
}
  1195. static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
  1196. struct blkio_cgroup *blkcg;
  1197. enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
  1198. int name = BLKIOFILE_ATTR(cft->private);
  1199. blkcg = cgroup_to_blkio_cgroup(cgrp);
  1200. switch(plid) {
  1201. case BLKIO_POLICY_PROP:
  1202. switch(name) {
  1203. case BLKIO_PROP_weight:
  1204. return (u64)blkcg->weight;
  1205. }
  1206. break;
  1207. default:
  1208. BUG();
  1209. }
  1210. return 0;
  1211. }
  1212. static int
  1213. blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
  1214. {
  1215. struct blkio_cgroup *blkcg;
  1216. enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
  1217. int name = BLKIOFILE_ATTR(cft->private);
  1218. blkcg = cgroup_to_blkio_cgroup(cgrp);
  1219. switch(plid) {
  1220. case BLKIO_POLICY_PROP:
  1221. switch(name) {
  1222. case BLKIO_PROP_weight:
  1223. return blkio_weight_write(blkcg, val);
  1224. }
  1225. break;
  1226. default:
  1227. BUG();
  1228. }
  1229. return 0;
  1230. }
/*
 * All cgroup control files exported by the blkio controller.  The
 * .private field packs (policy id, attribute) via BLKIOFILE_PRIVATE()
 * so the shared read/write handlers can dispatch.
 */
struct cftype blkio_files[] = {
	/* Proportional-weight (CFQ) policy files. */
	{
		.name = "weight_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_weight_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "weight",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_weight),
		.read_u64 = blkiocg_file_read_u64,
		.write_u64 = blkiocg_file_write_u64,
	},
	{
		.name = "time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "sectors",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_sectors),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_service_bytes",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_service_bytes),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_serviced",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_serviced),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_service_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_service_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_wait_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_wait_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_merged",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_merged),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "io_queued",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_io_queued),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "reset_stats",
		.write_u64 = blkiocg_reset_stats,
	},
#ifdef CONFIG_BLK_DEV_THROTTLING
	/* Throttling policy files. */
	{
		.name = "throttle.read_bps_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_read_bps_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "throttle.write_bps_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_write_bps_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "throttle.read_iops_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_read_iops_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "throttle.write_iops_device",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_write_iops_device),
		.read_seq_string = blkiocg_file_read,
		.write_string = blkiocg_file_write,
		.max_write_len = 256,
	},
	{
		.name = "throttle.io_service_bytes",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_io_service_bytes),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "throttle.io_serviced",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				BLKIO_THROTL_io_serviced),
		.read_map = blkiocg_file_read_map,
	},
#endif /* CONFIG_BLK_DEV_THROTTLING */
#ifdef CONFIG_DEBUG_BLK_CGROUP
	/* Debug-only statistics files. */
	{
		.name = "avg_queue_size",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_avg_queue_size),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "group_wait_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_group_wait_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "idle_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_idle_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "empty_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_empty_time),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "dequeue",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_dequeue),
		.read_map = blkiocg_file_read_map,
	},
	{
		.name = "unaccounted_time",
		.private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
				BLKIO_PROP_unaccounted_time),
		.read_map = blkiocg_file_read_map,
	},
#endif
};
/* Register all blkio control files on the given cgroup. */
static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	return cgroup_add_files(cgroup, subsys, blkio_files,
				ARRAY_SIZE(blkio_files));
}
/*
 * cgroup destruction: unlink every blkio_group from the dying cgroup,
 * notify the owning policies, free all policy rules and finally the
 * blkio_cgroup itself (unless it is the static root).
 */
static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	unsigned long flags;
	struct blkio_group *blkg;
	struct request_queue *q;
	struct blkio_policy_type *blkiop;
	struct blkio_policy_node *pn, *pntmp;

	rcu_read_lock();
	/*
	 * Unlink blkgs one at a time; blkcg->lock is dropped between
	 * iterations so it never nests inside blkio_list_lock.
	 */
	do {
		spin_lock_irqsave(&blkcg->lock, flags);

		if (hlist_empty(&blkcg->blkg_list)) {
			spin_unlock_irqrestore(&blkcg->lock, flags);
			break;
		}

		blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
					blkcg_node);
		q = rcu_dereference(blkg->q);
		__blkiocg_del_blkio_group(blkg);

		spin_unlock_irqrestore(&blkcg->lock, flags);

		/*
		 * This blkio_group is being unlinked as associated cgroup is
		 * going away. Let all the IO controlling policies know about
		 * this event.
		 */
		spin_lock(&blkio_list_lock);
		list_for_each_entry(blkiop, &blkio_list, list) {
			/* Only the policy that owns this blkg is notified. */
			if (blkiop->plid != blkg->plid)
				continue;
			blkiop->ops.blkio_unlink_group_fn(q, blkg);
		}
		spin_unlock(&blkio_list_lock);
	} while (1);

	/* Drop all per-device policy rules attached to this cgroup. */
	list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
		blkio_policy_delete_node(pn);
		kfree(pn);
	}

	free_css_id(&blkio_subsys, &blkcg->css);
	rcu_read_unlock();
	/* The root blkcg is statically allocated; never free it. */
	if (blkcg != &blkio_root_cgroup)
		kfree(blkcg);
}
/*
 * cgroup creation: allocate and initialize a blkio_cgroup.  The root
 * cgroup uses the statically allocated blkio_root_cgroup instead of a
 * fresh allocation.
 */
static struct cgroup_subsys_state *
blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg;
	struct cgroup *parent = cgroup->parent;

	if (!parent) {
		blkcg = &blkio_root_cgroup;
		goto done;
	}

	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
	if (!blkcg)
		return ERR_PTR(-ENOMEM);

	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
done:
	/* Common init for both the root and dynamically allocated cgroups. */
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);
	INIT_LIST_HEAD(&blkcg->policy_list);
	return &blkcg->css;
}
  1450. /*
  1451. * We cannot support shared io contexts, as we have no mean to support
  1452. * two tasks with the same ioc in two different groups without major rework
  1453. * of the main cic data structures. For now we allow a task to change
  1454. * its cgroup only if it's the only owner of its ioc.
  1455. */
  1456. static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
  1457. struct cgroup_taskset *tset)
  1458. {
  1459. struct task_struct *task;
  1460. struct io_context *ioc;
  1461. int ret = 0;
  1462. /* task_lock() is needed to avoid races with exit_io_context() */
  1463. cgroup_taskset_for_each(task, cgrp, tset) {
  1464. task_lock(task);
  1465. ioc = task->io_context;
  1466. if (ioc && atomic_read(&ioc->nr_tasks) > 1)
  1467. ret = -EINVAL;
  1468. task_unlock(task);
  1469. if (ret)
  1470. break;
  1471. }
  1472. return ret;
  1473. }
  1474. static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
  1475. struct cgroup_taskset *tset)
  1476. {
  1477. struct task_struct *task;
  1478. struct io_context *ioc;
  1479. cgroup_taskset_for_each(task, cgrp, tset) {
  1480. /* we don't lose anything even if ioc allocation fails */
  1481. ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
  1482. if (ioc) {
  1483. ioc_cgroup_changed(ioc);
  1484. put_io_context(ioc);
  1485. }
  1486. }
  1487. }
/*
 * Register an IO controlling policy.  At most one policy may be
 * registered per policy id (blkiop->plid) at a time.
 */
void blkio_policy_register(struct blkio_policy_type *blkiop)
{
	spin_lock(&blkio_list_lock);

	/* Double registration for the same plid is a bug. */
	BUG_ON(blkio_policy[blkiop->plid]);
	blkio_policy[blkiop->plid] = blkiop;
	list_add_tail(&blkiop->list, &blkio_list);

	spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_register);
/* Unregister a previously registered IO controlling policy. */
void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
	spin_lock(&blkio_list_lock);

	/* Unregistering a policy that isn't the registered one is a bug. */
	BUG_ON(blkio_policy[blkiop->plid] != blkiop);
	blkio_policy[blkiop->plid] = NULL;
	list_del_init(&blkiop->list);

	spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);