blk-throttle.c

  1. /*
  2. * Interface for controlling IO bandwidth on a request queue
  3. *
  4. * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
  5. */
  6. #include <linux/module.h>
  7. #include <linux/slab.h>
  8. #include <linux/blkdev.h>
  9. #include <linux/bio.h>
  10. #include <linux/blktrace_api.h>
  11. #include "blk-cgroup.h"
  12. #include "blk.h"
  13. /* Max dispatch from a group in 1 round */
  14. static int throtl_grp_quantum = 8;
  15. /* Total max dispatch from all groups in one round */
  16. static int throtl_quantum = 32;
  17. /* Throttling is performed over 100ms slice and after that slice is renewed */
  18. static unsigned long throtl_slice = HZ/10; /* 100 ms */
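/*
 * For example, assuming HZ=1000 the slice works out to 100 jiffies, and with
 * HZ=250 it would be 25 jiffies. All slice bookkeeping below (slice_start,
 * slice_end, trimming) is done in jiffies.
 */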
  19. /* A workqueue to queue throttle related work */
  20. static struct workqueue_struct *kthrotld_workqueue;
  21. static void throtl_schedule_delayed_work(struct throtl_data *td,
  22. unsigned long delay);
  23. struct throtl_rb_root {
  24. struct rb_root rb;
  25. struct rb_node *left;
  26. unsigned int count;
  27. unsigned long min_disptime;
  28. };
  29. #define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
  30. .count = 0, .min_disptime = 0}
  31. #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
  32. struct throtl_grp {
   33. /* List of throtl groups on the request queue */
  34. struct hlist_node tg_node;
  35. /* active throtl group service_tree member */
  36. struct rb_node rb_node;
  37. /*
  38. * Dispatch time in jiffies. This is the estimated time when group
  39. * will unthrottle and is ready to dispatch more bio. It is used as
  40. * key to sort active groups in service tree.
  41. */
  42. unsigned long disptime;
  43. struct blkio_group blkg;
  44. atomic_t ref;
  45. unsigned int flags;
  46. /* Two lists for READ and WRITE */
  47. struct bio_list bio_lists[2];
  48. /* Number of queued bios on READ and WRITE lists */
  49. unsigned int nr_queued[2];
  50. /* bytes per second rate limits */
  51. uint64_t bps[2];
  52. /* IOPS limits */
  53. unsigned int iops[2];
   55. /* Number of bytes dispatched in current slice */
  55. uint64_t bytes_disp[2];
  56. /* Number of bio's dispatched in current slice */
  57. unsigned int io_disp[2];
  58. /* When did we start a new slice */
  59. unsigned long slice_start[2];
  60. unsigned long slice_end[2];
  61. /* Some throttle limits got updated for the group */
  62. int limits_changed;
  63. struct rcu_head rcu_head;
  64. };
  65. struct throtl_data
  66. {
  67. /* List of throtl groups */
  68. struct hlist_head tg_list;
  69. /* service tree for active throtl groups */
  70. struct throtl_rb_root tg_service_tree;
  71. struct throtl_grp *root_tg;
  72. struct request_queue *queue;
  73. /* Total Number of queued bios on READ and WRITE lists */
  74. unsigned int nr_queued[2];
  75. /*
  76. * number of total undestroyed groups
  77. */
  78. unsigned int nr_undestroyed_grps;
  79. /* Work for dispatching throttled bios */
  80. struct delayed_work throtl_work;
  81. int limits_changed;
  82. };
  83. enum tg_state_flags {
  84. THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */
  85. };
  86. #define THROTL_TG_FNS(name) \
  87. static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \
  88. { \
  89. (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \
  90. } \
  91. static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \
  92. { \
  93. (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \
  94. } \
  95. static inline int throtl_tg_##name(const struct throtl_grp *tg) \
  96. { \
  97. return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \
  98. }
  99. THROTL_TG_FNS(on_rr);
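/*
 * The THROTL_TG_FNS(on_rr) expansion above generates three helpers:
 * throtl_mark_tg_on_rr(), throtl_clear_tg_on_rr() and throtl_tg_on_rr(),
 * which respectively set, clear and test the THROTL_TG_FLAG_on_rr bit
 * in tg->flags.
 */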
  100. #define throtl_log_tg(td, tg, fmt, args...) \
  101. blk_add_trace_msg((td)->queue, "throtl %s " fmt, \
   102. blkg_path(&(tg)->blkg), ##args);
  103. #define throtl_log(td, fmt, args...) \
  104. blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
  105. static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
  106. {
  107. if (blkg)
  108. return container_of(blkg, struct throtl_grp, blkg);
  109. return NULL;
  110. }
  111. static inline unsigned int total_nr_queued(struct throtl_data *td)
  112. {
  113. return td->nr_queued[0] + td->nr_queued[1];
  114. }
  115. static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
  116. {
  117. atomic_inc(&tg->ref);
  118. return tg;
  119. }
  120. static void throtl_free_tg(struct rcu_head *head)
  121. {
  122. struct throtl_grp *tg;
  123. tg = container_of(head, struct throtl_grp, rcu_head);
  124. free_percpu(tg->blkg.stats_cpu);
  125. kfree(tg);
  126. }
  127. static void throtl_put_tg(struct throtl_grp *tg)
  128. {
  129. BUG_ON(atomic_read(&tg->ref) <= 0);
  130. if (!atomic_dec_and_test(&tg->ref))
  131. return;
  132. /*
   133. * A group is freed via RCU. But having an rcu lock does not
  134. * mean that one can access all the fields of blkg and assume these
  135. * are valid. For example, don't try to follow throtl_data and
  136. * request queue links.
  137. *
   138. * Having a reference to blkg under rcu allows access only to
   139. * values local to the group, like group stats and group rate limits.
  140. */
  141. call_rcu(&tg->rcu_head, throtl_free_tg);
  142. }
  143. static void throtl_init_group(struct throtl_grp *tg)
  144. {
  145. INIT_HLIST_NODE(&tg->tg_node);
  146. RB_CLEAR_NODE(&tg->rb_node);
  147. bio_list_init(&tg->bio_lists[0]);
  148. bio_list_init(&tg->bio_lists[1]);
  149. tg->limits_changed = false;
  150. /* Practically unlimited BW */
  151. tg->bps[0] = tg->bps[1] = -1;
  152. tg->iops[0] = tg->iops[1] = -1;
  153. /*
  154. * Take the initial reference that will be released on destroy
   155. * This can be thought of as a joint reference by cgroup and
  156. * request queue which will be dropped by either request queue
  157. * exit or cgroup deletion path depending on who is exiting first.
  158. */
  159. atomic_set(&tg->ref, 1);
  160. }
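/*
 * Reference lifecycle sketch: the initial reference set above is the joint
 * cgroup/request-queue reference dropped in throtl_destroy_tg(). On top of
 * that, every bio queued on the group takes a reference in
 * throtl_add_bio_tg() and drops it in tg_dispatch_one_bio(), so the group
 * cannot be freed while it still holds throttled bios.
 */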
  161. /* Should be called with rcu read lock held (needed for blkcg) */
  162. static void
  163. throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg)
  164. {
  165. hlist_add_head(&tg->tg_node, &td->tg_list);
  166. td->nr_undestroyed_grps++;
  167. }
  168. static void
  169. __throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
  170. {
  171. struct backing_dev_info *bdi = &td->queue->backing_dev_info;
  172. unsigned int major, minor;
  173. if (!tg || tg->blkg.dev)
  174. return;
  175. /*
  176. * Fill in device details for a group which might not have been
  177. * filled at group creation time as queue was being instantiated
  178. * and driver had not attached a device yet
  179. */
  180. if (bdi->dev && dev_name(bdi->dev)) {
  181. sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
  182. tg->blkg.dev = MKDEV(major, minor);
  183. }
  184. }
  185. /*
   186. * Should be called without queue lock held. Here the queue lock will be
   187. * taken rarely. It will be taken only once during the lifetime of a group,
   188. * if need be.
  189. */
  190. static void
  191. throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
  192. {
  193. if (!tg || tg->blkg.dev)
  194. return;
  195. spin_lock_irq(td->queue->queue_lock);
  196. __throtl_tg_fill_dev_details(td, tg);
  197. spin_unlock_irq(td->queue->queue_lock);
  198. }
  199. static void throtl_init_add_tg_lists(struct throtl_data *td,
  200. struct throtl_grp *tg, struct blkio_cgroup *blkcg)
  201. {
  202. __throtl_tg_fill_dev_details(td, tg);
  203. /* Add group onto cgroup list */
  204. blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
  205. tg->blkg.dev, BLKIO_POLICY_THROTL);
  206. tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
  207. tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
  208. tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
  209. tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
  210. throtl_add_group_to_td_list(td, tg);
  211. }
  212. /* Should be called without queue lock and outside of rcu period */
  213. static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
  214. {
  215. struct throtl_grp *tg = NULL;
  216. int ret;
  217. tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
  218. if (!tg)
  219. return NULL;
  220. ret = blkio_alloc_blkg_stats(&tg->blkg);
  221. if (ret) {
  222. kfree(tg);
  223. return NULL;
  224. }
  225. throtl_init_group(tg);
  226. return tg;
  227. }
  228. static struct
  229. throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
  230. {
  231. struct throtl_grp *tg = NULL;
  232. void *key = td;
  233. /*
  234. * This is the common case when there are no blkio cgroups.
  235. * Avoid lookup in this case
  236. */
  237. if (blkcg == &blkio_root_cgroup)
  238. tg = td->root_tg;
  239. else
  240. tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
  241. __throtl_tg_fill_dev_details(td, tg);
  242. return tg;
  243. }
  244. static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
  245. {
  246. struct throtl_grp *tg = NULL, *__tg = NULL;
  247. struct blkio_cgroup *blkcg;
  248. struct request_queue *q = td->queue;
  249. rcu_read_lock();
  250. blkcg = task_blkio_cgroup(current);
  251. tg = throtl_find_tg(td, blkcg);
  252. if (tg) {
  253. rcu_read_unlock();
  254. return tg;
  255. }
  256. /*
  257. * Need to allocate a group. Allocation of group also needs allocation
   258. * of per cpu stats which in turn takes a mutex() and can block. Hence
  259. * we need to drop rcu lock and queue_lock before we call alloc.
  260. */
  261. rcu_read_unlock();
  262. spin_unlock_irq(q->queue_lock);
  263. tg = throtl_alloc_tg(td);
   264. /* Group allocated and queue is still alive. Take the lock */
  265. spin_lock_irq(q->queue_lock);
  266. /* Make sure @q is still alive */
  267. if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
  268. kfree(tg);
  269. return NULL;
  270. }
  271. /*
  272. * Initialize the new group. After sleeping, read the blkcg again.
  273. */
  274. rcu_read_lock();
  275. blkcg = task_blkio_cgroup(current);
  276. /*
  277. * If some other thread already allocated the group while we were
  278. * not holding queue lock, free up the group
  279. */
  280. __tg = throtl_find_tg(td, blkcg);
  281. if (__tg) {
  282. kfree(tg);
  283. rcu_read_unlock();
  284. return __tg;
  285. }
  286. /* Group allocation failed. Account the IO to root group */
  287. if (!tg) {
  288. tg = td->root_tg;
  289. return tg;
  290. }
  291. throtl_init_add_tg_lists(td, tg, blkcg);
  292. rcu_read_unlock();
  293. return tg;
  294. }
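/*
 * Note on the pattern above: the allocation happens with both the rcu lock
 * and queue_lock dropped (per cpu stats allocation can block), so after
 * re-acquiring queue_lock the code re-checks that the queue is still alive
 * and that no other thread installed a group for this blkcg meanwhile,
 * freeing the local allocation if it lost the race.
 */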
  295. static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
  296. {
  297. /* Service tree is empty */
  298. if (!root->count)
  299. return NULL;
  300. if (!root->left)
  301. root->left = rb_first(&root->rb);
  302. if (root->left)
  303. return rb_entry_tg(root->left);
  304. return NULL;
  305. }
  306. static void rb_erase_init(struct rb_node *n, struct rb_root *root)
  307. {
  308. rb_erase(n, root);
  309. RB_CLEAR_NODE(n);
  310. }
  311. static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
  312. {
  313. if (root->left == n)
  314. root->left = NULL;
  315. rb_erase_init(n, &root->rb);
  316. --root->count;
  317. }
  318. static void update_min_dispatch_time(struct throtl_rb_root *st)
  319. {
  320. struct throtl_grp *tg;
  321. tg = throtl_rb_first(st);
  322. if (!tg)
  323. return;
  324. st->min_disptime = tg->disptime;
  325. }
  326. static void
  327. tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
  328. {
  329. struct rb_node **node = &st->rb.rb_node;
  330. struct rb_node *parent = NULL;
  331. struct throtl_grp *__tg;
  332. unsigned long key = tg->disptime;
  333. int left = 1;
  334. while (*node != NULL) {
  335. parent = *node;
  336. __tg = rb_entry_tg(parent);
  337. if (time_before(key, __tg->disptime))
  338. node = &parent->rb_left;
  339. else {
  340. node = &parent->rb_right;
  341. left = 0;
  342. }
  343. }
  344. if (left)
  345. st->left = &tg->rb_node;
  346. rb_link_node(&tg->rb_node, parent, node);
  347. rb_insert_color(&tg->rb_node, &st->rb);
  348. }
  349. static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
  350. {
  351. struct throtl_rb_root *st = &td->tg_service_tree;
  352. tg_service_tree_add(st, tg);
  353. throtl_mark_tg_on_rr(tg);
  354. st->count++;
  355. }
  356. static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
  357. {
  358. if (!throtl_tg_on_rr(tg))
  359. __throtl_enqueue_tg(td, tg);
  360. }
  361. static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
  362. {
  363. throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
  364. throtl_clear_tg_on_rr(tg);
  365. }
  366. static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
  367. {
  368. if (throtl_tg_on_rr(tg))
  369. __throtl_dequeue_tg(td, tg);
  370. }
  371. static void throtl_schedule_next_dispatch(struct throtl_data *td)
  372. {
  373. struct throtl_rb_root *st = &td->tg_service_tree;
  374. /*
  375. * If there are more bios pending, schedule more work.
  376. */
  377. if (!total_nr_queued(td))
  378. return;
  379. BUG_ON(!st->count);
  380. update_min_dispatch_time(st);
  381. if (time_before_eq(st->min_disptime, jiffies))
  382. throtl_schedule_delayed_work(td, 0);
  383. else
  384. throtl_schedule_delayed_work(td, (st->min_disptime - jiffies));
  385. }
  386. static inline void
  387. throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
  388. {
  389. tg->bytes_disp[rw] = 0;
  390. tg->io_disp[rw] = 0;
  391. tg->slice_start[rw] = jiffies;
  392. tg->slice_end[rw] = jiffies + throtl_slice;
  393. throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
  394. rw == READ ? 'R' : 'W', tg->slice_start[rw],
  395. tg->slice_end[rw], jiffies);
  396. }
  397. static inline void throtl_set_slice_end(struct throtl_data *td,
  398. struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
  399. {
  400. tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
  401. }
  402. static inline void throtl_extend_slice(struct throtl_data *td,
  403. struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
  404. {
  405. tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
  406. throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
  407. rw == READ ? 'R' : 'W', tg->slice_start[rw],
  408. tg->slice_end[rw], jiffies);
  409. }
  410. /* Determine if previously allocated or extended slice is complete or not */
  411. static bool
  412. throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
  413. {
  414. if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
  415. return 0;
  416. return 1;
  417. }
  418. /* Trim the used slices and adjust slice start accordingly */
  419. static inline void
  420. throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
  421. {
  422. unsigned long nr_slices, time_elapsed, io_trim;
  423. u64 bytes_trim, tmp;
  424. BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
  425. /*
   426. * If bps are unlimited (-1), then time slices don't get
  427. * renewed. Don't try to trim the slice if slice is used. A new
  428. * slice will start when appropriate.
  429. */
  430. if (throtl_slice_used(td, tg, rw))
  431. return;
  432. /*
  433. * A bio has been dispatched. Also adjust slice_end. It might happen
  434. * that initially cgroup limit was very low resulting in high
   435. * slice_end, but later limit was bumped up and bio was dispatched
  436. * sooner, then we need to reduce slice_end. A high bogus slice_end
  437. * is bad because it does not allow new slice to start.
  438. */
  439. throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice);
  440. time_elapsed = jiffies - tg->slice_start[rw];
  441. nr_slices = time_elapsed / throtl_slice;
  442. if (!nr_slices)
  443. return;
  444. tmp = tg->bps[rw] * throtl_slice * nr_slices;
  445. do_div(tmp, HZ);
  446. bytes_trim = tmp;
  447. io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
  448. if (!bytes_trim && !io_trim)
  449. return;
  450. if (tg->bytes_disp[rw] >= bytes_trim)
  451. tg->bytes_disp[rw] -= bytes_trim;
  452. else
  453. tg->bytes_disp[rw] = 0;
  454. if (tg->io_disp[rw] >= io_trim)
  455. tg->io_disp[rw] -= io_trim;
  456. else
  457. tg->io_disp[rw] = 0;
  458. tg->slice_start[rw] += nr_slices * throtl_slice;
  459. throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
  460. " start=%lu end=%lu jiffies=%lu",
  461. rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
  462. tg->slice_start[rw], tg->slice_end[rw], jiffies);
  463. }
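/*
 * Worked example of the trim math, assuming HZ=1000 (throtl_slice = 100
 * jiffies), bps = 1048576 (1 MB/s) and iops = 100: if 250 jiffies have
 * passed since slice_start, nr_slices = 2, bytes_trim = 1048576 * 100 * 2
 * / 1000 = 209715, io_trim = 100 * 100 * 2 / 1000 = 20, and slice_start
 * advances by 200 jiffies.
 */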
  464. static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
  465. struct bio *bio, unsigned long *wait)
  466. {
  467. bool rw = bio_data_dir(bio);
  468. unsigned int io_allowed;
  469. unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
  470. u64 tmp;
  471. jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
  472. /* Slice has just started. Consider one slice interval */
  473. if (!jiffy_elapsed)
  474. jiffy_elapsed_rnd = throtl_slice;
  475. jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
  476. /*
   477. * jiffy_elapsed_rnd should not be a big value: since the minimum iops can be
   478. * 1, at most the elapsed jiffies should be equivalent to 1 second, as we
   479. * will allow a dispatch after 1 second and after that the slice should
   480. * have been trimmed.
  481. */
  482. tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
  483. do_div(tmp, HZ);
  484. if (tmp > UINT_MAX)
  485. io_allowed = UINT_MAX;
  486. else
  487. io_allowed = tmp;
  488. if (tg->io_disp[rw] + 1 <= io_allowed) {
  489. if (wait)
  490. *wait = 0;
  491. return 1;
  492. }
  493. /* Calc approx time to dispatch */
  494. jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
  495. if (jiffy_wait > jiffy_elapsed)
  496. jiffy_wait = jiffy_wait - jiffy_elapsed;
  497. else
  498. jiffy_wait = 1;
  499. if (wait)
  500. *wait = jiffy_wait;
  501. return 0;
  502. }
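/*
 * Worked example, assuming HZ=1000 and iops = 100: with 100 jiffies of the
 * slice elapsed, io_allowed = 100 * 100 / 1000 = 10. If 10 ios were already
 * dispatched, the next one waits about (11 * 1000) / 100 + 1 - 100 = 11
 * jiffies.
 */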
  503. static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
  504. struct bio *bio, unsigned long *wait)
  505. {
  506. bool rw = bio_data_dir(bio);
  507. u64 bytes_allowed, extra_bytes, tmp;
  508. unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
  509. jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
  510. /* Slice has just started. Consider one slice interval */
  511. if (!jiffy_elapsed)
  512. jiffy_elapsed_rnd = throtl_slice;
  513. jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
  514. tmp = tg->bps[rw] * jiffy_elapsed_rnd;
  515. do_div(tmp, HZ);
  516. bytes_allowed = tmp;
  517. if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
  518. if (wait)
  519. *wait = 0;
  520. return 1;
  521. }
  522. /* Calc approx time to dispatch */
  523. extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
  524. jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
  525. if (!jiffy_wait)
  526. jiffy_wait = 1;
  527. /*
  528. * This wait time is without taking into consideration the rounding
  529. * up we did. Add that time also.
  530. */
  531. jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
  532. if (wait)
  533. *wait = jiffy_wait;
  534. return 0;
  535. }
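/*
 * Worked example, assuming HZ=1000 and bps = 1048576 (1 MB/s): with 100
 * jiffies of the slice elapsed, bytes_allowed = 1048576 * 100 / 1000 =
 * 104857. A 524288-byte bio arriving when bytes_disp is already 104857 has
 * extra_bytes = 524288 and waits 524288 * 1000 / 1048576 = 500 jiffies,
 * plus whatever rounding of the elapsed time was added above.
 */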
  536. static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
  537. if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
  538. return 1;
  539. return 0;
  540. }
  541. /*
  542. * Returns whether one can dispatch a bio or not. Also returns approx number
   543. * of jiffies to wait before this bio is within the IO rate and can be dispatched.
  544. */
  545. static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
  546. struct bio *bio, unsigned long *wait)
  547. {
  548. bool rw = bio_data_dir(bio);
  549. unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
  550. /*
  551. * Currently whole state machine of group depends on first bio
  552. * queued in the group bio list. So one should not be calling
  553. * this function with a different bio if there are other bios
  554. * queued.
  555. */
  556. BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
  557. /* If tg->bps = -1, then BW is unlimited */
  558. if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
  559. if (wait)
  560. *wait = 0;
  561. return 1;
  562. }
  563. /*
  564. * If previous slice expired, start a new one otherwise renew/extend
  565. * existing slice to make sure it is at least throtl_slice interval
  566. * long since now.
  567. */
  568. if (throtl_slice_used(td, tg, rw))
  569. throtl_start_new_slice(td, tg, rw);
  570. else {
  571. if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
  572. throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
  573. }
  574. if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
  575. && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
  576. if (wait)
  577. *wait = 0;
  578. return 1;
  579. }
  580. max_wait = max(bps_wait, iops_wait);
  581. if (wait)
  582. *wait = max_wait;
  583. if (time_before(tg->slice_end[rw], jiffies + max_wait))
  584. throtl_extend_slice(td, tg, rw, jiffies + max_wait);
  585. return 0;
  586. }
  587. static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
  588. {
  589. bool rw = bio_data_dir(bio);
  590. bool sync = rw_is_sync(bio->bi_rw);
  591. /* Charge the bio to the group */
  592. tg->bytes_disp[rw] += bio->bi_size;
  593. tg->io_disp[rw]++;
  594. blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
  595. }
  596. static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
  597. struct bio *bio)
  598. {
  599. bool rw = bio_data_dir(bio);
  600. bio_list_add(&tg->bio_lists[rw], bio);
  601. /* Take a bio reference on tg */
  602. throtl_ref_get_tg(tg);
  603. tg->nr_queued[rw]++;
  604. td->nr_queued[rw]++;
  605. throtl_enqueue_tg(td, tg);
  606. }
  607. static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
  608. {
  609. unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
  610. struct bio *bio;
  611. if ((bio = bio_list_peek(&tg->bio_lists[READ])))
  612. tg_may_dispatch(td, tg, bio, &read_wait);
  613. if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
  614. tg_may_dispatch(td, tg, bio, &write_wait);
  615. min_wait = min(read_wait, write_wait);
  616. disptime = jiffies + min_wait;
  617. /* Update dispatch time */
  618. throtl_dequeue_tg(td, tg);
  619. tg->disptime = disptime;
  620. throtl_enqueue_tg(td, tg);
  621. }
  622. static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
  623. bool rw, struct bio_list *bl)
  624. {
  625. struct bio *bio;
  626. bio = bio_list_pop(&tg->bio_lists[rw]);
  627. tg->nr_queued[rw]--;
  628. /* Drop bio reference on tg */
  629. throtl_put_tg(tg);
  630. BUG_ON(td->nr_queued[rw] <= 0);
  631. td->nr_queued[rw]--;
  632. throtl_charge_bio(tg, bio);
  633. bio_list_add(bl, bio);
  634. bio->bi_rw |= REQ_THROTTLED;
  635. throtl_trim_slice(td, tg, rw);
  636. }
  637. static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
  638. struct bio_list *bl)
  639. {
  640. unsigned int nr_reads = 0, nr_writes = 0;
  641. unsigned int max_nr_reads = throtl_grp_quantum*3/4;
  642. unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
  643. struct bio *bio;
  644. /* Try to dispatch 75% READS and 25% WRITES */
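/* With the default throtl_grp_quantum of 8 that is 6 reads and 2 writes. */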
  645. while ((bio = bio_list_peek(&tg->bio_lists[READ]))
  646. && tg_may_dispatch(td, tg, bio, NULL)) {
  647. tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
  648. nr_reads++;
  649. if (nr_reads >= max_nr_reads)
  650. break;
  651. }
  652. while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
  653. && tg_may_dispatch(td, tg, bio, NULL)) {
  654. tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
  655. nr_writes++;
  656. if (nr_writes >= max_nr_writes)
  657. break;
  658. }
  659. return nr_reads + nr_writes;
  660. }
  661. static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
  662. {
  663. unsigned int nr_disp = 0;
  664. struct throtl_grp *tg;
  665. struct throtl_rb_root *st = &td->tg_service_tree;
  666. while (1) {
  667. tg = throtl_rb_first(st);
  668. if (!tg)
  669. break;
  670. if (time_before(jiffies, tg->disptime))
  671. break;
  672. throtl_dequeue_tg(td, tg);
  673. nr_disp += throtl_dispatch_tg(td, tg, bl);
  674. if (tg->nr_queued[0] || tg->nr_queued[1]) {
  675. tg_update_disptime(td, tg);
  676. throtl_enqueue_tg(td, tg);
  677. }
  678. if (nr_disp >= throtl_quantum)
  679. break;
  680. }
  681. return nr_disp;
  682. }
  683. static void throtl_process_limit_change(struct throtl_data *td)
  684. {
  685. struct throtl_grp *tg;
  686. struct hlist_node *pos, *n;
  687. if (!td->limits_changed)
  688. return;
  689. xchg(&td->limits_changed, false);
  690. throtl_log(td, "limits changed");
  691. hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
  692. if (!tg->limits_changed)
  693. continue;
  694. if (!xchg(&tg->limits_changed, false))
  695. continue;
  696. throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
  697. " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
  698. tg->iops[READ], tg->iops[WRITE]);
  699. /*
   700. * Restart the slices for both READ and WRITE. It
   701. * might happen that a group's limits are dropped
  702. * suddenly and we don't want to account recently
  703. * dispatched IO with new low rate
  704. */
  705. throtl_start_new_slice(td, tg, 0);
  706. throtl_start_new_slice(td, tg, 1);
  707. if (throtl_tg_on_rr(tg))
  708. tg_update_disptime(td, tg);
  709. }
  710. }
  711. /* Dispatch throttled bios. Should be called without queue lock held. */
  712. static int throtl_dispatch(struct request_queue *q)
  713. {
  714. struct throtl_data *td = q->td;
  715. unsigned int nr_disp = 0;
  716. struct bio_list bio_list_on_stack;
  717. struct bio *bio;
  718. struct blk_plug plug;
  719. spin_lock_irq(q->queue_lock);
  720. throtl_process_limit_change(td);
  721. if (!total_nr_queued(td))
  722. goto out;
  723. bio_list_init(&bio_list_on_stack);
  724. throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
  725. total_nr_queued(td), td->nr_queued[READ],
  726. td->nr_queued[WRITE]);
  727. nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
  728. if (nr_disp)
  729. throtl_log(td, "bios disp=%u", nr_disp);
  730. throtl_schedule_next_dispatch(td);
  731. out:
  732. spin_unlock_irq(q->queue_lock);
  733. /*
   734. * If we dispatched some requests, unplug the queue to ensure
   735. * immediate dispatch.
  736. */
  737. if (nr_disp) {
  738. blk_start_plug(&plug);
  739. while((bio = bio_list_pop(&bio_list_on_stack)))
  740. generic_make_request(bio);
  741. blk_finish_plug(&plug);
  742. }
  743. return nr_disp;
  744. }
  745. void blk_throtl_work(struct work_struct *work)
  746. {
  747. struct throtl_data *td = container_of(work, struct throtl_data,
  748. throtl_work.work);
  749. struct request_queue *q = td->queue;
  750. throtl_dispatch(q);
  751. }
  752. /* Call with queue lock held */
  753. static void
  754. throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
  755. {
  756. struct delayed_work *dwork = &td->throtl_work;
  757. /* schedule work if limits changed even if no bio is queued */
  758. if (total_nr_queued(td) || td->limits_changed) {
  759. /*
  760. * We might have a work scheduled to be executed in future.
  761. * Cancel that and schedule a new one.
  762. */
  763. __cancel_delayed_work(dwork);
  764. queue_delayed_work(kthrotld_workqueue, dwork, delay);
  765. throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
  766. delay, jiffies);
  767. }
  768. }
  769. static void
  770. throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
  771. {
   772. /* Something is wrong if we are trying to remove the same group twice */
  773. BUG_ON(hlist_unhashed(&tg->tg_node));
  774. hlist_del_init(&tg->tg_node);
  775. /*
  776. * Put the reference taken at the time of creation so that when all
  777. * queues are gone, group can be destroyed.
  778. */
  779. throtl_put_tg(tg);
  780. td->nr_undestroyed_grps--;
  781. }
  782. static void throtl_release_tgs(struct throtl_data *td)
  783. {
  784. struct hlist_node *pos, *n;
  785. struct throtl_grp *tg;
  786. hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
  787. /*
   788. * If the cgroup removal path got to the blkio_group first and removed
   789. * it from the cgroup list, then it will take care of destroying
   790. * the throtl group also.
  791. */
  792. if (!blkiocg_del_blkio_group(&tg->blkg))
  793. throtl_destroy_tg(td, tg);
  794. }
  795. }
  796. static void throtl_td_free(struct throtl_data *td)
  797. {
  798. kfree(td);
  799. }
  800. /*
  801. * Blk cgroup controller notification saying that blkio_group object is being
   802. * unlinked as the associated cgroup object is going away. That also means that
  803. * no new IO will come in this group. So get rid of this group as soon as
  804. * any pending IO in the group is finished.
  805. *
  806. * This function is called under rcu_read_lock(). key is the rcu protected
  807. * pointer. That means "key" is a valid throtl_data pointer as long as we are
   808. * under rcu read lock.
  809. *
  810. * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
   811. * it should not be NULL as even if the queue was going away, cgroup deletion
  812. * path got to it first.
  813. */
  814. void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
  815. {
  816. unsigned long flags;
  817. struct throtl_data *td = key;
  818. spin_lock_irqsave(td->queue->queue_lock, flags);
  819. throtl_destroy_tg(td, tg_of_blkg(blkg));
  820. spin_unlock_irqrestore(td->queue->queue_lock, flags);
  821. }
  822. static void throtl_update_blkio_group_common(struct throtl_data *td,
  823. struct throtl_grp *tg)
  824. {
  825. xchg(&tg->limits_changed, true);
  826. xchg(&td->limits_changed, true);
  827. /* Schedule a work now to process the limit change */
  828. throtl_schedule_delayed_work(td, 0);
  829. }
  830. /*
  831. * For all update functions, key should be a valid pointer because these
   832. * update functions are called under blkcg_lock, which means blkg is
   833. * valid and in turn key is valid. The queue exit path cannot race because
   834. * of blkcg_lock.
   835. *
   836. * Cannot take the queue lock in update functions, as taking the queue lock under
   837. * blkcg_lock is not allowed. Under other paths we take blkcg_lock under queue_lock.
  838. */
  839. static void throtl_update_blkio_group_read_bps(void *key,
  840. struct blkio_group *blkg, u64 read_bps)
  841. {
  842. struct throtl_data *td = key;
  843. struct throtl_grp *tg = tg_of_blkg(blkg);
  844. tg->bps[READ] = read_bps;
  845. throtl_update_blkio_group_common(td, tg);
  846. }
  847. static void throtl_update_blkio_group_write_bps(void *key,
  848. struct blkio_group *blkg, u64 write_bps)
  849. {
  850. struct throtl_data *td = key;
  851. struct throtl_grp *tg = tg_of_blkg(blkg);
  852. tg->bps[WRITE] = write_bps;
  853. throtl_update_blkio_group_common(td, tg);
  854. }
  855. static void throtl_update_blkio_group_read_iops(void *key,
  856. struct blkio_group *blkg, unsigned int read_iops)
  857. {
  858. struct throtl_data *td = key;
  859. struct throtl_grp *tg = tg_of_blkg(blkg);
  860. tg->iops[READ] = read_iops;
  861. throtl_update_blkio_group_common(td, tg);
  862. }
  863. static void throtl_update_blkio_group_write_iops(void *key,
  864. struct blkio_group *blkg, unsigned int write_iops)
  865. {
  866. struct throtl_data *td = key;
  867. struct throtl_grp *tg = tg_of_blkg(blkg);
  868. tg->iops[WRITE] = write_iops;
  869. throtl_update_blkio_group_common(td, tg);
  870. }
  871. static void throtl_shutdown_wq(struct request_queue *q)
  872. {
  873. struct throtl_data *td = q->td;
  874. cancel_delayed_work_sync(&td->throtl_work);
  875. }
  876. static struct blkio_policy_type blkio_policy_throtl = {
  877. .ops = {
  878. .blkio_unlink_group_fn = throtl_unlink_blkio_group,
  879. .blkio_update_group_read_bps_fn =
  880. throtl_update_blkio_group_read_bps,
  881. .blkio_update_group_write_bps_fn =
  882. throtl_update_blkio_group_write_bps,
  883. .blkio_update_group_read_iops_fn =
  884. throtl_update_blkio_group_read_iops,
  885. .blkio_update_group_write_iops_fn =
  886. throtl_update_blkio_group_write_iops,
  887. },
  888. .plid = BLKIO_POLICY_THROTL,
  889. };
  890. bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
  891. {
  892. struct throtl_data *td = q->td;
  893. struct throtl_grp *tg;
  894. bool rw = bio_data_dir(bio), update_disptime = true;
  895. struct blkio_cgroup *blkcg;
  896. bool throttled = false;
  897. if (bio->bi_rw & REQ_THROTTLED) {
  898. bio->bi_rw &= ~REQ_THROTTLED;
  899. goto out;
  900. }
  901. /*
  902. * A throtl_grp pointer retrieved under rcu can be used to access
  903. * basic fields like stats and io rates. If a group has no rules,
  904. * just update the dispatch stats in lockless manner and return.
  905. */
  906. rcu_read_lock();
  907. blkcg = task_blkio_cgroup(current);
  908. tg = throtl_find_tg(td, blkcg);
  909. if (tg) {
  910. throtl_tg_fill_dev_details(td, tg);
  911. if (tg_no_rule_group(tg, rw)) {
  912. blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
  913. rw, rw_is_sync(bio->bi_rw));
  914. rcu_read_unlock();
  915. goto out;
  916. }
  917. }
  918. rcu_read_unlock();
  919. /*
  920. * Either group has not been allocated yet or it is not an unlimited
  921. * IO group
  922. */
  923. spin_lock_irq(q->queue_lock);
  924. tg = throtl_get_tg(td);
  925. if (unlikely(!tg))
  926. goto out_unlock;
  927. if (tg->nr_queued[rw]) {
  928. /*
  929. * There is already another bio queued in same dir. No
  930. * need to update dispatch time.
  931. */
  932. update_disptime = false;
  933. goto queue_bio;
  934. }
   935. /* Bio is within the rate limit of the group */
  936. if (tg_may_dispatch(td, tg, bio, NULL)) {
  937. throtl_charge_bio(tg, bio);
  938. /*
  939. * We need to trim slice even when bios are not being queued
  940. * otherwise it might happen that a bio is not queued for
  941. * a long time and slice keeps on extending and trim is not
  942. * called for a long time. Now if limits are reduced suddenly
  943. * we take into account all the IO dispatched so far at new
   944. * low rate and newly queued IO gets a really long dispatch
  945. * time.
  946. *
  947. * So keep on trimming slice even if bio is not queued.
  948. */
  949. throtl_trim_slice(td, tg, rw);
  950. goto out_unlock;
  951. }
  952. queue_bio:
  953. throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
  954. " iodisp=%u iops=%u queued=%d/%d",
  955. rw == READ ? 'R' : 'W',
  956. tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
  957. tg->io_disp[rw], tg->iops[rw],
  958. tg->nr_queued[READ], tg->nr_queued[WRITE]);
  959. throtl_add_bio_tg(q->td, tg, bio);
  960. throttled = true;
  961. if (update_disptime) {
  962. tg_update_disptime(td, tg);
  963. throtl_schedule_next_dispatch(td);
  964. }
  965. out_unlock:
  966. spin_unlock_irq(q->queue_lock);
  967. out:
  968. return throttled;
  969. }
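/*
 * Summary of the paths above: a bio belonging to a group with no rules is
 * passed through after a lockless stats update; a bio within the group's
 * current rate is charged and dispatched immediately; otherwise it is
 * queued on the group, the delayed work is (re)scheduled, and true is
 * returned so the caller does not submit the bio itself.
 */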
  970. int blk_throtl_init(struct request_queue *q)
  971. {
  972. struct throtl_data *td;
  973. struct throtl_grp *tg;
  974. td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
  975. if (!td)
  976. return -ENOMEM;
  977. INIT_HLIST_HEAD(&td->tg_list);
  978. td->tg_service_tree = THROTL_RB_ROOT;
  979. td->limits_changed = false;
  980. INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
   981. /* Alloc and init root group. */
  982. td->queue = q;
  983. tg = throtl_alloc_tg(td);
  984. if (!tg) {
  985. kfree(td);
  986. return -ENOMEM;
  987. }
  988. td->root_tg = tg;
  989. rcu_read_lock();
  990. throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup);
  991. rcu_read_unlock();
  992. /* Attach throtl data to request queue */
  993. q->td = td;
  994. return 0;
  995. }
  996. void blk_throtl_exit(struct request_queue *q)
  997. {
  998. struct throtl_data *td = q->td;
  999. bool wait = false;
  1000. BUG_ON(!td);
  1001. throtl_shutdown_wq(q);
  1002. spin_lock_irq(q->queue_lock);
  1003. throtl_release_tgs(td);
  1004. /* If there are other groups */
  1005. if (td->nr_undestroyed_grps > 0)
  1006. wait = true;
  1007. spin_unlock_irq(q->queue_lock);
  1008. /*
  1009. * Wait for tg->blkg->key accessors to exit their grace periods.
  1010. * Do this wait only if there are other undestroyed groups out
  1011. * there (other than root group). This can happen if cgroup deletion
  1012. * path claimed the responsibility of cleaning up a group before
   1013. * queue cleanup code gets to the group.
  1014. *
  1015. * Do not call synchronize_rcu() unconditionally as there are drivers
  1016. * which create/delete request queue hundreds of times during scan/boot
  1017. * and synchronize_rcu() can take significant time and slow down boot.
  1018. */
  1019. if (wait)
  1020. synchronize_rcu();
  1021. /*
   1022. * Just being safe to make sure that, after the previous flush, if somebody
   1023. * updated limits through cgroup and another work got queued, we cancel
   1024. * it.
  1025. */
  1026. throtl_shutdown_wq(q);
  1027. throtl_td_free(td);
  1028. }
  1029. static int __init throtl_init(void)
  1030. {
  1031. kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
  1032. if (!kthrotld_workqueue)
  1033. panic("Failed to create kthrotld\n");
  1034. blkio_policy_register(&blkio_policy_throtl);
  1035. return 0;
  1036. }
  1037. module_init(throtl_init);