/*
 * Anticipatory & deadline i/o scheduler.
 *
 * Copyright (C) 2002 Jens Axboe <axboe@suse.de>
 *                    Nick Piggin <nickpiggin@yahoo.com.au>
 *
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/rbtree.h>
#include <linux/interrupt.h>

#define REQ_SYNC	1
#define REQ_ASYNC	0
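
/*
 * Throughout this file REQ_SYNC and REQ_ASYNC index the per-direction
 * arrays (sort_list[], fifo_list[], next_arq[], fifo_expire[], ...).
 * "Sync" roughly means reads plus writes marked REQ_RW_SYNC, "async"
 * means ordinary writes; see as_add_request() for the classification.
 */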

/*
 * See Documentation/block/as-iosched.txt
 */

/*
 * max time before a read is submitted.
 */
#define default_read_expire (HZ / 8)

/*
 * ditto for writes, these limits are not hard, even
 * if the disk is capable of satisfying them.
 */
#define default_write_expire (HZ / 4)

/*
 * read_batch_expire describes how long we will allow a stream of reads to
 * persist before looking to see whether it is time to switch over to writes.
 */
#define default_read_batch_expire (HZ / 2)

/*
 * write_batch_expire describes how long we want a stream of writes to run for.
 * This is not a hard limit, but a target we set for the auto-tuning thingy.
 * See, the problem is: we can send a lot of writes to disk cache / TCQ in
 * a short amount of time...
 */
#define default_write_batch_expire (HZ / 8)

/*
 * max time we may wait to anticipate a read (default around 6ms)
 */
#define default_antic_expire ((HZ / 150) ? HZ / 150 : 1)
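
/*
 * In wall-clock terms the defaults above work out to roughly: read fifo
 * expiry 125ms, write fifo expiry 250ms, read batch 500ms, write batch
 * 125ms. default_antic_expire targets about 6.7ms (HZ/150), rounded down
 * to whole jiffies and clamped to at least one jiffy on low-HZ configs.
 */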

/*
 * Keep track of up to 20ms thinktimes. We can go as big as we like here,
 * however huge values tend to interfere and not decay fast enough. A program
 * might be in a non-io phase of operation. Waiting on user input for example,
 * or doing a lengthy computation. A small penalty can be justified there, and
 * will still catch out those processes that constantly have large thinktimes.
 */
#define MAX_THINKTIME (HZ/50UL)

/* Bits in as_io_context.state */
enum as_io_states {
	AS_TASK_RUNNING=0,	/* Process has not exited */
	AS_TASK_IOSTARTED,	/* Process has started some IO */
	AS_TASK_IORUNNING,	/* Process has completed some IO */
};

enum anticipation_status {
	ANTIC_OFF=0,		/* Not anticipating (normal operation)	*/
	ANTIC_WAIT_REQ,		/* The last read has not yet completed */
	ANTIC_WAIT_NEXT,	/* Currently anticipating a request vs
				   last read (which has completed) */
	ANTIC_FINISHED,		/* Anticipating but have found a candidate
				 * or timed out */
};
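
/*
 * State transitions, as implemented below: as_antic_waitreq() moves
 * ANTIC_OFF to ANTIC_WAIT_REQ (or straight to ANTIC_WAIT_NEXT when the
 * previous request has already completed); as_completed_request() and
 * as_antic_waitnext() move WAIT_REQ to WAIT_NEXT and arm the timer;
 * as_antic_stop() and as_antic_timeout() end the wait in ANTIC_FINISHED,
 * and as_move_to_dispatch() resets the status to ANTIC_OFF.
 */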

struct as_data {
	/*
	 * run time data
	 */
	struct request_queue *q;	/* the "owner" queue */

	/*
	 * requests (as_rq s) are present on both sort_list and fifo_list
	 */
	struct rb_root sort_list[2];
	struct list_head fifo_list[2];

	struct as_rq *next_arq[2];	/* next in sort order */
	sector_t last_sector[2];	/* last REQ_SYNC & REQ_ASYNC sectors */

	unsigned long exit_prob;	/* probability a task will exit while
					   being waited on */
	unsigned long exit_no_coop;	/* probability an exited task will
					   not be part of a later cooperating
					   request */
	unsigned long new_ttime_total; 	/* mean thinktime on new proc */
	unsigned long new_ttime_mean;
	u64 new_seek_total;		/* mean seek on new proc */
	sector_t new_seek_mean;

	unsigned long current_batch_expires;
	unsigned long last_check_fifo[2];
	int changed_batch;		/* 1: waiting for old batch to end */
	int new_batch;			/* 1: waiting on first read complete */
	int batch_data_dir;		/* current batch REQ_SYNC / REQ_ASYNC */
	int write_batch_count;		/* max # of reqs in a write batch */
	int current_write_count;	/* how many requests left this batch */
	int write_batch_idled;		/* has the write batch gone idle? */
	mempool_t *arq_pool;

	enum anticipation_status antic_status;
	unsigned long antic_start;	/* jiffies: when it started */
	struct timer_list antic_timer;	/* anticipatory scheduling timer */
	struct work_struct antic_work;	/* Deferred unplugging */
	struct io_context *io_context;	/* Identify the expected process */
	int ioc_finished;		/* IO associated with io_context is finished */
	int nr_dispatched;

	/*
	 * settings that change how the i/o scheduler behaves
	 */
	unsigned long fifo_expire[2];
	unsigned long batch_expire[2];
	unsigned long antic_expire;
};

#define list_entry_fifo(ptr)	list_entry((ptr), struct as_rq, fifo)

/*
 * per-request data.
 */
enum arq_state {
	AS_RQ_NEW=0,		/* New - not referenced and not on any lists */
	AS_RQ_QUEUED,		/* In the request queue. It belongs to the
				   scheduler */
	AS_RQ_DISPATCHED,	/* On the dispatch list. It belongs to the
				   driver now */
	AS_RQ_PRESCHED,		/* Debug poisoning for requests being used */
	AS_RQ_REMOVED,
	AS_RQ_MERGED,
	AS_RQ_POSTSCHED,	/* when they shouldn't be */
};

struct as_rq {
	/*
	 * rbtree index, key is the starting offset
	 */
	struct rb_node rb_node;
	sector_t rb_key;

	struct request *request;

	struct io_context *io_context;	/* The submitting task */

	/*
	 * expire fifo
	 */
	struct list_head fifo;
	unsigned long expires;

	unsigned int is_sync;
	enum arq_state state;
};
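
/*
 * Each struct as_rq shadows one struct request and is linked into two
 * structures at once: the per-direction rbtree (sorted by starting sector,
 * used for one-way elevator ordering) and the per-direction fifo (sorted
 * by submission time, used for expiry). as_rq.state tracks where in the
 * queue/dispatch/complete lifecycle the request is, mainly for debugging.
 */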

#define RQ_DATA(rq)	((struct as_rq *) (rq)->elevator_private)

static kmem_cache_t *arq_pool;

static atomic_t ioc_count = ATOMIC_INIT(0);
static struct completion *ioc_gone;

static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq);
static void as_antic_stop(struct as_data *ad);

/*
 * IO Context helper functions
 */

/* Called to deallocate the as_io_context */
static void free_as_io_context(struct as_io_context *aic)
{
	kfree(aic);
	if (atomic_dec_and_test(&ioc_count) && ioc_gone)
		complete(ioc_gone);
}

static void as_trim(struct io_context *ioc)
{
	if (ioc->aic)
		free_as_io_context(ioc->aic);
	ioc->aic = NULL;
}

/* Called when the task exits */
static void exit_as_io_context(struct as_io_context *aic)
{
	WARN_ON(!test_bit(AS_TASK_RUNNING, &aic->state));
	clear_bit(AS_TASK_RUNNING, &aic->state);
}

static struct as_io_context *alloc_as_io_context(void)
{
	struct as_io_context *ret;

	ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
	if (ret) {
		ret->dtor = free_as_io_context;
		ret->exit = exit_as_io_context;
		ret->state = 1 << AS_TASK_RUNNING;
		atomic_set(&ret->nr_queued, 0);
		atomic_set(&ret->nr_dispatched, 0);
		spin_lock_init(&ret->lock);
		ret->ttime_total = 0;
		ret->ttime_samples = 0;
		ret->ttime_mean = 0;
		ret->seek_total = 0;
		ret->seek_samples = 0;
		ret->seek_mean = 0;
		atomic_inc(&ioc_count);
	}

	return ret;
}

/*
 * If the current task has no AS IO context then create one and initialise it.
 * Then take a ref on the task's io context and return it.
 */
static struct io_context *as_get_io_context(void)
{
	struct io_context *ioc = get_io_context(GFP_ATOMIC);
	if (ioc && !ioc->aic) {
		ioc->aic = alloc_as_io_context();
		if (!ioc->aic) {
			put_io_context(ioc);
			ioc = NULL;
		}
	}
	return ioc;
}

static void as_put_io_context(struct as_rq *arq)
{
	struct as_io_context *aic;

	if (unlikely(!arq->io_context))
		return;

	aic = arq->io_context->aic;

	if (arq->is_sync == REQ_SYNC && aic) {
		spin_lock(&aic->lock);
		set_bit(AS_TASK_IORUNNING, &aic->state);
		aic->last_end_request = jiffies;
		spin_unlock(&aic->lock);
	}

	put_io_context(arq->io_context);
}

/*
 * rb tree support functions
 */
#define rb_entry_arq(node)	rb_entry((node), struct as_rq, rb_node)
#define ARQ_RB_ROOT(ad, arq)	(&(ad)->sort_list[(arq)->is_sync])
#define rq_rb_key(rq)		(rq)->sector

/*
 * as_find_first_arq finds the first (lowest sector numbered) request
 * for the specified data_dir. Used to sweep back to the start of the disk
 * (1-way elevator) after we process the last (highest sector) request.
 */
static struct as_rq *as_find_first_arq(struct as_data *ad, int data_dir)
{
	struct rb_node *n = ad->sort_list[data_dir].rb_node;

	if (n == NULL)
		return NULL;

	for (;;) {
		if (n->rb_left == NULL)
			return rb_entry_arq(n);

		n = n->rb_left;
	}
}

/*
 * Add the request to the rb tree if it is unique. If there is an alias (an
 * existing request against the same sector), which can happen when using
 * direct IO, then return the alias.
 */
static struct as_rq *__as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
{
	struct rb_node **p = &ARQ_RB_ROOT(ad, arq)->rb_node;
	struct rb_node *parent = NULL;
	struct as_rq *__arq;
	struct request *rq = arq->request;

	arq->rb_key = rq_rb_key(rq);

	while (*p) {
		parent = *p;
		__arq = rb_entry_arq(parent);

		if (arq->rb_key < __arq->rb_key)
			p = &(*p)->rb_left;
		else if (arq->rb_key > __arq->rb_key)
			p = &(*p)->rb_right;
		else
			return __arq;
	}

	rb_link_node(&arq->rb_node, parent, p);
	rb_insert_color(&arq->rb_node, ARQ_RB_ROOT(ad, arq));

	return NULL;
}

static void as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
{
	struct as_rq *alias;

	while ((unlikely(alias = __as_add_arq_rb(ad, arq)))) {
		as_move_to_dispatch(ad, alias);
		as_antic_stop(ad);
	}
}

static inline void as_del_arq_rb(struct as_data *ad, struct as_rq *arq)
{
	if (!RB_EMPTY_NODE(&arq->rb_node)) {
		WARN_ON(1);
		return;
	}

	rb_erase(&arq->rb_node, ARQ_RB_ROOT(ad, arq));
	RB_CLEAR_NODE(&arq->rb_node);
}

static struct request *
as_find_arq_rb(struct as_data *ad, sector_t sector, int data_dir)
{
	struct rb_node *n = ad->sort_list[data_dir].rb_node;
	struct as_rq *arq;

	while (n) {
		arq = rb_entry_arq(n);

		if (sector < arq->rb_key)
			n = n->rb_left;
		else if (sector > arq->rb_key)
			n = n->rb_right;
		else
			return arq->request;
	}

	return NULL;
}

/*
 * IO Scheduler proper
 */

#define MAXBACK (1024 * 1024)	/*
				 * Maximum distance the disk will go backward
				 * for a request.
				 */

#define BACK_PENALTY	2

/*
 * as_choose_req selects the preferred one of two requests of the same data_dir
 * ignoring time - eg. timeouts, which is the job of as_dispatch_request
 */
static struct as_rq *
as_choose_req(struct as_data *ad, struct as_rq *arq1, struct as_rq *arq2)
{
	int data_dir;
	sector_t last, s1, s2, d1, d2;
	int r1_wrap=0, r2_wrap=0;	/* requests are behind the disk head */
	const sector_t maxback = MAXBACK;

	if (arq1 == NULL || arq1 == arq2)
		return arq2;
	if (arq2 == NULL)
		return arq1;

	data_dir = arq1->is_sync;

	last = ad->last_sector[data_dir];
	s1 = arq1->request->sector;
	s2 = arq2->request->sector;

	BUG_ON(data_dir != arq2->is_sync);

	/*
	 * Strict one way elevator _except_ in the case where we allow
	 * short backward seeks which are biased as twice the cost of a
	 * similar forward seek.
	 */
	if (s1 >= last)
		d1 = s1 - last;
	else if (s1+maxback >= last)
		d1 = (last - s1)*BACK_PENALTY;
	else {
		r1_wrap = 1;
		d1 = 0; /* shut up, gcc */
	}

	if (s2 >= last)
		d2 = s2 - last;
	else if (s2+maxback >= last)
		d2 = (last - s2)*BACK_PENALTY;
	else {
		r2_wrap = 1;
		d2 = 0;
	}

	/* Found required data */
	if (!r1_wrap && r2_wrap)
		return arq1;
	else if (!r2_wrap && r1_wrap)
		return arq2;
	else if (r1_wrap && r2_wrap) {
		/* both behind the head */
		if (s1 <= s2)
			return arq1;
		else
			return arq2;
	}

	/* Both requests in front of the head */
	if (d1 < d2)
		return arq1;
	else if (d2 < d1)
		return arq2;
	else {
		if (s1 >= s2)
			return arq1;
		else
			return arq2;
	}
}
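
/*
 * Worked example of the distance metric above: with last_sector == 1000,
 * a request starting at sector 1100 gets d = 100, while one starting at
 * sector 900 gets d = (1000 - 900) * BACK_PENALTY = 200, so the forward
 * request wins even though it is physically further from the head.
 * Requests more than MAXBACK sectors behind the head ("wrapped") are only
 * chosen when no candidate ahead of the head is available.
 */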

/*
 * as_find_next_arq finds the next request after @prev in elevator order.
 * This and as_choose_req form the basis for how the scheduler chooses
 * what request to process next. Anticipation works on top of this.
 */
static struct as_rq *as_find_next_arq(struct as_data *ad, struct as_rq *last)
{
	const int data_dir = last->is_sync;
	struct as_rq *ret;
	struct rb_node *rbnext = rb_next(&last->rb_node);
	struct rb_node *rbprev = rb_prev(&last->rb_node);
	struct as_rq *arq_next, *arq_prev;

	BUG_ON(!RB_EMPTY_NODE(&last->rb_node));

	if (rbprev)
		arq_prev = rb_entry_arq(rbprev);
	else
		arq_prev = NULL;

	if (rbnext)
		arq_next = rb_entry_arq(rbnext);
	else {
		arq_next = as_find_first_arq(ad, data_dir);
		if (arq_next == last)
			arq_next = NULL;
	}

	ret = as_choose_req(ad, arq_next, arq_prev);

	return ret;
}

/*
 * anticipatory scheduling functions follow
 */

/*
 * as_antic_expired tells us when we have anticipated too long.
 * The funny "absolute difference" math on the elapsed time is to handle
 * jiffy wraps, and disks which have been idle for 0x80000000 jiffies.
 */
static int as_antic_expired(struct as_data *ad)
{
	long delta_jif;

	delta_jif = jiffies - ad->antic_start;
	if (unlikely(delta_jif < 0))
		delta_jif = -delta_jif;
	if (delta_jif < ad->antic_expire)
		return 0;

	return 1;
}

/*
 * as_antic_waitnext starts anticipating that a nice request will soon be
 * submitted. See also as_antic_waitreq
 */
static void as_antic_waitnext(struct as_data *ad)
{
	unsigned long timeout;

	BUG_ON(ad->antic_status != ANTIC_OFF
			&& ad->antic_status != ANTIC_WAIT_REQ);

	timeout = ad->antic_start + ad->antic_expire;

	mod_timer(&ad->antic_timer, timeout);

	ad->antic_status = ANTIC_WAIT_NEXT;
}

/*
 * as_antic_waitreq starts anticipating. We don't start timing the anticipation
 * until the request that we're anticipating on has finished. This means we
 * are timing from when the candidate process wakes up hopefully.
 */
static void as_antic_waitreq(struct as_data *ad)
{
	BUG_ON(ad->antic_status == ANTIC_FINISHED);
	if (ad->antic_status == ANTIC_OFF) {
		if (!ad->io_context || ad->ioc_finished)
			as_antic_waitnext(ad);
		else
			ad->antic_status = ANTIC_WAIT_REQ;
	}
}

/*
 * This is called directly by the functions in this file to stop anticipation.
 * We kill the timer and schedule a call to the request_fn asap.
 */
static void as_antic_stop(struct as_data *ad)
{
	int status = ad->antic_status;

	if (status == ANTIC_WAIT_REQ || status == ANTIC_WAIT_NEXT) {
		if (status == ANTIC_WAIT_NEXT)
			del_timer(&ad->antic_timer);
		ad->antic_status = ANTIC_FINISHED;
		/* see as_work_handler */
		kblockd_schedule_work(&ad->antic_work);
	}
}

/*
 * as_antic_timeout is the timer function set by as_antic_waitnext.
 */
static void as_antic_timeout(unsigned long data)
{
	struct request_queue *q = (struct request_queue *)data;
	struct as_data *ad = q->elevator->elevator_data;
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	if (ad->antic_status == ANTIC_WAIT_REQ
			|| ad->antic_status == ANTIC_WAIT_NEXT) {
		struct as_io_context *aic = ad->io_context->aic;

		ad->antic_status = ANTIC_FINISHED;
		kblockd_schedule_work(&ad->antic_work);

		if (aic->ttime_samples == 0) {
			/* process anticipated on has exited or timed out */
			ad->exit_prob = (7*ad->exit_prob + 256)/8;
		}
		if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
			/* process not "saved" by a cooperating request */
			ad->exit_no_coop = (7*ad->exit_no_coop + 256)/8;
		}
	}
	spin_unlock_irqrestore(q->queue_lock, flags);
}

static void as_update_thinktime(struct as_data *ad, struct as_io_context *aic,
				unsigned long ttime)
{
	/* fixed point: 1.0 == 1<<8 */
	if (aic->ttime_samples == 0) {
		ad->new_ttime_total = (7*ad->new_ttime_total + 256*ttime) / 8;
		ad->new_ttime_mean = ad->new_ttime_total / 256;

		ad->exit_prob = (7*ad->exit_prob)/8;
	}
	aic->ttime_samples = (7*aic->ttime_samples + 256) / 8;
	aic->ttime_total = (7*aic->ttime_total + 256*ttime) / 8;
	aic->ttime_mean = (aic->ttime_total + 128) / aic->ttime_samples;
}
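
/*
 * The statistics above are exponentially weighted moving averages kept in
 * 8-bit fixed point (1.0 == 1 << 8). Each new sample contributes 1/8 of
 * its weight and the accumulated history decays by 7/8, so for instance a
 * steady stream of 4-jiffy thinktimes converges on ttime_total == 4*256
 * and ttime_mean == 4; the "+ 128" rounds the mean to nearest.
 */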

static void as_update_seekdist(struct as_data *ad, struct as_io_context *aic,
				sector_t sdist)
{
	u64 total;

	if (aic->seek_samples == 0) {
		ad->new_seek_total = (7*ad->new_seek_total + 256*(u64)sdist)/8;
		ad->new_seek_mean = ad->new_seek_total / 256;
	}

	/*
	 * Don't allow the seek distance to get too large from the
	 * odd fragment, pagein, etc
	 */
	if (aic->seek_samples <= 60) /* second&third seek */
		sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*1024);
	else
		sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*64);

	aic->seek_samples = (7*aic->seek_samples + 256) / 8;
	aic->seek_total = (7*aic->seek_total + (u64)256*sdist) / 8;
	total = aic->seek_total + (aic->seek_samples/2);
	do_div(total, aic->seek_samples);
	aic->seek_mean = (sector_t)total;
}
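
/*
 * Seek distances use the same decaying fixed-point scheme as thinktimes,
 * with one extra twist: each new sample is clamped to at most four times
 * the current mean plus a constant slack, so a single stray seek cannot
 * blow up the average. The slack is larger for roughly the first two
 * samples (seek_samples <= 60 in fixed point) than once a history exists.
 */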

/*
 * as_update_iohist keeps a decaying histogram of IO thinktimes, and
 * updates @aic->ttime_mean based on that. It is called when a new
 * request is queued.
 */
static void as_update_iohist(struct as_data *ad, struct as_io_context *aic,
				struct request *rq)
{
	struct as_rq *arq = RQ_DATA(rq);
	int data_dir = arq->is_sync;
	unsigned long thinktime = 0;
	sector_t seek_dist;

	if (aic == NULL)
		return;

	if (data_dir == REQ_SYNC) {
		unsigned long in_flight = atomic_read(&aic->nr_queued)
					+ atomic_read(&aic->nr_dispatched);
		spin_lock(&aic->lock);
		if (test_bit(AS_TASK_IORUNNING, &aic->state) ||
			test_bit(AS_TASK_IOSTARTED, &aic->state)) {
			/* Calculate read -> read thinktime */
			if (test_bit(AS_TASK_IORUNNING, &aic->state)
			    && in_flight == 0) {
				thinktime = jiffies - aic->last_end_request;
				thinktime = min(thinktime, MAX_THINKTIME-1);
			}
			as_update_thinktime(ad, aic, thinktime);

			/* Calculate read -> read seek distance */
			if (aic->last_request_pos < rq->sector)
				seek_dist = rq->sector - aic->last_request_pos;
			else
				seek_dist = aic->last_request_pos - rq->sector;
			as_update_seekdist(ad, aic, seek_dist);
		}
		aic->last_request_pos = rq->sector + rq->nr_sectors;
		set_bit(AS_TASK_IOSTARTED, &aic->state);
		spin_unlock(&aic->lock);
	}
}

/*
 * as_close_req decides if one request is considered "close" to the
 * previous one issued.
 */
static int as_close_req(struct as_data *ad, struct as_io_context *aic,
			struct as_rq *arq)
{
	unsigned long delay;	/* milliseconds */
	sector_t last = ad->last_sector[ad->batch_data_dir];
	sector_t next = arq->request->sector;
	sector_t delta;	/* acceptable close offset (in sectors) */
	sector_t s;

	if (ad->antic_status == ANTIC_OFF || !ad->ioc_finished)
		delay = 0;
	else
		delay = ((jiffies - ad->antic_start) * 1000) / HZ;

	if (delay == 0)
		delta = 8192;
	else if (delay <= 20 && delay <= ad->antic_expire)
		delta = 8192 << delay;
	else
		return 1;

	if ((last <= next + (delta>>1)) && (next <= last + delta))
		return 1;

	if (last < next)
		s = next - last;
	else
		s = last - next;

	if (aic->seek_samples == 0) {
		/*
		 * Process has just started IO. Use past statistics to
		 * gauge success possibility
		 */
		if (ad->new_seek_mean > s) {
			/* this request is better than what we're expecting */
			return 1;
		}

	} else {
		if (aic->seek_mean > s) {
			/* this request is better than what we're expecting */
			return 1;
		}
	}

	return 0;
}
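
/*
 * The "close" window above starts at 8192 sectors (4MB with 512-byte
 * sectors) and doubles for every millisecond already spent anticipating,
 * up to 20ms and the antic_expire limit; once we have waited longer than
 * that, any request from a cooperating process is treated as close.
 * Outside the window, a request still counts as close if it lands nearer
 * than the process's mean seek distance.
 */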

/*
 * as_can_break_anticipation returns true if we have been anticipating this
 * request.
 *
 * It also returns true if the process against which we are anticipating
 * submits a write - that's presumably an fsync, O_SYNC write, etc. We want to
 * dispatch it ASAP, because we know that application will not be submitting
 * any new reads.
 *
 * If the task which has submitted the request has exited, break anticipation.
 *
 * If this task has queued some other IO, do not enter anticipation.
 */
static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq)
{
	struct io_context *ioc;
	struct as_io_context *aic;

	ioc = ad->io_context;
	BUG_ON(!ioc);

	if (arq && ioc == arq->io_context) {
		/* request from same process */
		return 1;
	}

	if (ad->ioc_finished && as_antic_expired(ad)) {
		/*
		 * In this situation status should really be FINISHED,
		 * however the timer hasn't had the chance to run yet.
		 */
		return 1;
	}

	aic = ioc->aic;
	if (!aic)
		return 0;

	if (atomic_read(&aic->nr_queued) > 0) {
		/* process has more requests queued */
		return 1;
	}

	if (atomic_read(&aic->nr_dispatched) > 0) {
		/* process has more requests dispatched */
		return 1;
	}

	if (arq && arq->is_sync == REQ_SYNC && as_close_req(ad, aic, arq)) {
		/*
		 * Found a close request that is not one of ours.
		 *
		 * This makes close requests from another process update
		 * our IO history. It is generally useful when there are
		 * two or more cooperating processes working in the same
		 * area.
		 */
		if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
			if (aic->ttime_samples == 0)
				ad->exit_prob = (7*ad->exit_prob + 256)/8;

			ad->exit_no_coop = (7*ad->exit_no_coop)/8;
		}

		as_update_iohist(ad, aic, arq->request);
		return 1;
	}

	if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
		/* process anticipated on has exited */
		if (aic->ttime_samples == 0)
			ad->exit_prob = (7*ad->exit_prob + 256)/8;

		if (ad->exit_no_coop > 128)
			return 1;
	}

	if (aic->ttime_samples == 0) {
		if (ad->new_ttime_mean > ad->antic_expire)
			return 1;
		if (ad->exit_prob * ad->exit_no_coop > 128*256)
			return 1;
	} else if (aic->ttime_mean > ad->antic_expire) {
		/* the process thinks too much between requests */
		return 1;
	}

	return 0;
}

/*
 * as_can_anticipate indicates whether we should either run arq
 * or keep anticipating a better request.
 */
static int as_can_anticipate(struct as_data *ad, struct as_rq *arq)
{
	if (!ad->io_context)
		/*
		 * Last request submitted was a write
		 */
		return 0;

	if (ad->antic_status == ANTIC_FINISHED)
		/*
		 * Don't restart if we have just finished. Run the next request
		 */
		return 0;

	if (as_can_break_anticipation(ad, arq))
		/*
		 * This request is a good candidate. Don't keep anticipating,
		 * run it.
		 */
		return 0;

	/*
	 * OK from here, we haven't finished, and don't have a decent request!
	 * Status is either ANTIC_OFF so start waiting,
	 * ANTIC_WAIT_REQ so continue waiting for request to finish
	 * or ANTIC_WAIT_NEXT so continue waiting for an acceptable request.
	 */

	return 1;
}

/*
 * as_update_arq must be called whenever a request (arq) is added to
 * the sort_list. This function keeps caches up to date, and checks if the
 * request might be one we are "anticipating"
 */
static void as_update_arq(struct as_data *ad, struct as_rq *arq)
{
	const int data_dir = arq->is_sync;

	/* keep the next_arq cache up to date */
	ad->next_arq[data_dir] = as_choose_req(ad, arq, ad->next_arq[data_dir]);

	/*
	 * have we been anticipating this request?
	 * or does it come from the same process as the one we are anticipating
	 * for?
	 */
	if (ad->antic_status == ANTIC_WAIT_REQ
			|| ad->antic_status == ANTIC_WAIT_NEXT) {
		if (as_can_break_anticipation(ad, arq))
			as_antic_stop(ad);
	}
}

/*
 * Gathers timings and resizes the write batch automatically
 */
static void update_write_batch(struct as_data *ad)
{
	unsigned long batch = ad->batch_expire[REQ_ASYNC];
	long write_time;

	write_time = (jiffies - ad->current_batch_expires) + batch;
	if (write_time < 0)
		write_time = 0;

	if (write_time > batch && !ad->write_batch_idled) {
		if (write_time > batch * 3)
			ad->write_batch_count /= 2;
		else
			ad->write_batch_count--;
	} else if (write_time < batch && ad->current_write_count == 0) {
		if (batch > write_time * 3)
			ad->write_batch_count *= 2;
		else
			ad->write_batch_count++;
	}

	if (ad->write_batch_count < 1)
		ad->write_batch_count = 1;
}
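
/*
 * In other words: if the last write batch took longer than its time budget
 * (and was not idle), shrink write_batch_count - by half if it overran the
 * budget more than threefold, otherwise by one. If it finished early and
 * used up its whole request quota, grow the count correspondingly. The
 * count never drops below one request per batch.
 */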

/*
 * as_completed_request is to be called when a request has completed and
 * returned something to the requesting process, be it an error or data.
 */
static void as_completed_request(request_queue_t *q, struct request *rq)
{
	struct as_data *ad = q->elevator->elevator_data;
	struct as_rq *arq = RQ_DATA(rq);

	WARN_ON(!list_empty(&rq->queuelist));

	if (arq->state != AS_RQ_REMOVED) {
		printk("arq->state %d\n", arq->state);
		WARN_ON(1);
		goto out;
	}

	if (ad->changed_batch && ad->nr_dispatched == 1) {
		kblockd_schedule_work(&ad->antic_work);
		ad->changed_batch = 0;

		if (ad->batch_data_dir == REQ_SYNC)
			ad->new_batch = 1;
	}
	WARN_ON(ad->nr_dispatched == 0);
	ad->nr_dispatched--;

	/*
	 * Start counting the batch from when a request of that direction is
	 * actually serviced. This should help devices with big TCQ windows
	 * and writeback caches
	 */
	if (ad->new_batch && ad->batch_data_dir == arq->is_sync) {
		update_write_batch(ad);
		ad->current_batch_expires = jiffies +
				ad->batch_expire[REQ_SYNC];
		ad->new_batch = 0;
	}

	if (ad->io_context == arq->io_context && ad->io_context) {
		ad->antic_start = jiffies;
		ad->ioc_finished = 1;
		if (ad->antic_status == ANTIC_WAIT_REQ) {
			/*
			 * We were waiting on this request, now anticipate
			 * the next one
			 */
			as_antic_waitnext(ad);
		}
	}

	as_put_io_context(arq);
out:
	arq->state = AS_RQ_POSTSCHED;
}

/*
 * as_remove_queued_request removes a request from the pre dispatch queue
 * without updating refcounts. It is expected the caller will drop the
 * reference unless it replaces the request at some part of the elevator
 * (ie. the dispatch queue)
 */
static void as_remove_queued_request(request_queue_t *q, struct request *rq)
{
	struct as_rq *arq = RQ_DATA(rq);
	const int data_dir = arq->is_sync;
	struct as_data *ad = q->elevator->elevator_data;

	WARN_ON(arq->state != AS_RQ_QUEUED);

	if (arq->io_context && arq->io_context->aic) {
		BUG_ON(!atomic_read(&arq->io_context->aic->nr_queued));
		atomic_dec(&arq->io_context->aic->nr_queued);
	}

	/*
	 * Update the "next_arq" cache if we are about to remove its
	 * entry
	 */
	if (ad->next_arq[data_dir] == arq)
		ad->next_arq[data_dir] = as_find_next_arq(ad, arq);

	list_del_init(&arq->fifo);
	as_del_arq_rb(ad, arq);
}

/*
 * as_fifo_expired returns 0 if there are no expired reads on the fifo,
 * 1 otherwise. It is ratelimited so that we only perform the check once per
 * `fifo_expire' interval. Otherwise a large number of expired requests
 * would create a hopeless seekstorm.
 *
 * See as_antic_expired comment.
 */
static int as_fifo_expired(struct as_data *ad, int adir)
{
	struct as_rq *arq;
	long delta_jif;

	delta_jif = jiffies - ad->last_check_fifo[adir];
	if (unlikely(delta_jif < 0))
		delta_jif = -delta_jif;
	if (delta_jif < ad->fifo_expire[adir])
		return 0;

	ad->last_check_fifo[adir] = jiffies;

	if (list_empty(&ad->fifo_list[adir]))
		return 0;

	arq = list_entry_fifo(ad->fifo_list[adir].next);

	return time_after(jiffies, arq->expires);
}

/*
 * as_batch_expired returns true if the current batch has expired. A batch
 * is a set of reads or a set of writes.
 */
static inline int as_batch_expired(struct as_data *ad)
{
	if (ad->changed_batch || ad->new_batch)
		return 0;

	if (ad->batch_data_dir == REQ_SYNC)
		/* TODO! add a check so a complete fifo gets written? */
		return time_after(jiffies, ad->current_batch_expires);

	return time_after(jiffies, ad->current_batch_expires)
		|| ad->current_write_count == 0;
}

/*
 * move an entry to dispatch queue
 */
static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq)
{
	struct request *rq = arq->request;
	const int data_dir = arq->is_sync;

	BUG_ON(!RB_EMPTY_NODE(&arq->rb_node));

	as_antic_stop(ad);
	ad->antic_status = ANTIC_OFF;

	/*
	 * This has to be set in order to be correctly updated by
	 * as_find_next_arq
	 */
	ad->last_sector[data_dir] = rq->sector + rq->nr_sectors;

	if (data_dir == REQ_SYNC) {
		/* In case we have to anticipate after this */
		copy_io_context(&ad->io_context, &arq->io_context);
	} else {
		if (ad->io_context) {
			put_io_context(ad->io_context);
			ad->io_context = NULL;
		}

		if (ad->current_write_count != 0)
			ad->current_write_count--;
	}
	ad->ioc_finished = 0;

	ad->next_arq[data_dir] = as_find_next_arq(ad, arq);

	/*
	 * take it off the sort and fifo list, add to dispatch queue
	 */
	as_remove_queued_request(ad->q, rq);
	WARN_ON(arq->state != AS_RQ_QUEUED);

	elv_dispatch_sort(ad->q, rq);

	arq->state = AS_RQ_DISPATCHED;
	if (arq->io_context && arq->io_context->aic)
		atomic_inc(&arq->io_context->aic->nr_dispatched);
	ad->nr_dispatched++;
}

/*
 * as_dispatch_request selects the best request according to
 * read/write expire, batch expire, etc, and moves it to the dispatch
 * queue. Returns 1 if a request was found, 0 otherwise.
 */
static int as_dispatch_request(request_queue_t *q, int force)
{
	struct as_data *ad = q->elevator->elevator_data;
	struct as_rq *arq;
	const int reads = !list_empty(&ad->fifo_list[REQ_SYNC]);
	const int writes = !list_empty(&ad->fifo_list[REQ_ASYNC]);

	if (unlikely(force)) {
		/*
		 * Forced dispatch, accounting is useless. Reset
		 * accounting states and dump fifo_lists. Note that
		 * batch_data_dir is reset to REQ_SYNC to avoid
		 * screwing write batch accounting as write batch
		 * accounting occurs on W->R transition.
		 */
		int dispatched = 0;

		ad->batch_data_dir = REQ_SYNC;
		ad->changed_batch = 0;
		ad->new_batch = 0;

		while (ad->next_arq[REQ_SYNC]) {
			as_move_to_dispatch(ad, ad->next_arq[REQ_SYNC]);
			dispatched++;
		}
		ad->last_check_fifo[REQ_SYNC] = jiffies;

		while (ad->next_arq[REQ_ASYNC]) {
			as_move_to_dispatch(ad, ad->next_arq[REQ_ASYNC]);
			dispatched++;
		}
		ad->last_check_fifo[REQ_ASYNC] = jiffies;

		return dispatched;
	}

	/* Signal that the write batch was uncontended, so we can't time it */
	if (ad->batch_data_dir == REQ_ASYNC && !reads) {
		if (ad->current_write_count == 0 || !writes)
			ad->write_batch_idled = 1;
	}

	if (!(reads || writes)
		|| ad->antic_status == ANTIC_WAIT_REQ
		|| ad->antic_status == ANTIC_WAIT_NEXT
		|| ad->changed_batch)
		return 0;

	if (!(reads && writes && as_batch_expired(ad))) {
		/*
		 * batch is still running or no reads or no writes
		 */
		arq = ad->next_arq[ad->batch_data_dir];

		if (ad->batch_data_dir == REQ_SYNC && ad->antic_expire) {
			if (as_fifo_expired(ad, REQ_SYNC))
				goto fifo_expired;

			if (as_can_anticipate(ad, arq)) {
				as_antic_waitreq(ad);
				return 0;
			}
		}

		if (arq) {
			/* we have a "next request" */
			if (reads && !writes)
				ad->current_batch_expires =
					jiffies + ad->batch_expire[REQ_SYNC];
			goto dispatch_request;
		}
	}

	/*
	 * at this point we are not running a batch. select the appropriate
	 * data direction (read / write)
	 */

	if (reads) {
		BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[REQ_SYNC]));

		if (writes && ad->batch_data_dir == REQ_SYNC)
			/*
			 * Last batch was a read, switch to writes
			 */
			goto dispatch_writes;

		if (ad->batch_data_dir == REQ_ASYNC) {
			WARN_ON(ad->new_batch);
			ad->changed_batch = 1;
		}
		ad->batch_data_dir = REQ_SYNC;
		arq = list_entry_fifo(ad->fifo_list[ad->batch_data_dir].next);
		ad->last_check_fifo[ad->batch_data_dir] = jiffies;
		goto dispatch_request;
	}

	/*
	 * the last batch was a read
	 */

	if (writes) {
dispatch_writes:
		BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[REQ_ASYNC]));

		if (ad->batch_data_dir == REQ_SYNC) {
			ad->changed_batch = 1;

			/*
			 * new_batch might be 1 when the queue runs out of
			 * reads. A subsequent submission of a write might
			 * cause a change of batch before the read is finished.
			 */
			ad->new_batch = 0;
		}
		ad->batch_data_dir = REQ_ASYNC;
		ad->current_write_count = ad->write_batch_count;
		ad->write_batch_idled = 0;
		arq = ad->next_arq[ad->batch_data_dir];
		goto dispatch_request;
	}

	BUG();
	return 0;

dispatch_request:
	/*
	 * If a request has expired, service it.
	 */

	if (as_fifo_expired(ad, ad->batch_data_dir)) {
fifo_expired:
		arq = list_entry_fifo(ad->fifo_list[ad->batch_data_dir].next);
		BUG_ON(arq == NULL);
	}

	if (ad->changed_batch) {
		WARN_ON(ad->new_batch);

		if (ad->nr_dispatched)
			return 0;

		if (ad->batch_data_dir == REQ_ASYNC)
			ad->current_batch_expires = jiffies +
					ad->batch_expire[REQ_ASYNC];
		else
			ad->new_batch = 1;

		ad->changed_batch = 0;
	}

	/*
	 * arq is the selected appropriate request.
	 */
	as_move_to_dispatch(ad, arq);

	return 1;
}
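
/*
 * To summarise the ordering above: a forced dispatch drains both sort
 * lists completely; otherwise the current batch keeps running (with fifo
 * expiry checks and, for reads, possible anticipation) until it expires or
 * runs dry, at which point the scheduler switches direction. A direction
 * change is not finalised until everything already dispatched has
 * completed (nr_dispatched reaches zero).
 */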

/*
 * add arq to rbtree and fifo
 */
static void as_add_request(request_queue_t *q, struct request *rq)
{
	struct as_data *ad = q->elevator->elevator_data;
	struct as_rq *arq = RQ_DATA(rq);
	int data_dir;

	arq->state = AS_RQ_NEW;

	if (rq_data_dir(arq->request) == READ
			|| (arq->request->cmd_flags & REQ_RW_SYNC))
		arq->is_sync = 1;
	else
		arq->is_sync = 0;
	data_dir = arq->is_sync;

	arq->io_context = as_get_io_context();

	if (arq->io_context) {
		as_update_iohist(ad, arq->io_context->aic, arq->request);
		atomic_inc(&arq->io_context->aic->nr_queued);
	}

	as_add_arq_rb(ad, arq);

	/*
	 * set expire time (only used for reads) and add to fifo list
	 */
	arq->expires = jiffies + ad->fifo_expire[data_dir];
	list_add_tail(&arq->fifo, &ad->fifo_list[data_dir]);

	as_update_arq(ad, arq); /* keep state machine up to date */
	arq->state = AS_RQ_QUEUED;
}

static void as_activate_request(request_queue_t *q, struct request *rq)
{
	struct as_rq *arq = RQ_DATA(rq);

	WARN_ON(arq->state != AS_RQ_DISPATCHED);
	arq->state = AS_RQ_REMOVED;
	if (arq->io_context && arq->io_context->aic)
		atomic_dec(&arq->io_context->aic->nr_dispatched);
}

static void as_deactivate_request(request_queue_t *q, struct request *rq)
{
	struct as_rq *arq = RQ_DATA(rq);

	WARN_ON(arq->state != AS_RQ_REMOVED);
	arq->state = AS_RQ_DISPATCHED;
	if (arq->io_context && arq->io_context->aic)
		atomic_inc(&arq->io_context->aic->nr_dispatched);
}

/*
 * as_queue_empty tells us if there are requests left in the device. It may
 * not be the case that a driver can get the next request even if the queue
 * is not empty - it is used in the block layer to check for plugging and
 * merging opportunities
 */
static int as_queue_empty(request_queue_t *q)
{
	struct as_data *ad = q->elevator->elevator_data;

	return list_empty(&ad->fifo_list[REQ_ASYNC])
		&& list_empty(&ad->fifo_list[REQ_SYNC]);
}

static struct request *as_former_request(request_queue_t *q,
					struct request *rq)
{
	struct as_rq *arq = RQ_DATA(rq);
	struct rb_node *rbprev = rb_prev(&arq->rb_node);
	struct request *ret = NULL;

	if (rbprev)
		ret = rb_entry_arq(rbprev)->request;

	return ret;
}

static struct request *as_latter_request(request_queue_t *q,
					struct request *rq)
{
	struct as_rq *arq = RQ_DATA(rq);
	struct rb_node *rbnext = rb_next(&arq->rb_node);
	struct request *ret = NULL;

	if (rbnext)
		ret = rb_entry_arq(rbnext)->request;

	return ret;
}

static int
as_merge(request_queue_t *q, struct request **req, struct bio *bio)
{
	struct as_data *ad = q->elevator->elevator_data;
	sector_t rb_key = bio->bi_sector + bio_sectors(bio);
	struct request *__rq;

	/*
	 * check for front merge
	 */
	__rq = as_find_arq_rb(ad, rb_key, bio_data_dir(bio));
	if (__rq && elv_rq_merge_ok(__rq, bio)) {
		*req = __rq;
		return ELEVATOR_FRONT_MERGE;
	}

	return ELEVATOR_NO_MERGE;
}

static void as_merged_request(request_queue_t *q, struct request *req)
{
	struct as_data *ad = q->elevator->elevator_data;
	struct as_rq *arq = RQ_DATA(req);

	/*
	 * if the merge was a front merge, we need to reposition request
	 */
	if (rq_rb_key(req) != arq->rb_key) {
		as_del_arq_rb(ad, arq);
		as_add_arq_rb(ad, arq);
		/*
		 * Note! At this stage of this and the next function, our next
		 * request may not be optimal - eg the request may have "grown"
		 * behind the disk head. We currently don't bother adjusting.
		 */
	}
}

static void as_merged_requests(request_queue_t *q, struct request *req,
				struct request *next)
{
	struct as_data *ad = q->elevator->elevator_data;
	struct as_rq *arq = RQ_DATA(req);
	struct as_rq *anext = RQ_DATA(next);

	BUG_ON(!arq);
	BUG_ON(!anext);

	if (rq_rb_key(req) != arq->rb_key) {
		as_del_arq_rb(ad, arq);
		as_add_arq_rb(ad, arq);
	}

	/*
	 * if anext expires before arq, assign its expire time to arq
	 * and move into anext position (anext will be deleted) in fifo
	 */
	if (!list_empty(&arq->fifo) && !list_empty(&anext->fifo)) {
		if (time_before(anext->expires, arq->expires)) {
			list_move(&arq->fifo, &anext->fifo);
			arq->expires = anext->expires;
			/*
			 * Don't copy here but swap, because when anext is
			 * removed below, it must contain the unused context
			 */
			swap_io_context(&arq->io_context, &anext->io_context);
		}
	}

	/*
	 * kill knowledge of next, this one is a goner
	 */
	as_remove_queued_request(q, next);
	as_put_io_context(anext);

	anext->state = AS_RQ_MERGED;
}

/*
 * This is executed in a "deferred" process context, by kblockd. It calls the
 * driver's request_fn so the driver can submit that request.
 *
 * IMPORTANT! This guy will reenter the elevator, so set up all queue global
 * state before calling, and don't rely on any state over calls.
 *
 * FIXME! dispatch queue is not a queue at all!
 */
static void as_work_handler(void *data)
{
	struct request_queue *q = data;
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	if (!as_queue_empty(q))
		q->request_fn(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}

static void as_put_request(request_queue_t *q, struct request *rq)
{
	struct as_data *ad = q->elevator->elevator_data;
	struct as_rq *arq = RQ_DATA(rq);

	if (!arq) {
		WARN_ON(1);
		return;
	}

	if (unlikely(arq->state != AS_RQ_POSTSCHED &&
		     arq->state != AS_RQ_PRESCHED &&
		     arq->state != AS_RQ_MERGED)) {
		printk("arq->state %d\n", arq->state);
		WARN_ON(1);
	}

	mempool_free(arq, ad->arq_pool);
	rq->elevator_private = NULL;
}

static int as_set_request(request_queue_t *q, struct request *rq,
			  struct bio *bio, gfp_t gfp_mask)
{
	struct as_data *ad = q->elevator->elevator_data;
	struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask);

	if (arq) {
		memset(arq, 0, sizeof(*arq));
		RB_CLEAR_NODE(&arq->rb_node);
		arq->request = rq;
		arq->state = AS_RQ_PRESCHED;
		arq->io_context = NULL;
		INIT_LIST_HEAD(&arq->fifo);
		rq->elevator_private = arq;
		return 0;
	}

	return 1;
}

static int as_may_queue(request_queue_t *q, int rw, struct bio *bio)
{
	int ret = ELV_MQUEUE_MAY;
	struct as_data *ad = q->elevator->elevator_data;
	struct io_context *ioc;

	if (ad->antic_status == ANTIC_WAIT_REQ ||
	    ad->antic_status == ANTIC_WAIT_NEXT) {
		ioc = as_get_io_context();
		if (ad->io_context == ioc)
			ret = ELV_MQUEUE_MUST;
		put_io_context(ioc);
	}

	return ret;
}

static void as_exit_queue(elevator_t *e)
{
	struct as_data *ad = e->elevator_data;

	del_timer_sync(&ad->antic_timer);
	kblockd_flush();

	BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC]));
	BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC]));

	mempool_destroy(ad->arq_pool);
	put_io_context(ad->io_context);
	kfree(ad);
}

/*
 * initialize elevator private data (as_data), and alloc a arq for
 * each request on the free lists
 */
static void *as_init_queue(request_queue_t *q, elevator_t *e)
{
	struct as_data *ad;

	if (!arq_pool)
		return NULL;

	ad = kmalloc_node(sizeof(*ad), GFP_KERNEL, q->node);
	if (!ad)
		return NULL;
	memset(ad, 0, sizeof(*ad));

	ad->q = q; /* Identify what queue the data belongs to */

	ad->arq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
					   mempool_free_slab, arq_pool, q->node);
	if (!ad->arq_pool) {
		kfree(ad);
		return NULL;
	}

	/* anticipatory scheduling helpers */
	ad->antic_timer.function = as_antic_timeout;
	ad->antic_timer.data = (unsigned long)q;
	init_timer(&ad->antic_timer);
	INIT_WORK(&ad->antic_work, as_work_handler, q);

	INIT_LIST_HEAD(&ad->fifo_list[REQ_SYNC]);
	INIT_LIST_HEAD(&ad->fifo_list[REQ_ASYNC]);
	ad->sort_list[REQ_SYNC] = RB_ROOT;
	ad->sort_list[REQ_ASYNC] = RB_ROOT;
	ad->fifo_expire[REQ_SYNC] = default_read_expire;
	ad->fifo_expire[REQ_ASYNC] = default_write_expire;
	ad->antic_expire = default_antic_expire;
	ad->batch_expire[REQ_SYNC] = default_read_batch_expire;
	ad->batch_expire[REQ_ASYNC] = default_write_batch_expire;

	ad->current_batch_expires = jiffies + ad->batch_expire[REQ_SYNC];
	ad->write_batch_count = ad->batch_expire[REQ_ASYNC] / 10;
	if (ad->write_batch_count < 2)
		ad->write_batch_count = 2;

	return ad;
}
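
/*
 * The initial write_batch_count above is only a seed (one tenth of the
 * write batch budget, in jiffies, but at least 2 requests);
 * update_write_batch() retunes it from then on, at each write-to-read
 * batch transition.
 */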

/*
 * sysfs parts below
 */

static ssize_t
as_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%d\n", var);
}

static ssize_t
as_var_store(unsigned long *var, const char *page, size_t count)
{
	char *p = (char *) page;

	*var = simple_strtoul(p, &p, 10);
	return count;
}

static ssize_t est_time_show(elevator_t *e, char *page)
{
	struct as_data *ad = e->elevator_data;
	int pos = 0;

	pos += sprintf(page+pos, "%lu %% exit probability\n",
				100*ad->exit_prob/256);
	pos += sprintf(page+pos, "%lu %% probability of exiting without a "
				"cooperating process submitting IO\n",
				100*ad->exit_no_coop/256);
	pos += sprintf(page+pos, "%lu ms new thinktime\n", ad->new_ttime_mean);
	pos += sprintf(page+pos, "%llu sectors new seek distance\n",
				(unsigned long long)ad->new_seek_mean);

	return pos;
}

#define SHOW_FUNCTION(__FUNC, __VAR)				\
static ssize_t __FUNC(elevator_t *e, char *page)		\
{								\
	struct as_data *ad = e->elevator_data;			\
	return as_var_show(jiffies_to_msecs((__VAR)), (page));	\
}
SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[REQ_SYNC]);
SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[REQ_ASYNC]);
SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire);
SHOW_FUNCTION(as_read_batch_expire_show, ad->batch_expire[REQ_SYNC]);
SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[REQ_ASYNC]);
#undef SHOW_FUNCTION

#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)				\
static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
{									\
	struct as_data *ad = e->elevator_data;				\
	int ret = as_var_store(__PTR, (page), count);			\
	if (*(__PTR) < (MIN))						\
		*(__PTR) = (MIN);					\
	else if (*(__PTR) > (MAX))					\
		*(__PTR) = (MAX);					\
	*(__PTR) = msecs_to_jiffies(*(__PTR));				\
	return ret;							\
}
STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX);
STORE_FUNCTION(as_write_expire_store, &ad->fifo_expire[REQ_ASYNC], 0, INT_MAX);
STORE_FUNCTION(as_antic_expire_store, &ad->antic_expire, 0, INT_MAX);
STORE_FUNCTION(as_read_batch_expire_store,
			&ad->batch_expire[REQ_SYNC], 0, INT_MAX);
STORE_FUNCTION(as_write_batch_expire_store,
			&ad->batch_expire[REQ_ASYNC], 0, INT_MAX);
#undef STORE_FUNCTION
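
/*
 * All of the tunables above are exposed through sysfs in milliseconds but
 * stored internally in jiffies: the show helpers convert with
 * jiffies_to_msecs(), and the store helpers clamp the raw millisecond
 * value to [MIN, MAX] before converting it with msecs_to_jiffies().
 */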

#define AS_ATTR(name) \
	__ATTR(name, S_IRUGO|S_IWUSR, as_##name##_show, as_##name##_store)

static struct elv_fs_entry as_attrs[] = {
	__ATTR_RO(est_time),
	AS_ATTR(read_expire),
	AS_ATTR(write_expire),
	AS_ATTR(antic_expire),
	AS_ATTR(read_batch_expire),
	AS_ATTR(write_batch_expire),
	__ATTR_NULL
};

static struct elevator_type iosched_as = {
	.ops = {
		.elevator_merge_fn = as_merge,
		.elevator_merged_fn = as_merged_request,
		.elevator_merge_req_fn = as_merged_requests,
		.elevator_dispatch_fn = as_dispatch_request,
		.elevator_add_req_fn = as_add_request,
		.elevator_activate_req_fn = as_activate_request,
		.elevator_deactivate_req_fn = as_deactivate_request,
		.elevator_queue_empty_fn = as_queue_empty,
		.elevator_completed_req_fn = as_completed_request,
		.elevator_former_req_fn = as_former_request,
		.elevator_latter_req_fn = as_latter_request,
		.elevator_set_req_fn = as_set_request,
		.elevator_put_req_fn = as_put_request,
		.elevator_may_queue_fn = as_may_queue,
		.elevator_init_fn = as_init_queue,
		.elevator_exit_fn = as_exit_queue,
		.trim = as_trim,
	},

	.elevator_attrs = as_attrs,
	.elevator_name = "anticipatory",
	.elevator_owner = THIS_MODULE,
};

static int __init as_init(void)
{
	int ret;

	arq_pool = kmem_cache_create("as_arq", sizeof(struct as_rq),
				     0, 0, NULL, NULL);
	if (!arq_pool)
		return -ENOMEM;

	ret = elv_register(&iosched_as);
	if (!ret) {
		/*
		 * don't allow AS to get unregistered, since we would have
		 * to browse all tasks in the system and release their
		 * as_io_context first
		 */
		__module_get(THIS_MODULE);
		return 0;
	}

	kmem_cache_destroy(arq_pool);
	return ret;
}

static void __exit as_exit(void)
{
	DECLARE_COMPLETION(all_gone);

	elv_unregister(&iosched_as);
	ioc_gone = &all_gone;
	/* ioc_gone's update must be visible before reading ioc_count */
	smp_wmb();
	if (atomic_read(&ioc_count))
		wait_for_completion(ioc_gone);
	synchronize_rcu();
	kmem_cache_destroy(arq_pool);
}
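
/*
 * On module unload, every as_io_context still attached to a task holds a
 * reference counted in ioc_count, so as_exit() publishes the all_gone
 * completion through ioc_gone and waits until free_as_io_context() has
 * dropped the last reference before tearing down the slab cache.
 */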

module_init(as_init);
module_exit(as_exit);

MODULE_AUTHOR("Nick Piggin");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("anticipatory IO scheduler");