fs-writeback.c

/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes. ie: data writeback. Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002	Andrew Morton
 *		Split out of fs/inode.c
 *		Additions for address_space-based writeback
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include "internal.h"

#define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)

/*
 * We don't actually have pdflush, but this one is exported through /proc...
 */
int nr_pdflush_threads;

/*
 * Work items for the bdi_writeback threads
 */
struct bdi_work {
	struct list_head list;
	struct list_head wait_list;
	struct rcu_head rcu_head;

	unsigned long seen;
	atomic_t pending;

	struct super_block *sb;
	unsigned long nr_pages;
	enum writeback_sync_modes sync_mode;

	unsigned long state;
};

enum {
	WS_USED_B = 0,
	WS_ONSTACK_B,
};

#define WS_USED (1 << WS_USED_B)
#define WS_ONSTACK (1 << WS_ONSTACK_B)

static inline bool bdi_work_on_stack(struct bdi_work *work)
{
	return test_bit(WS_ONSTACK_B, &work->state);
}

static inline void bdi_work_init(struct bdi_work *work,
				 struct writeback_control *wbc)
{
	INIT_RCU_HEAD(&work->rcu_head);
	work->sb = wbc->sb;
	work->nr_pages = wbc->nr_to_write;
	work->sync_mode = wbc->sync_mode;
	work->state = WS_USED;
}

static inline void bdi_work_init_on_stack(struct bdi_work *work,
					  struct writeback_control *wbc)
{
	bdi_work_init(work, wbc);
	work->state |= WS_ONSTACK;
}

/**
 * writeback_in_progress - determine whether there is writeback in progress
 * @bdi: the device's backing_dev_info structure.
 *
 * Determine whether there is writeback waiting to be handled against a
 * backing device.
 */
int writeback_in_progress(struct backing_dev_info *bdi)
{
	return !list_empty(&bdi->work_list);
}

static void bdi_work_clear(struct bdi_work *work)
{
	clear_bit(WS_USED_B, &work->state);
	smp_mb__after_clear_bit();
	wake_up_bit(&work->state, WS_USED_B);
}

static void bdi_work_free(struct rcu_head *head)
{
	struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);

	if (!bdi_work_on_stack(work))
		kfree(work);
	else
		bdi_work_clear(work);
}

static void wb_work_complete(struct bdi_work *work)
{
	const enum writeback_sync_modes sync_mode = work->sync_mode;

	/*
	 * For allocated work, we can clear the done/seen bit right here.
	 * For on-stack work, we need to postpone both the clear and free
	 * to after the RCU grace period, since the stack could be invalidated
	 * as soon as bdi_work_clear() has done the wakeup.
	 */
	if (!bdi_work_on_stack(work))
		bdi_work_clear(work);
	if (sync_mode == WB_SYNC_NONE || bdi_work_on_stack(work))
		call_rcu(&work->rcu_head, bdi_work_free);
}

static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
{
	/*
	 * The caller has retrieved the work arguments from this work,
	 * drop our reference. If this is the last ref, delete and free it
	 */
	if (atomic_dec_and_test(&work->pending)) {
		struct backing_dev_info *bdi = wb->bdi;

		spin_lock(&bdi->wb_lock);
		list_del_rcu(&work->list);
		spin_unlock(&bdi->wb_lock);

		wb_work_complete(work);
	}
}

static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
{
	if (work) {
		work->seen = bdi->wb_mask;
		BUG_ON(!work->seen);
		atomic_set(&work->pending, bdi->wb_cnt);
		BUG_ON(!bdi->wb_cnt);

		/*
		 * Make sure stores are seen before it appears on the list
		 */
		smp_mb();

		spin_lock(&bdi->wb_lock);
		list_add_tail_rcu(&work->list, &bdi->work_list);
		spin_unlock(&bdi->wb_lock);
	}

	/*
	 * If the default thread isn't there, make sure we add it. When
	 * it gets created and wakes up, we'll run this work.
	 */
	if (unlikely(list_empty_careful(&bdi->wb_list)))
		wake_up_process(default_backing_dev_info.wb.task);
	else {
		struct bdi_writeback *wb = &bdi->wb;

		/*
		 * If we failed allocating the bdi work item, wake up the wb
		 * thread always. As a safety precaution, it'll flush out
		 * everything
		 */
		if (!wb_has_dirty_io(wb)) {
			if (work)
				wb_clear_pending(wb, work);
		} else if (wb->task)
			wake_up_process(wb->task);
	}
}

/*
 * Used for on-stack allocated work items. The caller needs to wait until
 * the wb threads have acked the work before it's safe to continue.
 */
static void bdi_wait_on_work_clear(struct bdi_work *work)
{
	wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
		    TASK_UNINTERRUPTIBLE);
}

static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
{
	struct bdi_work *work;

	work = kmalloc(sizeof(*work), GFP_ATOMIC);
	if (work)
		bdi_work_init(work, wbc);

	return work;
}

void bdi_start_writeback(struct writeback_control *wbc)
{
	const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
	struct bdi_work work_stack, *work = NULL;

	if (!must_wait)
		work = bdi_alloc_work(wbc);

	if (!work) {
		work = &work_stack;
		bdi_work_init_on_stack(work, wbc);
	}

	bdi_queue_work(wbc->bdi, work);

	/*
	 * If the sync mode is WB_SYNC_ALL, block waiting for the work to
	 * complete. If not, we only need to wait for the work to be started,
	 * if we allocated it on-stack. We use the same mechanism, if the
	 * wait bit is set in the bdi_work struct, then threads will not
	 * clear pending until after they are done.
	 *
	 * Note that work == &work_stack if must_wait is true, so we don't
	 * need to do call_rcu() here ever, since the completion path will
	 * have done that for us.
	 */
	if (must_wait || work == &work_stack) {
		bdi_wait_on_work_clear(work);
		if (work != &work_stack)
			call_rcu(&work->rcu_head, bdi_work_free);
	}
}
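
/*
 * Illustrative sketch (not part of the original file): a hypothetical caller
 * kicking off non-blocking writeback of up to nr_pages against a bdi. With
 * WB_SYNC_NONE the work item is normally kmalloc'ed and bdi_start_writeback()
 * returns as soon as it is queued; with WB_SYNC_ALL the on-stack path above
 * would block until the flusher thread has completed the work.
 */
static inline void example_start_async_writeback(struct backing_dev_info *bdi,
						 long nr_pages)
{
	struct writeback_control wbc = {
		.bdi		= bdi,
		.sync_mode	= WB_SYNC_NONE,
		.nr_to_write	= nr_pages,
		.range_cyclic	= 1,
	};

	bdi_start_writeback(&wbc);
}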

/*
 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the b_dirty list. If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */
static void redirty_tail(struct inode *inode)
{
	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

	if (!list_empty(&wb->b_dirty)) {
		struct inode *tail;

		tail = list_entry(wb->b_dirty.next, struct inode, i_list);
		if (time_before(inode->dirtied_when, tail->dirtied_when))
			inode->dirtied_when = jiffies;
	}
	list_move(&inode->i_list, &wb->b_dirty);
}

/*
 * requeue inode for re-scanning after bdi->b_io list is exhausted.
 */
static void requeue_io(struct inode *inode)
{
	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

	list_move(&inode->i_list, &wb->b_more_io);
}

static void inode_sync_complete(struct inode *inode)
{
	/*
	 * Prevent speculative execution through spin_unlock(&inode_lock);
	 */
	smp_mb();
	wake_up_bit(&inode->i_state, __I_SYNC);
}

static bool inode_dirtied_after(struct inode *inode, unsigned long t)
{
	bool ret = time_after(inode->dirtied_when, t);
#ifndef CONFIG_64BIT
	/*
	 * For inodes being constantly redirtied, dirtied_when can get stuck.
	 * It _appears_ to be in the future, but is actually in distant past.
	 * This test is necessary to prevent such wrapped-around relative times
	 * from permanently stopping the whole pdflush writeback.
	 */
	ret = ret && time_before_eq(inode->dirtied_when, jiffies);
#endif
	return ret;
}
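
/*
 * Illustrative sketch (not part of the original file): how a cutoff for
 * "expired" inodes can be computed and checked with inode_dirtied_after().
 * dirty_expire_interval is kept in centiseconds, hence the * 10 to get
 * milliseconds (the same conversion wb_writeback() below uses).
 */
static inline bool example_inode_is_expired(struct inode *inode)
{
	unsigned long cutoff = jiffies -
			msecs_to_jiffies(dirty_expire_interval * 10);

	/* not dirtied after the cutoff == old enough to be written out */
	return !inode_dirtied_after(inode, cutoff);
}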

/*
 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
 */
static void move_expired_inodes(struct list_head *delaying_queue,
				struct list_head *dispatch_queue,
				unsigned long *older_than_this)
{
	while (!list_empty(delaying_queue)) {
		struct inode *inode = list_entry(delaying_queue->prev,
						 struct inode, i_list);
		if (older_than_this &&
		    inode_dirtied_after(inode, *older_than_this))
			break;
		list_move(&inode->i_list, dispatch_queue);
	}
}

/*
 * Queue all expired dirty inodes for io, eldest first.
 */
static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
{
	list_splice_init(&wb->b_more_io, wb->b_io.prev);
	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
}

static int write_inode(struct inode *inode, int sync)
{
	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
		return inode->i_sb->s_op->write_inode(inode, sync);
	return 0;
}

/*
 * Wait for writeback on an inode to complete.
 */
static void inode_wait_for_writeback(struct inode *inode)
{
	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
	wait_queue_head_t *wqh;

	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
	do {
		spin_unlock(&inode_lock);
		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
		spin_lock(&inode_lock);
	} while (inode->i_state & I_SYNC);
}

/*
 * Write out an inode's dirty pages. Called under inode_lock. Either the
 * caller has ref on the inode (either via __iget or via syscall against an fd)
 * or the inode has I_WILL_FREE set (via generic_forget_inode)
 *
 * If `wait' is set, wait on the writeout.
 *
 * The whole writeout design is quite complex and fragile. We want to avoid
 * starvation of particular inodes when others are being redirtied, prevent
 * livelocks, etc.
 *
 * Called under inode_lock.
 */
static int
writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct address_space *mapping = inode->i_mapping;
	int wait = wbc->sync_mode == WB_SYNC_ALL;
	unsigned dirty;
	int ret;

	if (!atomic_read(&inode->i_count))
		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
	else
		WARN_ON(inode->i_state & I_WILL_FREE);

	if (inode->i_state & I_SYNC) {
		/*
		 * If this inode is locked for writeback and we are not doing
		 * writeback-for-data-integrity, move it to b_more_io so that
		 * writeback can proceed with the other inodes on s_io.
		 *
		 * We'll have another go at writing back this inode when we
		 * completed a full scan of b_io.
		 */
		if (!wait) {
			requeue_io(inode);
			return 0;
		}

		/*
		 * It's a data-integrity sync. We must wait.
		 */
		inode_wait_for_writeback(inode);
	}

	BUG_ON(inode->i_state & I_SYNC);

	/* Set I_SYNC, reset I_DIRTY */
	dirty = inode->i_state & I_DIRTY;
	inode->i_state |= I_SYNC;
	inode->i_state &= ~I_DIRTY;

	spin_unlock(&inode_lock);

	ret = do_writepages(mapping, wbc);

	/* Don't write the inode if only I_DIRTY_PAGES was set */
	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
		int err = write_inode(inode, wait);
		if (ret == 0)
			ret = err;
	}

	if (wait) {
		int err = filemap_fdatawait(mapping);
		if (ret == 0)
			ret = err;
	}

	spin_lock(&inode_lock);
	inode->i_state &= ~I_SYNC;
	if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
		if (!(inode->i_state & I_DIRTY) &&
		    mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
			/*
			 * We didn't write back all the pages. nfs_writepages()
			 * sometimes bails out without doing anything. Redirty
			 * the inode; Move it from b_io onto b_more_io/b_dirty.
			 */
			/*
			 * akpm: if the caller was the kupdate function we put
			 * this inode at the head of b_dirty so it gets first
			 * consideration. Otherwise, move it to the tail, for
			 * the reasons described there. I'm not really sure
			 * how much sense this makes. Presumably I had a good
			 * reason for doing it this way, and I'd rather not
			 * muck with it at present.
			 */
			if (wbc->for_kupdate) {
				/*
				 * For the kupdate function we move the inode
				 * to b_more_io so it will get more writeout as
				 * soon as the queue becomes uncongested.
				 */
				inode->i_state |= I_DIRTY_PAGES;
				if (wbc->nr_to_write <= 0) {
					/*
					 * slice used up: queue for next turn
					 */
					requeue_io(inode);
				} else {
					/*
					 * somehow blocked: retry later
					 */
					redirty_tail(inode);
				}
			} else {
				/*
				 * Otherwise fully redirty the inode so that
				 * other inodes on this superblock will get some
				 * writeout. Otherwise heavy writing to one
				 * file would indefinitely suspend writeout of
				 * all the other files.
				 */
				inode->i_state |= I_DIRTY_PAGES;
				redirty_tail(inode);
			}
		} else if (inode->i_state & I_DIRTY) {
			/*
			 * Someone redirtied the inode while we were writing
			 * back the pages.
			 */
			redirty_tail(inode);
		} else if (atomic_read(&inode->i_count)) {
			/*
			 * The inode is clean, inuse
			 */
			list_move(&inode->i_list, &inode_in_use);
		} else {
			/*
			 * The inode is clean, unused
			 */
			list_move(&inode->i_list, &inode_unused);
		}
	}
	inode_sync_complete(inode);
	return ret;
}
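
/*
 * Illustrative sketch (not part of the original file): the locking contract
 * described above made concrete. A caller of writeback_single_inode() holds
 * inode_lock plus either a reference on the inode or I_WILL_FREE; this is
 * essentially what sync_inode() at the bottom of this file does.
 */
static inline int example_write_one_inode(struct inode *inode,
					  struct writeback_control *wbc)
{
	int ret;

	spin_lock(&inode_lock);
	ret = writeback_single_inode(inode, wbc);
	spin_unlock(&inode_lock);

	return ret;
}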

/*
 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
 * before calling writeback. So make sure that we do pin it, so it doesn't
 * go away while we are writing inodes from it.
 *
 * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
 * 1 if we failed.
 */
static int pin_sb_for_writeback(struct writeback_control *wbc,
				struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	/*
	 * Caller must already hold the ref for this
	 */
	if (wbc->sync_mode == WB_SYNC_ALL) {
		WARN_ON(!rwsem_is_locked(&sb->s_umount));
		return 0;
	}

	spin_lock(&sb_lock);
	sb->s_count++;
	if (down_read_trylock(&sb->s_umount)) {
		if (sb->s_root) {
			spin_unlock(&sb_lock);
			return 0;
		}
		/*
		 * umounted, drop rwsem again and fall through to failure
		 */
		up_read(&sb->s_umount);
	}

	sb->s_count--;
	spin_unlock(&sb_lock);
	return 1;
}

static void unpin_sb_for_writeback(struct writeback_control *wbc,
				   struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	if (wbc->sync_mode == WB_SYNC_ALL)
		return;

	up_read(&sb->s_umount);
	put_super(sb);
}

static void writeback_inodes_wb(struct bdi_writeback *wb,
				struct writeback_control *wbc)
{
	struct super_block *sb = wbc->sb;
	const int is_blkdev_sb = sb_is_blkdev_sb(sb);
	const unsigned long start = jiffies;	/* livelock avoidance */

	spin_lock(&inode_lock);

	if (!wbc->for_kupdate || list_empty(&wb->b_io))
		queue_io(wb, wbc->older_than_this);

	while (!list_empty(&wb->b_io)) {
		struct inode *inode = list_entry(wb->b_io.prev,
						 struct inode, i_list);
		long pages_skipped;

		/*
		 * super block given and doesn't match, skip this inode
		 */
		if (sb && sb != inode->i_sb) {
			redirty_tail(inode);
			continue;
		}

		if (!bdi_cap_writeback_dirty(wb->bdi)) {
			redirty_tail(inode);
			if (is_blkdev_sb) {
				/*
				 * Dirty memory-backed blockdev: the ramdisk
				 * driver does this. Skip just this inode
				 */
				continue;
			}
			/*
			 * Dirty memory-backed inode against a filesystem other
			 * than the kernel-internal bdev filesystem. Skip the
			 * entire superblock.
			 */
			break;
		}

		if (inode->i_state & (I_NEW | I_WILL_FREE)) {
			requeue_io(inode);
			continue;
		}

		if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
			wbc->encountered_congestion = 1;
			if (!is_blkdev_sb)
				break;		/* Skip a congested fs */
			requeue_io(inode);
			continue;		/* Skip a congested blockdev */
		}

		/*
		 * Was this inode dirtied after sync_sb_inodes was called?
		 * This keeps sync from extra jobs and livelock.
		 */
		if (inode_dirtied_after(inode, start))
			break;

		if (pin_sb_for_writeback(wbc, inode)) {
			requeue_io(inode);
			continue;
		}

		BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
		__iget(inode);
		pages_skipped = wbc->pages_skipped;
		writeback_single_inode(inode, wbc);
		unpin_sb_for_writeback(wbc, inode);
		if (wbc->pages_skipped != pages_skipped) {
			/*
			 * writeback is not making progress due to locked
			 * buffers. Skip this inode for now.
			 */
			redirty_tail(inode);
		}
		spin_unlock(&inode_lock);
		iput(inode);
		cond_resched();
		spin_lock(&inode_lock);
		if (wbc->nr_to_write <= 0) {
			wbc->more_io = 1;
			break;
		}
		if (!list_empty(&wb->b_more_io))
			wbc->more_io = 1;
	}

	spin_unlock(&inode_lock);
	/* Leave any unwritten inodes on b_io */
}

void writeback_inodes_wbc(struct writeback_control *wbc)
{
	struct backing_dev_info *bdi = wbc->bdi;

	writeback_inodes_wb(&bdi->wb, wbc);
}

/*
 * The maximum number of pages to writeout in a single bdi flush/kupdate
 * operation. We do this so we don't hold I_SYNC against an inode for
 * enormous amounts of time, which would block a userspace task which has
 * been forced to throttle against that inode. Also, the code reevaluates
 * the dirty each time it has written this many pages.
 */
#define MAX_WRITEBACK_PAGES	1024

static inline bool over_bground_thresh(void)
{
	unsigned long background_thresh, dirty_thresh;

	get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);

	return (global_page_state(NR_FILE_DIRTY) +
		global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
}

/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space. So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval. But if a writeback event
 * takes longer than a dirty_writeback_interval interval, then leave a
 * one-second gap.
 *
 * older_than_this takes precedence over nr_to_write. So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 */
static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
			 struct super_block *sb,
			 enum writeback_sync_modes sync_mode, int for_kupdate)
{
	struct writeback_control wbc = {
		.bdi			= wb->bdi,
		.sb			= sb,
		.sync_mode		= sync_mode,
		.older_than_this	= NULL,
		.for_kupdate		= for_kupdate,
		.range_cyclic		= 1,
	};
	unsigned long oldest_jif;
	long wrote = 0;

	if (wbc.for_kupdate) {
		wbc.older_than_this = &oldest_jif;
		oldest_jif = jiffies -
				msecs_to_jiffies(dirty_expire_interval * 10);
	}

	for (;;) {
		/*
		 * Don't flush anything for non-integrity writeback where
		 * no nr_pages was given
		 */
		if (!for_kupdate && nr_pages <= 0 && sync_mode == WB_SYNC_NONE)
			break;

		/*
		 * If no specific pages were given and this is just a
		 * periodic background writeout and we are below the
		 * background dirty threshold, don't do anything
		 */
		if (for_kupdate && nr_pages <= 0 && !over_bground_thresh())
			break;

		wbc.more_io = 0;
		wbc.encountered_congestion = 0;
		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
		wbc.pages_skipped = 0;
		writeback_inodes_wb(wb, &wbc);
		nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;

		/*
		 * If we ran out of stuff to write, bail unless more_io got set
		 */
		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
			if (wbc.more_io && !wbc.for_kupdate)
				continue;
			break;
		}
	}

	return wrote;
}
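
/*
 * Illustrative sketch (not part of the original file): an explicit,
 * non-integrity flush of up to nr_pages from one bdi_writeback, which is
 * roughly what wb_do_writeback() below does for each queued work item.
 */
static inline long example_flush_some_pages(struct bdi_writeback *wb,
					    long nr_pages)
{
	/* sb == NULL: don't restrict writeback to a particular superblock */
	return wb_writeback(wb, nr_pages, NULL, WB_SYNC_NONE, 0);
}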

/*
 * Return the next bdi_work struct that hasn't been processed by this
 * wb thread yet
 */
static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
					   struct bdi_writeback *wb)
{
	struct bdi_work *work, *ret = NULL;

	rcu_read_lock();

	list_for_each_entry_rcu(work, &bdi->work_list, list) {
		if (!test_and_clear_bit(wb->nr, &work->seen))
			continue;

		ret = work;
		break;
	}

	rcu_read_unlock();
	return ret;
}

static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
	unsigned long expired;
	long nr_pages;

	expired = wb->last_old_flush +
			msecs_to_jiffies(dirty_writeback_interval * 10);
	if (time_before(jiffies, expired))
		return 0;

	wb->last_old_flush = jiffies;
	nr_pages = global_page_state(NR_FILE_DIRTY) +
			global_page_state(NR_UNSTABLE_NFS) +
			(inodes_stat.nr_inodes - inodes_stat.nr_unused);

	if (nr_pages)
		return wb_writeback(wb, nr_pages, NULL, WB_SYNC_NONE, 1);

	return 0;
}

/*
 * Retrieve work items and do the writeback they describe
 */
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
	struct backing_dev_info *bdi = wb->bdi;
	struct bdi_work *work;
	long nr_pages, wrote = 0;

	while ((work = get_next_work_item(bdi, wb)) != NULL) {
		enum writeback_sync_modes sync_mode;

		nr_pages = work->nr_pages;

		/*
		 * Override sync mode, in case we must wait for completion
		 */
		if (force_wait)
			work->sync_mode = sync_mode = WB_SYNC_ALL;
		else
			sync_mode = work->sync_mode;

		/*
		 * If this isn't a data integrity operation, just notify
		 * that we have seen this work and we are now starting it.
		 */
		if (sync_mode == WB_SYNC_NONE)
			wb_clear_pending(wb, work);

		wrote += wb_writeback(wb, nr_pages, work->sb, sync_mode, 0);

		/*
		 * This is a data integrity writeback, so only do the
		 * notification when we have completed the work.
		 */
		if (sync_mode == WB_SYNC_ALL)
			wb_clear_pending(wb, work);
	}

	/*
	 * Check for periodic writeback, kupdated() style
	 */
	wrote += wb_check_old_data_flush(wb);

	return wrote;
}

/*
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * wakes up periodically and does kupdated style flushing.
 */
int bdi_writeback_task(struct bdi_writeback *wb)
{
	unsigned long last_active = jiffies;
	unsigned long wait_jiffies = -1UL;
	long pages_written;

	while (!kthread_should_stop()) {
		pages_written = wb_do_writeback(wb, 0);

		if (pages_written)
			last_active = jiffies;
		else if (wait_jiffies != -1UL) {
			unsigned long max_idle;

			/*
			 * Longest period of inactivity that we tolerate. If we
			 * see dirty data again later, the task will get
			 * recreated automatically.
			 */
			max_idle = max(5UL * 60 * HZ, wait_jiffies);
			if (time_after(jiffies, max_idle + last_active))
				break;
		}

		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
		set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(wait_jiffies);
		try_to_freeze();
	}

	return 0;
}
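
/*
 * Illustrative sketch (not part of the original file): one way a flusher
 * thread could be spawned to run bdi_writeback_task(). The real thread
 * creation lives in mm/backing-dev.c; the names below are hypothetical.
 */
static int example_flush_thread_fn(void *data)
{
	struct bdi_writeback *wb = data;

	return bdi_writeback_task(wb);
}

static inline struct task_struct *example_spawn_flusher(struct bdi_writeback *wb)
{
	return kthread_run(example_flush_thread_fn, wb, "flush-example");
}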

/*
 * Schedule writeback for all backing devices. Expensive! If this is a data
 * integrity operation, writeback will be complete when this returns. If
 * we are simply called for WB_SYNC_NONE, then writeback will merely be
 * scheduled to run.
 */
static void bdi_writeback_all(struct writeback_control *wbc)
{
	const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
	struct backing_dev_info *bdi;
	struct bdi_work *work;
	LIST_HEAD(list);

restart:
	spin_lock(&bdi_lock);

	list_for_each_entry(bdi, &bdi_list, bdi_list) {
		struct bdi_work *work;

		if (!bdi_has_dirty_io(bdi))
			continue;

		/*
		 * If work allocation fails, do the writes inline. We drop
		 * the lock and restart the list writeout. This should be OK,
		 * since this happens rarely and because the writeout should
		 * eventually make more free memory available.
		 */
		work = bdi_alloc_work(wbc);
		if (!work) {
			struct writeback_control __wbc;

			/*
			 * Not a data integrity writeout, just continue
			 */
			if (!must_wait)
				continue;

			spin_unlock(&bdi_lock);
			__wbc = *wbc;
			__wbc.bdi = bdi;
			writeback_inodes_wbc(&__wbc);
			goto restart;
		}
		if (must_wait)
			list_add_tail(&work->wait_list, &list);

		bdi_queue_work(bdi, work);
	}

	spin_unlock(&bdi_lock);

	/*
	 * If this is for WB_SYNC_ALL, wait for pending work to complete
	 * before returning.
	 */
	while (!list_empty(&list)) {
		work = list_entry(list.next, struct bdi_work, wait_list);
		list_del(&work->wait_list);

		bdi_wait_on_work_clear(work);
		call_rcu(&work->rcu_head, bdi_work_free);
	}
}

/*
 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
 * the whole world.
 */
void wakeup_flusher_threads(long nr_pages)
{
	struct writeback_control wbc = {
		.sync_mode	= WB_SYNC_NONE,
		.older_than_this = NULL,
		.range_cyclic	= 1,
	};

	if (nr_pages == 0)
		nr_pages = global_page_state(NR_FILE_DIRTY) +
				global_page_state(NR_UNSTABLE_NFS);
	wbc.nr_to_write = nr_pages;
	bdi_writeback_all(&wbc);
}
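
/*
 * Illustrative sketch (not part of the original file): a hypothetical caller
 * asking every backing device to write out roughly half of the currently
 * dirty pages, without waiting for the I/O to finish.
 */
static inline void example_relieve_dirty_memory(void)
{
	long nr = (global_page_state(NR_FILE_DIRTY) +
		   global_page_state(NR_UNSTABLE_NFS)) / 2;

	wakeup_flusher_threads(nr);
}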

static noinline void block_dump___mark_inode_dirty(struct inode *inode)
{
	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
		struct dentry *dentry;
		const char *name = "?";

		dentry = d_find_alias(inode);
		if (dentry) {
			spin_lock(&dentry->d_lock);
			name = (const char *) dentry->d_name.name;
		}
		printk(KERN_DEBUG
		       "%s(%d): dirtied inode %lu (%s) on %s\n",
		       current->comm, task_pid_nr(current), inode->i_ino,
		       name, inode->i_sb->s_id);
		if (dentry) {
			spin_unlock(&dentry->d_lock);
			dput(dentry);
		}
	}
}

/**
 * __mark_inode_dirty - internal function
 * @inode: inode to mark
 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
 * Mark an inode as dirty. Callers should use mark_inode_dirty or
 * mark_inode_dirty_sync.
 *
 * Put the inode on the super block's dirty list.
 *
 * CAREFUL! We mark it dirty unconditionally, but move it onto the
 * dirty list only if it is hashed or if it refers to a blockdev.
 * If it was not hashed, it will never be added to the dirty list
 * even if it is later hashed, as it will have been marked dirty already.
 *
 * In short, make sure you hash any inodes _before_ you start marking
 * them dirty.
 *
 * This function *must* be atomic for the I_DIRTY_PAGES case -
 * set_page_dirty() is called under spinlock in several places.
 *
 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
 * the kernel-internal blockdev inode represents the dirtying time of the
 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
 * page->mapping->host, so the page-dirtying time is recorded in the internal
 * blockdev inode.
 */
void __mark_inode_dirty(struct inode *inode, int flags)
{
	struct super_block *sb = inode->i_sb;

	/*
	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
	 * dirty the inode itself
	 */
	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
		if (sb->s_op->dirty_inode)
			sb->s_op->dirty_inode(inode);
	}

	/*
	 * make sure that changes are seen by all cpus before we test i_state
	 * -- mikulas
	 */
	smp_mb();

	/* avoid the locking if we can */
	if ((inode->i_state & flags) == flags)
		return;

	if (unlikely(block_dump))
		block_dump___mark_inode_dirty(inode);

	spin_lock(&inode_lock);
	if ((inode->i_state & flags) != flags) {
		const int was_dirty = inode->i_state & I_DIRTY;

		inode->i_state |= flags;

		/*
		 * If the inode is being synced, just update its dirty state.
		 * The unlocker will place the inode on the appropriate
		 * superblock list, based upon its state.
		 */
		if (inode->i_state & I_SYNC)
			goto out;

		/*
		 * Only add valid (hashed) inodes to the superblock's
		 * dirty list. Add blockdev inodes as well.
		 */
		if (!S_ISBLK(inode->i_mode)) {
			if (hlist_unhashed(&inode->i_hash))
				goto out;
		}
		if (inode->i_state & (I_FREEING|I_CLEAR))
			goto out;

		/*
		 * If the inode was already on b_dirty/b_io/b_more_io, don't
		 * reposition it (that would break b_dirty time-ordering).
		 */
		if (!was_dirty) {
			struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
			struct backing_dev_info *bdi = wb->bdi;

			if (bdi_cap_writeback_dirty(bdi) &&
			    !test_bit(BDI_registered, &bdi->state)) {
				WARN_ON(1);
				printk(KERN_ERR "bdi-%s not registered\n",
				       bdi->name);
			}

			inode->dirtied_when = jiffies;
			list_move(&inode->i_list, &wb->b_dirty);
		}
	}
out:
	spin_unlock(&inode_lock);
}
EXPORT_SYMBOL(__mark_inode_dirty);
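
/*
 * Illustrative sketch (not part of the original file): callers normally reach
 * __mark_inode_dirty() through the mark_inode_dirty()/mark_inode_dirty_sync()
 * wrappers in include/linux/fs.h, which boil down to something like the
 * helpers below (renamed here to avoid clashing with the real inlines).
 */
static inline void example_mark_inode_dirty(struct inode *inode)
{
	__mark_inode_dirty(inode, I_DIRTY);
}

static inline void example_mark_inode_dirty_sync(struct inode *inode)
{
	__mark_inode_dirty(inode, I_DIRTY_SYNC);
}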

/*
 * Write out a superblock's list of dirty inodes. A wait will be performed
 * upon no inodes, all inodes or the final one, depending upon sync_mode.
 *
 * If older_than_this is non-NULL, then only write out inodes which
 * had their first dirtying at a time earlier than *older_than_this.
 *
 * If we're a pdflush thread, then implement pdflush collision avoidance
 * against the entire list.
 *
 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
 * This function assumes that the blockdev superblock's inodes are backed by
 * a variety of queues, so all inodes are searched. For other superblocks,
 * assume that all inodes are backed by the same queue.
 *
 * The inodes to be written are parked on bdi->b_io. They are moved back onto
 * bdi->b_dirty as they are selected for writing. This way, none can be missed
 * on the writer throttling path, and we get decent balancing between many
 * throttled threads: we don't want them all piling up on inode_sync_wait.
 */
static void wait_sb_inodes(struct writeback_control *wbc)
{
	struct inode *inode, *old_inode = NULL;

	/*
	 * We need to be protected against the filesystem going from
	 * r/o to r/w or vice versa.
	 */
	WARN_ON(!rwsem_is_locked(&wbc->sb->s_umount));

	spin_lock(&inode_lock);

	/*
	 * Data integrity sync. Must wait for all pages under writeback,
	 * because there may have been pages dirtied before our sync
	 * call, but which had writeout started before we write it out.
	 * In which case, the inode may not be on the dirty list, but
	 * we still have to wait for that writeout.
	 */
	list_for_each_entry(inode, &wbc->sb->s_inodes, i_sb_list) {
		struct address_space *mapping;

		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
			continue;
		mapping = inode->i_mapping;
		if (mapping->nrpages == 0)
			continue;
		__iget(inode);
		spin_unlock(&inode_lock);
		/*
		 * We hold a reference to 'inode' so it couldn't have
		 * been removed from s_inodes list while we dropped the
		 * inode_lock. We cannot iput the inode now as we can
		 * be holding the last reference and we cannot iput it
		 * under inode_lock. So we keep the reference and iput
		 * it later.
		 */
		iput(old_inode);
		old_inode = inode;

		filemap_fdatawait(mapping);

		cond_resched();

		spin_lock(&inode_lock);
	}
	spin_unlock(&inode_lock);
	iput(old_inode);
}

/**
 * writeback_inodes_sb - writeback dirty inodes from given super_block
 * @sb: the superblock
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO. The number of pages submitted is
 * returned.
 */
long writeback_inodes_sb(struct super_block *sb)
{
	struct writeback_control wbc = {
		.sb		= sb,
		.sync_mode	= WB_SYNC_NONE,
		.range_start	= 0,
		.range_end	= LLONG_MAX,
	};
	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
	long nr_to_write;

	nr_to_write = nr_dirty + nr_unstable +
			(inodes_stat.nr_inodes - inodes_stat.nr_unused);

	wbc.nr_to_write = nr_to_write;
	bdi_writeback_all(&wbc);

	return nr_to_write - wbc.nr_to_write;
}
EXPORT_SYMBOL(writeback_inodes_sb);

/**
 * sync_inodes_sb - sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block. The number of pages synced is returned.
 */
long sync_inodes_sb(struct super_block *sb)
{
	struct writeback_control wbc = {
		.sb		= sb,
		.sync_mode	= WB_SYNC_ALL,
		.range_start	= 0,
		.range_end	= LLONG_MAX,
	};
	long nr_to_write = LONG_MAX; /* doesn't actually matter */

	wbc.nr_to_write = nr_to_write;
	bdi_writeback_all(&wbc);
	wait_sb_inodes(&wbc);

	return nr_to_write - wbc.nr_to_write;
}
EXPORT_SYMBOL(sync_inodes_sb);
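
/*
 * Illustrative sketch (not part of the original file): how a sync(2)-style
 * path might combine the two superblock helpers above -- an asynchronous pass
 * to get I/O going, followed by a waiting pass for data integrity. The real
 * logic lives in fs/sync.c; this is only a simplified outline.
 */
static inline void example_sync_one_sb(struct super_block *sb)
{
	writeback_inodes_sb(sb);	/* start writeback, don't wait */
	sync_inodes_sb(sb);		/* write and wait on everything */
}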

/**
 * write_inode_now - write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
 *
 * This function commits an inode to disk immediately if it is dirty. This is
 * primarily needed by knfsd.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 */
int write_inode_now(struct inode *inode, int sync)
{
	int ret;
	struct writeback_control wbc = {
		.nr_to_write = LONG_MAX,
		.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
		.range_start = 0,
		.range_end = LLONG_MAX,
	};

	if (!mapping_cap_writeback_dirty(inode->i_mapping))
		wbc.nr_to_write = 0;

	might_sleep();
	spin_lock(&inode_lock);
	ret = writeback_single_inode(inode, &wbc);
	spin_unlock(&inode_lock);
	if (sync)
		inode_sync_wait(inode);
	return ret;
}
EXPORT_SYMBOL(write_inode_now);

/**
 * sync_inode - write an inode and its pages to disk.
 * @inode: the inode to sync
 * @wbc: controls the writeback mode
 *
 * sync_inode() will write an inode and its pages to disk. It will also
 * correctly update the inode on its superblock's dirty inode lists and will
 * update inode->i_state.
 *
 * The caller must have a ref on the inode.
 */
int sync_inode(struct inode *inode, struct writeback_control *wbc)
{
	int ret;

	spin_lock(&inode_lock);
	ret = writeback_single_inode(inode, wbc);
	spin_unlock(&inode_lock);
	return ret;
}
EXPORT_SYMBOL(sync_inode);
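
/*
 * Illustrative sketch (not part of the original file): a hypothetical caller
 * using sync_inode() to write one inode's pages and metadata and wait for
 * completion, much as an fsync-like path would. The caller must hold a
 * reference on the inode.
 */
static inline int example_fsync_inode(struct inode *inode)
{
	struct writeback_control wbc = {
		.sync_mode	= WB_SYNC_ALL,
		.nr_to_write	= LONG_MAX,
		.range_start	= 0,
		.range_end	= LLONG_MAX,
	};

	return sync_inode(inode, &wbc);
}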