truncate.c

/*
 * mm/truncate.c - code for taking down pages from address_spaces
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 10Sep2002	Andrew Morton
 *		Initial version.
 */

#include <linux/kernel.h>
#include <linux/backing-dev.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/buffer_head.h>	/* grr. try_to_release_page,
				   do_invalidatepage */
#include "internal.h"

/**
 * do_invalidatepage - invalidate part or all of a page
 * @page: the page which is affected
 * @offset: the index of the truncation point
 *
 * do_invalidatepage() is called when all or part of the page has become
 * invalidated by a truncate operation.
 *
 * do_invalidatepage() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point, because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 */
void do_invalidatepage(struct page *page, unsigned long offset)
{
	void (*invalidatepage)(struct page *, unsigned long);

	invalidatepage = page->mapping->a_ops->invalidatepage;
#ifdef CONFIG_BLOCK
	if (!invalidatepage)
		invalidatepage = block_invalidatepage;
#endif
	if (invalidatepage)
		(*invalidatepage)(page, offset);
}

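/*
 * Illustrative sketch (not part of this file, hence #if 0): a filesystem
 * that needs cleanup beyond block_invalidatepage() can supply its own
 * ->invalidatepage in its address_space_operations, which is the hook
 * do_invalidatepage() dispatches to above. The myfs_* names below are
 * hypothetical.
 */
#if 0
static void myfs_invalidatepage(struct page *page, unsigned long offset)
{
	/* filesystem-private bookkeeping for the truncated range ... */
	block_invalidatepage(page, offset);	/* drop buffers past @offset */
}

static const struct address_space_operations myfs_aops = {
	.invalidatepage	= myfs_invalidatepage,
	/* ... other methods ... */
};
#endif
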
static inline void truncate_partial_page(struct page *page, unsigned partial)
{
	zero_user_segment(page, partial, PAGE_CACHE_SIZE);
	if (page_has_private(page))
		do_invalidatepage(page, partial);
}

/*
 * This cancels just the dirty bit on the kernel page itself, it
 * does NOT actually remove dirty bits on any mmap's that may be
 * around. It also leaves the page tagged dirty, so any sync
 * activity will still find it on the dirty lists, and in particular,
 * clear_page_dirty_for_io() will still look at the dirty bits in
 * the VM.
 *
 * Doing this should *normally* only ever be done when a page
 * is truncated, and is not actually mapped anywhere at all. However,
 * fs/buffer.c does this when it notices that somebody has cleaned
 * out all the buffers on a page without actually doing it through
 * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
 */
void cancel_dirty_page(struct page *page, unsigned int account_size)
{
	if (TestClearPageDirty(page)) {
		struct address_space *mapping = page->mapping;

		if (mapping && mapping_cap_account_dirty(mapping)) {
			dec_zone_page_state(page, NR_FILE_DIRTY);
			dec_bdi_stat(mapping->backing_dev_info,
					BDI_RECLAIMABLE);
			if (account_size)
				task_io_account_cancelled_write(account_size);
		}
	}
}
EXPORT_SYMBOL(cancel_dirty_page);

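/*
 * Illustrative sketch (not part of this file, hence #if 0): the fs/buffer.c
 * situation described above. A filesystem that has cleaned a page's buffers
 * behind the VM's back cancels the page-level dirty accounting so the page
 * is not submitted for writeback a second time. The helper name is
 * hypothetical.
 */
#if 0
static void myfs_page_buffers_cleaned(struct page *page)
{
	/* buffers already written out; undo the VM's dirty accounting */
	cancel_dirty_page(page, PAGE_CACHE_SIZE);
}
#endif
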
/*
 * If truncate cannot remove the fs-private metadata from the page, the page
 * becomes orphaned. It will be left on the LRU and may even be mapped into
 * user pagetables if we're racing with filemap_fault().
 *
 * We need to bail out if page->mapping is no longer equal to the original
 * mapping. This happens a) when the VM reclaimed the page while we waited on
 * its lock, b) when a concurrent invalidate_mapping_pages got there first and
 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
 */
static int
truncate_complete_page(struct address_space *mapping, struct page *page)
{
	if (page->mapping != mapping)
		return -EIO;

	if (page_has_private(page))
		do_invalidatepage(page, 0);

	cancel_dirty_page(page, PAGE_CACHE_SIZE);

	clear_page_mlock(page);
	remove_from_page_cache(page);
	ClearPageMappedToDisk(page);
	page_cache_release(page);	/* pagecache ref */
	return 0;
}

/*
 * This is for invalidate_mapping_pages(). That function can be called at
 * any time, and is not supposed to throw away dirty pages. But pages can
 * be marked dirty at any time too, so use remove_mapping which safely
 * discards clean, unused pages.
 *
 * Returns non-zero if the page was successfully invalidated.
 */
static int
invalidate_complete_page(struct address_space *mapping, struct page *page)
{
	int ret;

	if (page->mapping != mapping)
		return 0;

	if (page_has_private(page) && !try_to_release_page(page, 0))
		return 0;

	clear_page_mlock(page);
	ret = remove_mapping(mapping, page);

	return ret;
}

int truncate_inode_page(struct address_space *mapping, struct page *page)
{
	if (page_mapped(page)) {
		unmap_mapping_range(mapping,
				   (loff_t)page->index << PAGE_CACHE_SHIFT,
				   PAGE_CACHE_SIZE, 0);
	}
	return truncate_complete_page(mapping, page);
}

/*
 * Used to get rid of pages on hardware memory corruption.
 */
int generic_error_remove_page(struct address_space *mapping, struct page *page)
{
	if (!mapping)
		return -EINVAL;
	/*
	 * Only punch for normal data pages for now.
	 * Handling other types like directories would need more auditing.
	 */
	if (!S_ISREG(mapping->host->i_mode))
		return -EIO;
	return truncate_inode_page(mapping, page);
}
EXPORT_SYMBOL(generic_error_remove_page);

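/*
 * Illustrative sketch (not part of this file, hence #if 0): filesystems
 * opt in to hwpoison page removal by pointing ->error_remove_page at the
 * generic helper from their address_space_operations. myfs_aops is
 * hypothetical.
 */
#if 0
static const struct address_space_operations myfs_aops = {
	.error_remove_page	= generic_error_remove_page,
	/* ... other methods ... */
};
#endif
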
/*
 * Safely invalidate one page from its pagecache mapping.
 * It only drops clean, unused pages. The page must be locked.
 *
 * Returns 1 if the page is successfully invalidated, otherwise 0.
 */
int invalidate_inode_page(struct page *page)
{
	struct address_space *mapping = page_mapping(page);
	if (!mapping)
		return 0;
	if (PageDirty(page) || PageWriteback(page))
		return 0;
	if (page_mapped(page))
		return 0;
	return invalidate_complete_page(mapping, page);
}

/**
 * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 * @lend: offset to which to truncate
 *
 * Truncate the page cache, removing the pages that lie between the
 * specified offsets (and zeroing out the partial page if lstart is not
 * page aligned).
 *
 * Truncate takes two passes - the first pass is nonblocking. It will not
 * block on page locks and it will not block on writeback. The second pass
 * will wait. This is to prevent as much IO as possible in the affected region.
 * The first pass will remove most pages, so the search cost of the second pass
 * is low.
 *
 * When looking at page->index outside the page lock we need to be careful to
 * copy it into a local to avoid races (it could change at any time).
 *
 * We pass down the cache-hot hint to the page freeing code. Even if the
 * mapping is large, it is probably the case that the final pages are the most
 * recently touched, and freeing happens in ascending file offset order.
 */
void truncate_inode_pages_range(struct address_space *mapping,
				loff_t lstart, loff_t lend)
{
	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
	pgoff_t end;
	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
	struct pagevec pvec;
	pgoff_t next;
	int i;

	if (mapping->nrpages == 0)
		return;

	BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
	end = (lend >> PAGE_CACHE_SHIFT);

	pagevec_init(&pvec, 0);
	next = start;
	while (next <= end &&
	       pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];
			pgoff_t page_index = page->index;

			if (page_index > end) {
				next = page_index;
				break;
			}

			if (page_index > next)
				next = page_index;
			next++;
			if (!trylock_page(page))
				continue;
			if (PageWriteback(page)) {
				unlock_page(page);
				continue;
			}
			truncate_inode_page(mapping, page);
			unlock_page(page);
		}
		pagevec_release(&pvec);
		cond_resched();
	}

	if (partial) {
		struct page *page = find_lock_page(mapping, start - 1);
		if (page) {
			wait_on_page_writeback(page);
			truncate_partial_page(page, partial);
			unlock_page(page);
			page_cache_release(page);
		}
	}

	next = start;
	for ( ; ; ) {
		cond_resched();
		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
			if (next == start)
				break;
			next = start;
			continue;
		}
		if (pvec.pages[0]->index > end) {
			pagevec_release(&pvec);
			break;
		}
		mem_cgroup_uncharge_start();
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			if (page->index > end)
				break;
			lock_page(page);
			wait_on_page_writeback(page);
			truncate_inode_page(mapping, page);
			if (page->index > next)
				next = page->index;
			next++;
			unlock_page(page);
		}
		pagevec_release(&pvec);
		mem_cgroup_uncharge_end();
	}
}
EXPORT_SYMBOL(truncate_inode_pages_range);

/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Called under (and serialised by) inode->i_mutex.
 */
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
	truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
}
EXPORT_SYMBOL(truncate_inode_pages);

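/*
 * Illustrative sketch (not part of this file, hence #if 0): the usual
 * caller is inode teardown, which discards the whole pagecache before
 * the inode is freed. myfs_evict_inode is a hypothetical ->evict_inode.
 */
#if 0
static void myfs_evict_inode(struct inode *inode)
{
	truncate_inode_pages(&inode->i_data, 0);	/* drop every cached page */
	/* ... filesystem-specific teardown follows ... */
}
#endif
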
/**
 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
 * @mapping: the address_space which holds the pages to invalidate
 * @start: the offset 'from' which to invalidate
 * @end: the offset 'to' which to invalidate (inclusive)
 *
 * This function only removes the unlocked pages; if you want to
 * remove all the pages of one inode, you must call truncate_inode_pages.
 *
 * invalidate_mapping_pages() will not block on IO activity. It will not
 * invalidate pages which are dirty, locked, under writeback or mapped into
 * pagetables.
 */
unsigned long invalidate_mapping_pages(struct address_space *mapping,
				       pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	pgoff_t next = start;
	unsigned long ret = 0;
	int i;

	pagevec_init(&pvec, 0);
	while (next <= end &&
			pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
		mem_cgroup_uncharge_start();
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];
			pgoff_t index;
			int lock_failed;

			lock_failed = !trylock_page(page);

			/*
			 * We really shouldn't be looking at the ->index of an
			 * unlocked page. But we're not allowed to lock these
			 * pages. So we rely upon nobody altering the ->index
			 * of this (pinned-by-us) page.
			 */
			index = page->index;
			if (index > next)
				next = index;
			next++;
			if (lock_failed)
				continue;

			ret += invalidate_inode_page(page);

			unlock_page(page);
			if (next > end)
				break;
		}
		pagevec_release(&pvec);
		mem_cgroup_uncharge_end();
		cond_resched();
	}
	return ret;
}
EXPORT_SYMBOL(invalidate_mapping_pages);

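/*
 * Illustrative sketch (not part of this file, hence #if 0): best-effort
 * cache trimming in the style of "echo 1 > /proc/sys/vm/drop_caches".
 * Only clean, unlocked, unmapped pages are dropped; the return value is
 * the number invalidated. The helper name is hypothetical.
 */
#if 0
static unsigned long myfs_trim_pagecache(struct inode *inode)
{
	/* dirty, locked, writeback and mapped pages survive this */
	return invalidate_mapping_pages(inode->i_mapping, 0, -1);
}
#endif
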
/*
 * This is like invalidate_complete_page(), except it ignores the page's
 * refcount. We do this because invalidate_inode_pages2() needs stronger
 * invalidation guarantees, and cannot afford to leave pages behind because
 * shrink_page_list() has a temp ref on them, or because they're transiently
 * sitting in the lru_cache_add() pagevecs.
 */
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
	if (page->mapping != mapping)
		return 0;

	if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
		return 0;

	spin_lock_irq(&mapping->tree_lock);
	if (PageDirty(page))
		goto failed;

	clear_page_mlock(page);
	BUG_ON(page_has_private(page));
	__remove_from_page_cache(page);
	spin_unlock_irq(&mapping->tree_lock);
	mem_cgroup_uncharge_cache_page(page);
	page_cache_release(page);	/* pagecache ref */
	return 1;
failed:
	spin_unlock_irq(&mapping->tree_lock);
	return 0;
}

static int do_launder_page(struct address_space *mapping, struct page *page)
{
	if (!PageDirty(page))
		return 0;
	if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
		return 0;
	return mapping->a_ops->launder_page(page);
}

/**
 * invalidate_inode_pages2_range - remove range of pages from an address_space
 * @mapping: the address_space
 * @start: the page offset 'from' which to invalidate
 * @end: the page offset 'to' which to invalidate (inclusive)
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Returns -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2_range(struct address_space *mapping,
				  pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	pgoff_t next;
	int i;
	int ret = 0;
	int ret2 = 0;
	int did_range_unmap = 0;
	int wrapped = 0;

	pagevec_init(&pvec, 0);
	next = start;
	while (next <= end && !wrapped &&
		pagevec_lookup(&pvec, mapping, next,
			min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
		mem_cgroup_uncharge_start();
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];
			pgoff_t page_index;

			lock_page(page);
			if (page->mapping != mapping) {
				unlock_page(page);
				continue;
			}
			page_index = page->index;
			next = page_index + 1;
			if (next == 0)
				wrapped = 1;
			if (page_index > end) {
				unlock_page(page);
				break;
			}
			wait_on_page_writeback(page);
			if (page_mapped(page)) {
				if (!did_range_unmap) {
					/*
					 * Zap the rest of the file in one hit.
					 */
					unmap_mapping_range(mapping,
					   (loff_t)page_index<<PAGE_CACHE_SHIFT,
					   (loff_t)(end - page_index + 1)
							<< PAGE_CACHE_SHIFT,
					    0);
					did_range_unmap = 1;
				} else {
					/*
					 * Just zap this page
					 */
					unmap_mapping_range(mapping,
					  (loff_t)page_index<<PAGE_CACHE_SHIFT,
					  PAGE_CACHE_SIZE, 0);
				}
			}
			BUG_ON(page_mapped(page));
			ret2 = do_launder_page(mapping, page);
			if (ret2 == 0) {
				if (!invalidate_complete_page2(mapping, page))
					ret2 = -EBUSY;
			}
			if (ret2 < 0)
				ret = ret2;
			unlock_page(page);
		}
		pagevec_release(&pvec);
		mem_cgroup_uncharge_end();
		cond_resched();
	}
	return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);

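/*
 * Illustrative sketch (not part of this file, hence #if 0): a typical
 * caller is a direct-IO write path, which must shoot down cached pages
 * covering the range it just wrote so later buffered reads see the new
 * data. The helper name and call site are assumptions for illustration.
 */
#if 0
static int myfs_invalidate_dio_range(struct address_space *mapping,
				     loff_t pos, size_t count)
{
	pgoff_t first = pos >> PAGE_CACHE_SHIFT;
	pgoff_t last = (pos + count - 1) >> PAGE_CACHE_SHIFT;

	/* -EBUSY means at least one page could not be invalidated */
	return invalidate_inode_pages2_range(mapping, first, last);
}
#endif
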
/**
 * invalidate_inode_pages2 - remove all pages from an address_space
 * @mapping: the address_space
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Returns -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2(struct address_space *mapping)
{
	return invalidate_inode_pages2_range(mapping, 0, -1);
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2);

/**
 * truncate_pagecache - unmap and remove pagecache that has been truncated
 * @inode: inode
 * @old: old file offset
 * @new: new file offset
 *
 * inode's new i_size must already be written before truncate_pagecache
 * is called.
 *
 * This function should typically be called before the filesystem
 * releases resources associated with the freed range (eg. deallocates
 * blocks). This way, pagecache will always stay logically coherent
 * with on-disk format, and the filesystem would not have to deal with
 * situations such as writepage being called for a page that has already
 * had its underlying blocks deallocated.
 */
void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
{
	struct address_space *mapping = inode->i_mapping;

	/*
	 * unmap_mapping_range is called twice, first simply for
	 * efficiency so that truncate_inode_pages does fewer
	 * single-page unmaps.  However after this first call, and
	 * before truncate_inode_pages finishes, it is possible for
	 * private pages to be COWed, which remain after
	 * truncate_inode_pages finishes, hence the second
	 * unmap_mapping_range call must be made for correctness.
	 */
	unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
	truncate_inode_pages(mapping, new);
	unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
}
EXPORT_SYMBOL(truncate_pagecache);

/**
 * truncate_setsize - update inode and pagecache for a new file size
 * @inode: inode
 * @newsize: new file size
 *
 * truncate_setsize updates i_size and performs pagecache truncation
 * (if necessary) for a file size update. It will typically be called
 * from the filesystem's setattr function when ATTR_SIZE is passed in.
 *
 * Must be called with inode_mutex held and after all filesystem
 * specific block truncation has been performed.
 */
void truncate_setsize(struct inode *inode, loff_t newsize)
{
	loff_t oldsize;

	oldsize = inode->i_size;
	i_size_write(inode, newsize);

	truncate_pagecache(inode, oldsize, newsize);
}
EXPORT_SYMBOL(truncate_setsize);

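/*
 * Illustrative sketch (not part of this file, hence #if 0): the setattr
 * pattern the comment above describes. The myfs_* names are hypothetical;
 * filesystem-specific block truncation is elided here and must be ordered
 * per the rules documented on truncate_setsize() and truncate_pagecache().
 */
#if 0
static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if (attr->ia_valid & ATTR_SIZE)
		truncate_setsize(inode, attr->ia_size);

	setattr_copy(inode, attr);
	mark_inode_dirty(inode);
	return 0;
}
#endif
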
/**
 * vmtruncate - unmap mappings "freed" by truncate() syscall
 * @inode: inode of the file used
 * @offset: file offset to start truncating
 *
 * This function is deprecated and truncate_setsize or truncate_pagecache
 * should be used instead, together with filesystem specific block truncation.
 */
int vmtruncate(struct inode *inode, loff_t offset)
{
	int error;

	error = inode_newsize_ok(inode, offset);
	if (error)
		return error;

	truncate_setsize(inode, offset);
	if (inode->i_op->truncate)
		inode->i_op->truncate(inode);
	return 0;
}
EXPORT_SYMBOL(vmtruncate);