/*
 * mm/truncate.c - code for taking down pages from address_spaces
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 10Sep2002    Andrew Morton
 *              Initial version.
 */

#include <linux/kernel.h>
#include <linux/backing-dev.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/buffer_head.h>  /* grr. try_to_release_page,
                                   do_invalidatepage */
#include "internal.h"

/**
 * do_invalidatepage - invalidate part or all of a page
 * @page: the page which is affected
 * @offset: the index of the truncation point
 *
 * do_invalidatepage() is called when all or part of the page has become
 * invalidated by a truncate operation.
 *
 * do_invalidatepage() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point, because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 */
void do_invalidatepage(struct page *page, unsigned long offset)
{
        void (*invalidatepage)(struct page *, unsigned long);
        invalidatepage = page->mapping->a_ops->invalidatepage;
#ifdef CONFIG_BLOCK
        if (!invalidatepage)
                invalidatepage = block_invalidatepage;
#endif
        if (invalidatepage)
                (*invalidatepage)(page, offset);
}
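
/*
 * Example (sketch, not compiled): how a filesystem hooks into
 * do_invalidatepage().  The myfs_* names are hypothetical; a block-backed
 * filesystem may instead leave ->invalidatepage NULL and rely on the
 * block_invalidatepage() fallback above.
 */
#if 0
static void myfs_invalidatepage(struct page *page, unsigned long offset)
{
        /* drop fs-private state attached to the truncated tail of the page */
        block_invalidatepage(page, offset);
}

static const struct address_space_operations myfs_aops = {
        .invalidatepage = myfs_invalidatepage,
        /* ... other methods ... */
};
#endif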

static inline void truncate_partial_page(struct page *page, unsigned partial)
{
        zero_user_segment(page, partial, PAGE_CACHE_SIZE);
        if (page_has_private(page))
                do_invalidatepage(page, partial);
}

/*
 * This cancels just the dirty bit on the kernel page itself; it
 * does NOT actually remove dirty bits on any mmap's that may be
 * around. It also leaves the page tagged dirty, so any sync
 * activity will still find it on the dirty lists, and in particular,
 * clear_page_dirty_for_io() will still look at the dirty bits in
 * the VM.
 *
 * Doing this should *normally* only ever be done when a page
 * is truncated, and is not actually mapped anywhere at all. However,
 * fs/buffer.c does this when it notices that somebody has cleaned
 * out all the buffers on a page without actually doing it through
 * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
 */
void cancel_dirty_page(struct page *page, unsigned int account_size)
{
        if (TestClearPageDirty(page)) {
                struct address_space *mapping = page->mapping;
                if (mapping && mapping_cap_account_dirty(mapping)) {
                        dec_zone_page_state(page, NR_FILE_DIRTY);
                        dec_bdi_stat(mapping->backing_dev_info,
                                        BDI_RECLAIMABLE);
                        if (account_size)
                                task_io_account_cancelled_write(account_size);
                }
        }
}
EXPORT_SYMBOL(cancel_dirty_page);
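
/*
 * Example (sketch, not compiled): the fs/buffer.c-style use described
 * above.  Once every buffer on a locked page has been cleaned behind the
 * VM's back, the page's dirty accounting is cancelled.  The surrounding
 * helper is hypothetical.
 */
#if 0
static void myfs_page_buffers_cleaned(struct page *page)
{
        /* all buffers clean; undo the dirty accounting for the whole page */
        cancel_dirty_page(page, PAGE_CACHE_SIZE);
}
#endif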

/*
 * If truncate cannot remove the fs-private metadata from the page, the page
 * becomes orphaned. It will be left on the LRU and may even be mapped into
 * user pagetables if we're racing with filemap_fault().
 *
 * We need to bail out if page->mapping is no longer equal to the original
 * mapping. This happens a) when the VM reclaimed the page while we waited on
 * its lock, b) when a concurrent invalidate_mapping_pages got there first and
 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
 */
static int
truncate_complete_page(struct address_space *mapping, struct page *page)
{
        if (page->mapping != mapping)
                return -EIO;

        if (page_has_private(page))
                do_invalidatepage(page, 0);

        cancel_dirty_page(page, PAGE_CACHE_SIZE);

        clear_page_mlock(page);
        remove_from_page_cache(page);
        ClearPageMappedToDisk(page);
        page_cache_release(page);       /* pagecache ref */
        return 0;
}

/*
 * This is for invalidate_mapping_pages(). That function can be called at
 * any time, and is not supposed to throw away dirty pages. But pages can
 * be marked dirty at any time too, so use remove_mapping which safely
 * discards clean, unused pages.
 *
 * Returns non-zero if the page was successfully invalidated.
 */
static int
invalidate_complete_page(struct address_space *mapping, struct page *page)
{
        int ret;

        if (page->mapping != mapping)
                return 0;

        if (page_has_private(page) && !try_to_release_page(page, 0))
                return 0;

        clear_page_mlock(page);
        ret = remove_mapping(mapping, page);

        return ret;
}

int truncate_inode_page(struct address_space *mapping, struct page *page)
{
        if (page_mapped(page)) {
                unmap_mapping_range(mapping,
                                   (loff_t)page->index << PAGE_CACHE_SHIFT,
                                   PAGE_CACHE_SIZE, 0);
        }
        return truncate_complete_page(mapping, page);
}

/*
 * Used to get rid of pages on hardware memory corruption.
 */
int generic_error_remove_page(struct address_space *mapping, struct page *page)
{
        if (!mapping)
                return -EINVAL;
        /*
         * Only punch for normal data pages for now.
         * Handling other types like directories would need more auditing.
         */
        if (!S_ISREG(mapping->host->i_mode))
                return -EIO;
        return truncate_inode_page(mapping, page);
}
EXPORT_SYMBOL(generic_error_remove_page);
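
/*
 * Example (sketch, not compiled): generic_error_remove_page() is normally
 * wired up through address_space_operations, so the memory-failure code can
 * reach it via mapping->a_ops->error_remove_page.  myfs_file_aops is
 * hypothetical.
 */
#if 0
static const struct address_space_operations myfs_file_aops = {
        .error_remove_page = generic_error_remove_page,
        /* ... other methods ... */
};
#endif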

/*
 * Safely invalidate one page from its pagecache mapping.
 * It only drops clean, unused pages. The page must be locked.
 *
 * Returns 1 if the page is successfully invalidated, otherwise 0.
 */
int invalidate_inode_page(struct page *page)
{
        struct address_space *mapping = page_mapping(page);
        if (!mapping)
                return 0;
        if (PageDirty(page) || PageWriteback(page))
                return 0;
        if (page_mapped(page))
                return 0;
        return invalidate_complete_page(mapping, page);
}
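
/*
 * Example (sketch, not compiled): the calling convention for
 * invalidate_inode_page() - the page must be locked and pinned across the
 * call; a return of 1 means the page was dropped from the cache.
 */
#if 0
static int try_drop_one_page(struct page *page)
{
        int ret = 0;

        if (trylock_page(page)) {
                ret = invalidate_inode_page(page);
                unlock_page(page);
        }
        return ret;
}
#endif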

/**
 * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 * @lend: offset to which to truncate
 *
 * Truncate the page cache, removing the pages that are between
 * the specified offsets (and zeroing out the partial page
 * if lstart is not page aligned).
 *
 * Truncate takes two passes - the first pass is nonblocking. It will not
 * block on page locks and it will not block on writeback. The second pass
 * will wait. This is to prevent as much IO as possible in the affected region.
 * The first pass will remove most pages, so the search cost of the second pass
 * is low.
 *
 * When looking at page->index outside the page lock we need to be careful to
 * copy it into a local to avoid races (it could change at any time).
 *
 * We pass down the cache-hot hint to the page freeing code. Even if the
 * mapping is large, it is probably the case that the final pages are the most
 * recently touched, and freeing happens in ascending file offset order.
 */
void truncate_inode_pages_range(struct address_space *mapping,
                                loff_t lstart, loff_t lend)
{
        const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
        pgoff_t end;
        const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
        struct pagevec pvec;
        pgoff_t next;
        int i;

        if (mapping->nrpages == 0)
                return;

        BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
        end = (lend >> PAGE_CACHE_SHIFT);

        pagevec_init(&pvec, 0);
        next = start;
        while (next <= end &&
               pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
                mem_cgroup_uncharge_start();
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];
                        pgoff_t page_index = page->index;

                        if (page_index > end) {
                                next = page_index;
                                break;
                        }

                        if (page_index > next)
                                next = page_index;
                        next++;
                        if (!trylock_page(page))
                                continue;
                        if (PageWriteback(page)) {
                                unlock_page(page);
                                continue;
                        }
                        truncate_inode_page(mapping, page);
                        unlock_page(page);
                }
                pagevec_release(&pvec);
                mem_cgroup_uncharge_end();
                cond_resched();
        }

        if (partial) {
                struct page *page = find_lock_page(mapping, start - 1);
                if (page) {
                        wait_on_page_writeback(page);
                        truncate_partial_page(page, partial);
                        unlock_page(page);
                        page_cache_release(page);
                }
        }

        next = start;
        for ( ; ; ) {
                cond_resched();
                if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
                        if (next == start)
                                break;
                        next = start;
                        continue;
                }
                if (pvec.pages[0]->index > end) {
                        pagevec_release(&pvec);
                        break;
                }
                mem_cgroup_uncharge_start();
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];

                        if (page->index > end)
                                break;
                        lock_page(page);
                        wait_on_page_writeback(page);
                        truncate_inode_page(mapping, page);
                        if (page->index > next)
                                next = page->index;
                        next++;
                        unlock_page(page);
                }
                pagevec_release(&pvec);
                mem_cgroup_uncharge_end();
        }
}
EXPORT_SYMBOL(truncate_inode_pages_range);
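
/*
 * Example (sketch, not compiled): note the BUG_ON above - lend must point
 * at the last byte of a page.  A caller removing the whole pages
 * [first, last] would therefore pass the byte range computed below.
 */
#if 0
static void truncate_whole_pages(struct address_space *mapping,
                                 pgoff_t first, pgoff_t last)
{
        loff_t lstart = (loff_t)first << PAGE_CACHE_SHIFT;
        loff_t lend = ((loff_t)(last + 1) << PAGE_CACHE_SHIFT) - 1;

        truncate_inode_pages_range(mapping, lstart, lend);
}
#endif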

/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Called under (and serialised by) inode->i_mutex.
 */
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
        truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
}
EXPORT_SYMBOL(truncate_inode_pages);
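
/*
 * Example (sketch, not compiled): the classic caller - a filesystem's
 * inode-eviction path empties the page cache before the inode goes away.
 * myfs_evict_inode is hypothetical.
 */
#if 0
static void myfs_evict_inode(struct inode *inode)
{
        truncate_inode_pages(&inode->i_data, 0);
        /* ... release fs-private state, then end_writeback(inode) ... */
}
#endif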

/**
 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
 * @mapping: the address_space which holds the pages to invalidate
 * @start: the offset 'from' which to invalidate
 * @end: the offset 'to' which to invalidate (inclusive)
 *
 * This function only removes the unlocked pages; if you want to
 * remove all the pages of one inode, you must call truncate_inode_pages.
 *
 * invalidate_mapping_pages() will not block on IO activity. It will not
 * invalidate pages which are dirty, locked, under writeback or mapped into
 * pagetables.
 */
unsigned long invalidate_mapping_pages(struct address_space *mapping,
                                       pgoff_t start, pgoff_t end)
{
        struct pagevec pvec;
        pgoff_t next = start;
        unsigned long ret = 0;
        int i;

        pagevec_init(&pvec, 0);
        while (next <= end &&
               pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
                mem_cgroup_uncharge_start();
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];
                        pgoff_t index;
                        int lock_failed;

                        lock_failed = !trylock_page(page);

                        /*
                         * We really shouldn't be looking at the ->index of an
                         * unlocked page. But we're not allowed to lock these
                         * pages. So we rely upon nobody altering the ->index
                         * of this (pinned-by-us) page.
                         */
                        index = page->index;
                        if (index > next)
                                next = index;
                        next++;
                        if (lock_failed)
                                continue;

                        ret += invalidate_inode_page(page);

                        unlock_page(page);
                        if (next > end)
                                break;
                }
                pagevec_release(&pvec);
                mem_cgroup_uncharge_end();
                cond_resched();
        }
        return ret;
}
EXPORT_SYMBOL(invalidate_mapping_pages);
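
/*
 * Example (sketch, not compiled): the common "drop whatever you safely can"
 * call, in the style of /proc/sys/vm/drop_caches - pass the full page range
 * and accept that dirty, locked, mapped or under-writeback pages survive.
 */
#if 0
static unsigned long drop_clean_cache(struct address_space *mapping)
{
        /* returns the number of pages actually invalidated */
        return invalidate_mapping_pages(mapping, 0, -1);
}
#endif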

/*
 * This is like invalidate_complete_page(), except it ignores the page's
 * refcount. We do this because invalidate_inode_pages2() needs stronger
 * invalidation guarantees, and cannot afford to leave pages behind because
 * shrink_page_list() has a temp ref on them, or because they're transiently
 * sitting in the lru_cache_add() pagevecs.
 */
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
        if (page->mapping != mapping)
                return 0;

        if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
                return 0;

        spin_lock_irq(&mapping->tree_lock);
        if (PageDirty(page))
                goto failed;

        clear_page_mlock(page);
        BUG_ON(page_has_private(page));
        __remove_from_page_cache(page);
        spin_unlock_irq(&mapping->tree_lock);
        mem_cgroup_uncharge_cache_page(page);

        if (mapping->a_ops->freepage)
                mapping->a_ops->freepage(page);

        page_cache_release(page);       /* pagecache ref */
        return 1;
failed:
        spin_unlock_irq(&mapping->tree_lock);
        return 0;
}

static int do_launder_page(struct address_space *mapping, struct page *page)
{
        if (!PageDirty(page))
                return 0;
        if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
                return 0;
        return mapping->a_ops->launder_page(page);
}
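
/*
 * Example (sketch, not compiled): a hypothetical ->launder_page method,
 * showing the contract do_launder_page() relies on - it is called with the
 * page locked, must write the dirty data back while keeping the page
 * locked, and returns 0 or an error.  myfs_write_locked_page is an assumed
 * helper.
 */
#if 0
static int myfs_launder_page(struct page *page)
{
        return myfs_write_locked_page(page);    /* synchronous writeback */
}
#endif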

/**
 * invalidate_inode_pages2_range - remove range of pages from an address_space
 * @mapping: the address_space
 * @start: the page offset 'from' which to invalidate
 * @end: the page offset 'to' which to invalidate (inclusive)
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Returns -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2_range(struct address_space *mapping,
                                  pgoff_t start, pgoff_t end)
{
        struct pagevec pvec;
        pgoff_t next;
        int i;
        int ret = 0;
        int ret2 = 0;
        int did_range_unmap = 0;
        int wrapped = 0;

        pagevec_init(&pvec, 0);
        next = start;
        while (next <= end && !wrapped &&
                pagevec_lookup(&pvec, mapping, next,
                        min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
                mem_cgroup_uncharge_start();
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];
                        pgoff_t page_index;

                        lock_page(page);
                        if (page->mapping != mapping) {
                                unlock_page(page);
                                continue;
                        }
                        page_index = page->index;
                        next = page_index + 1;
                        if (next == 0)
                                wrapped = 1;
                        if (page_index > end) {
                                unlock_page(page);
                                break;
                        }
                        wait_on_page_writeback(page);
                        if (page_mapped(page)) {
                                if (!did_range_unmap) {
                                        /*
                                         * Zap the rest of the file in one hit.
                                         */
                                        unmap_mapping_range(mapping,
                                           (loff_t)page_index<<PAGE_CACHE_SHIFT,
                                           (loff_t)(end - page_index + 1)
                                                        << PAGE_CACHE_SHIFT,
                                            0);
                                        did_range_unmap = 1;
                                } else {
                                        /*
                                         * Just zap this page
                                         */
                                        unmap_mapping_range(mapping,
                                          (loff_t)page_index<<PAGE_CACHE_SHIFT,
                                          PAGE_CACHE_SIZE, 0);
                                }
                        }
                        BUG_ON(page_mapped(page));
                        ret2 = do_launder_page(mapping, page);
                        if (ret2 == 0) {
                                if (!invalidate_complete_page2(mapping, page))
                                        ret2 = -EBUSY;
                        }
                        if (ret2 < 0)
                                ret = ret2;
                        unlock_page(page);
                }
                pagevec_release(&pvec);
                mem_cgroup_uncharge_end();
                cond_resched();
        }
        return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);

/**
 * invalidate_inode_pages2 - remove all pages from an address_space
 * @mapping: the address_space
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Returns -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2(struct address_space *mapping)
{
        return invalidate_inode_pages2_range(mapping, 0, -1);
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
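
/*
 * Example (sketch, not compiled): the classic use in a direct-I/O write
 * path - after [pos, pos + count) has been written to disk, kill any cached
 * pages over that range so later buffered reads see the new data.
 */
#if 0
static int dio_invalidate_range(struct address_space *mapping,
                                loff_t pos, size_t count)
{
        return invalidate_inode_pages2_range(mapping,
                        pos >> PAGE_CACHE_SHIFT,
                        (pos + count - 1) >> PAGE_CACHE_SHIFT);
}
#endif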

/**
 * truncate_pagecache - unmap and remove pagecache that has been truncated
 * @inode: inode
 * @old: old file offset
 * @new: new file offset
 *
 * inode's new i_size must already be written before truncate_pagecache
 * is called.
 *
 * This function should typically be called before the filesystem
 * releases resources associated with the freed range (eg. deallocates
 * blocks). This way, pagecache will always stay logically coherent
 * with on-disk format, and the filesystem would not have to deal with
 * situations such as writepage being called for a page that has already
 * had its underlying blocks deallocated.
 */
void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
{
        struct address_space *mapping = inode->i_mapping;

        /*
         * unmap_mapping_range is called twice, first simply for
         * efficiency so that truncate_inode_pages does fewer
         * single-page unmaps.  However after this first call, and
         * before truncate_inode_pages finishes, it is possible for
         * private pages to be COWed, which remain after
         * truncate_inode_pages finishes, hence the second
         * unmap_mapping_range call must be made for correctness.
         */
        unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
        truncate_inode_pages(mapping, new);
        unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
}
EXPORT_SYMBOL(truncate_pagecache);
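
/*
 * Example (sketch, not compiled): the ordering the comment above
 * prescribes - i_size first, pagecache second, on-disk blocks last.
 * myfs_free_blocks is hypothetical.
 */
#if 0
static void myfs_truncate(struct inode *inode, loff_t newsize)
{
        loff_t oldsize = inode->i_size;

        i_size_write(inode, newsize);
        truncate_pagecache(inode, oldsize, newsize);
        myfs_free_blocks(inode, newsize);       /* fs-specific block truncation */
}
#endif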

/**
 * truncate_setsize - update inode and pagecache for a new file size
 * @inode: inode
 * @newsize: new file size
 *
 * truncate_setsize updates i_size and performs pagecache truncation (if
 * necessary) to @newsize. It will typically be called from the filesystem's
 * setattr function when ATTR_SIZE is passed in.
 *
 * Must be called with inode_mutex held and before all filesystem specific
 * block truncation has been performed.
 */
void truncate_setsize(struct inode *inode, loff_t newsize)
{
        loff_t oldsize;

        oldsize = inode->i_size;
        i_size_write(inode, newsize);

        truncate_pagecache(inode, oldsize, newsize);
}
EXPORT_SYMBOL(truncate_setsize);
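
/*
 * Example (sketch, not compiled): the typical caller named above - a
 * filesystem's ->setattr handles ATTR_SIZE by calling truncate_setsize()
 * and then its own block truncation.  myfs_truncate_blocks is hypothetical.
 */
#if 0
static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = dentry->d_inode;
        int error;

        error = inode_change_ok(inode, attr);
        if (error)
                return error;

        if (attr->ia_valid & ATTR_SIZE) {
                truncate_setsize(inode, attr->ia_size);
                myfs_truncate_blocks(inode, attr->ia_size);
        }
        setattr_copy(inode, attr);
        mark_inode_dirty(inode);
        return 0;
}
#endif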

/**
 * vmtruncate - unmap mappings "freed" by truncate() syscall
 * @inode: inode of the file used
 * @offset: file offset to start truncating
 *
 * This function is deprecated and truncate_setsize or truncate_pagecache
 * should be used instead, together with filesystem specific block truncation.
 */
int vmtruncate(struct inode *inode, loff_t offset)
{
        int error;

        error = inode_newsize_ok(inode, offset);
        if (error)
                return error;

        truncate_setsize(inode, offset);
        if (inode->i_op->truncate)
                inode->i_op->truncate(inode);
        return 0;
}
EXPORT_SYMBOL(vmtruncate);
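
/*
 * Example (sketch, not compiled): the replacement pattern for the
 * deprecated vmtruncate() - do the pagecache part with truncate_setsize()
 * and the on-disk part with a filesystem-specific helper.
 * myfs_truncate_blocks is hypothetical.
 */
#if 0
static int myfs_do_truncate(struct inode *inode, loff_t offset)
{
        int error = inode_newsize_ok(inode, offset);

        if (error)
                return error;

        truncate_setsize(inode, offset);
        myfs_truncate_blocks(inode, offset);
        return 0;
}
#endif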