xfs_aops.c

  1. /*
  2. * Copyright (c) 2000-2005 Silicon Graphics, Inc.
  3. * All Rights Reserved.
  4. *
  5. * This program is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU General Public License as
  7. * published by the Free Software Foundation.
  8. *
  9. * This program is distributed in the hope that it would be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program; if not, write the Free Software Foundation,
  16. * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  17. */
  18. #include "xfs.h"
  19. #include "xfs_bit.h"
  20. #include "xfs_log.h"
  21. #include "xfs_inum.h"
  22. #include "xfs_sb.h"
  23. #include "xfs_ag.h"
  24. #include "xfs_dir2.h"
  25. #include "xfs_trans.h"
  26. #include "xfs_dmapi.h"
  27. #include "xfs_mount.h"
  28. #include "xfs_bmap_btree.h"
  29. #include "xfs_alloc_btree.h"
  30. #include "xfs_ialloc_btree.h"
  31. #include "xfs_dir2_sf.h"
  32. #include "xfs_attr_sf.h"
  33. #include "xfs_dinode.h"
  34. #include "xfs_inode.h"
  35. #include "xfs_alloc.h"
  36. #include "xfs_btree.h"
  37. #include "xfs_error.h"
  38. #include "xfs_rw.h"
  39. #include "xfs_iomap.h"
  40. #include "xfs_vnodeops.h"
  41. #include <linux/mpage.h>
  42. #include <linux/pagevec.h>
  43. #include <linux/writeback.h>
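/*
 * Walk the buffer_heads attached to @page and note whether any of them
 * are delalloc, unwritten, or uptodate-but-unmapped.  The writepage and
 * releasepage paths use this to decide whether flushing the page would
 * require a transaction.
 */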
  44. STATIC void
  45. xfs_count_page_state(
  46. struct page *page,
  47. int *delalloc,
  48. int *unmapped,
  49. int *unwritten)
  50. {
  51. struct buffer_head *bh, *head;
  52. *delalloc = *unmapped = *unwritten = 0;
  53. bh = head = page_buffers(page);
  54. do {
  55. if (buffer_uptodate(bh) && !buffer_mapped(bh))
  56. (*unmapped) = 1;
  57. else if (buffer_unwritten(bh))
  58. (*unwritten) = 1;
  59. else if (buffer_delay(bh))
  60. (*delalloc) = 1;
  61. } while ((bh = bh->b_this_page) != head);
  62. }
  63. #if defined(XFS_RW_TRACE)
  64. void
  65. xfs_page_trace(
  66. int tag,
  67. struct inode *inode,
  68. struct page *page,
  69. unsigned long pgoff)
  70. {
  71. xfs_inode_t *ip;
  72. loff_t isize = i_size_read(inode);
  73. loff_t offset = page_offset(page);
  74. int delalloc = -1, unmapped = -1, unwritten = -1;
  75. if (page_has_buffers(page))
  76. xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
  77. ip = XFS_I(inode);
  78. if (!ip->i_rwtrace)
  79. return;
  80. ktrace_enter(ip->i_rwtrace,
  81. (void *)((unsigned long)tag),
  82. (void *)ip,
  83. (void *)inode,
  84. (void *)page,
  85. (void *)pgoff,
  86. (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
  87. (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
  88. (void *)((unsigned long)((isize >> 32) & 0xffffffff)),
  89. (void *)((unsigned long)(isize & 0xffffffff)),
  90. (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
  91. (void *)((unsigned long)(offset & 0xffffffff)),
  92. (void *)((unsigned long)delalloc),
  93. (void *)((unsigned long)unmapped),
  94. (void *)((unsigned long)unwritten),
  95. (void *)((unsigned long)current_pid()),
  96. (void *)NULL);
  97. }
  98. #else
  99. #define xfs_page_trace(tag, inode, page, pgoff)
  100. #endif
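/*
 * Return the block device that backs this inode's data: the realtime
 * device for realtime inodes, otherwise the main data device.
 */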
  101. STATIC struct block_device *
  102. xfs_find_bdev_for_inode(
  103. struct xfs_inode *ip)
  104. {
  105. struct xfs_mount *mp = ip->i_mount;
  106. if (XFS_IS_REALTIME_INODE(ip))
  107. return mp->m_rtdev_targp->bt_bdev;
  108. else
  109. return mp->m_ddev_targp->bt_bdev;
  110. }
  111. /*
  112. * Schedule IO completion handling on an xfsdatad if this was
  113. * the final hold on this ioend. If we are asked to wait,
  114. * flush the workqueue.
  115. */
  116. STATIC void
  117. xfs_finish_ioend(
  118. xfs_ioend_t *ioend,
  119. int wait)
  120. {
  121. if (atomic_dec_and_test(&ioend->io_remaining)) {
  122. queue_work(xfsdatad_workqueue, &ioend->io_work);
  123. if (wait)
  124. flush_workqueue(xfsdatad_workqueue);
  125. }
  126. }
  127. /*
  128. * We're now finished for good with this ioend structure.
  129. * Update the page state via the associated buffer_heads,
  130. * release holds on the inode and bio, and finally free
  131. * up memory. Do not use the ioend after this.
  132. */
  133. STATIC void
  134. xfs_destroy_ioend(
  135. xfs_ioend_t *ioend)
  136. {
  137. struct buffer_head *bh, *next;
  138. for (bh = ioend->io_buffer_head; bh; bh = next) {
  139. next = bh->b_private;
  140. bh->b_end_io(bh, !ioend->io_error);
  141. }
  142. if (unlikely(ioend->io_error)) {
  143. vn_ioerror(XFS_I(ioend->io_inode), ioend->io_error,
  144. __FILE__,__LINE__);
  145. }
  146. vn_iowake(XFS_I(ioend->io_inode));
  147. mempool_free(ioend, xfs_ioend_pool);
  148. }
  149. /*
  150. * Update on-disk file size now that data has been written to disk.
  151. * The current in-memory file size is i_size. If a write is beyond
  152. * eof i_new_size will be the intended file size until i_size is
  153. * updated. If this write does not extend all the way to the valid
  154. * file size then restrict this update to the end of the write.
  155. */
  156. STATIC void
  157. xfs_setfilesize(
  158. xfs_ioend_t *ioend)
  159. {
  160. xfs_inode_t *ip = XFS_I(ioend->io_inode);
  161. xfs_fsize_t isize;
  162. xfs_fsize_t bsize;
  163. ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
  164. ASSERT(ioend->io_type != IOMAP_READ);
  165. if (unlikely(ioend->io_error))
  166. return;
  167. bsize = ioend->io_offset + ioend->io_size;
  168. xfs_ilock(ip, XFS_ILOCK_EXCL);
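/*
 * Never move the on-disk size beyond the current in-core size (the
 * larger of i_size and i_new_size), and never beyond the end of the
 * range this ioend has written (bsize).
 */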
  169. isize = MAX(ip->i_size, ip->i_new_size);
  170. isize = MIN(isize, bsize);
  171. if (ip->i_d.di_size < isize) {
  172. ip->i_d.di_size = isize;
  173. ip->i_update_core = 1;
  174. ip->i_update_size = 1;
  175. mark_inode_dirty_sync(ioend->io_inode);
  176. }
  177. xfs_iunlock(ip, XFS_ILOCK_EXCL);
  178. }
  179. /*
  180. * Buffered IO write completion for delayed allocate extents.
  181. */
  182. STATIC void
  183. xfs_end_bio_delalloc(
  184. struct work_struct *work)
  185. {
  186. xfs_ioend_t *ioend =
  187. container_of(work, xfs_ioend_t, io_work);
  188. xfs_setfilesize(ioend);
  189. xfs_destroy_ioend(ioend);
  190. }
  191. /*
  192. * Buffered IO write completion for regular, written extents.
  193. */
  194. STATIC void
  195. xfs_end_bio_written(
  196. struct work_struct *work)
  197. {
  198. xfs_ioend_t *ioend =
  199. container_of(work, xfs_ioend_t, io_work);
  200. xfs_setfilesize(ioend);
  201. xfs_destroy_ioend(ioend);
  202. }
  203. /*
  204. * IO write completion for unwritten extents.
  205. *
  206. * Issue transactions to convert a buffer range from unwritten
  207. * to written extents.
  208. */
  209. STATIC void
  210. xfs_end_bio_unwritten(
  211. struct work_struct *work)
  212. {
  213. xfs_ioend_t *ioend =
  214. container_of(work, xfs_ioend_t, io_work);
  215. struct xfs_inode *ip = XFS_I(ioend->io_inode);
  216. xfs_off_t offset = ioend->io_offset;
  217. size_t size = ioend->io_size;
  218. if (likely(!ioend->io_error)) {
  219. if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
  220. int error;
  221. error = xfs_iomap_write_unwritten(ip, offset, size);
  222. if (error)
  223. ioend->io_error = error;
  224. }
  225. xfs_setfilesize(ioend);
  226. }
  227. xfs_destroy_ioend(ioend);
  228. }
  229. /*
  230. * IO read completion for regular, written extents.
  231. */
  232. STATIC void
  233. xfs_end_bio_read(
  234. struct work_struct *work)
  235. {
  236. xfs_ioend_t *ioend =
  237. container_of(work, xfs_ioend_t, io_work);
  238. xfs_destroy_ioend(ioend);
  239. }
  240. /*
  241. * Allocate and initialise an IO completion structure.
  242. * We need to track unwritten extent write completion here initially.
  243. * We'll need to extend this for updating the ondisk inode size later
  244. * (vs. incore size).
  245. */
  246. STATIC xfs_ioend_t *
  247. xfs_alloc_ioend(
  248. struct inode *inode,
  249. unsigned int type)
  250. {
  251. xfs_ioend_t *ioend;
  252. ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
  253. /*
  254. * Set the count to 1 initially, so that the I/O completion
  255. * callback cannot call the completion routine before we have
  256. * started all of the I/O.
  257. */
  258. atomic_set(&ioend->io_remaining, 1);
  259. ioend->io_error = 0;
  260. ioend->io_list = NULL;
  261. ioend->io_type = type;
  262. ioend->io_inode = inode;
  263. ioend->io_buffer_head = NULL;
  264. ioend->io_buffer_tail = NULL;
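/*
 * Take an I/O reference on the inode for the life of this ioend; it is
 * dropped again via vn_iowake() in xfs_destroy_ioend() and
 * xfs_cancel_ioend().
 */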
  265. atomic_inc(&XFS_I(ioend->io_inode)->i_iocount);
  266. ioend->io_offset = 0;
  267. ioend->io_size = 0;
  268. if (type == IOMAP_UNWRITTEN)
  269. INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten);
  270. else if (type == IOMAP_DELAY)
  271. INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc);
  272. else if (type == IOMAP_READ)
  273. INIT_WORK(&ioend->io_work, xfs_end_bio_read);
  274. else
  275. INIT_WORK(&ioend->io_work, xfs_end_bio_written);
  276. return ioend;
  277. }
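/*
 * Map a range of the file to disk blocks via xfs_iomap().  Note that
 * xfs_iomap() hands back a positive XFS error code, hence the negation
 * before returning to generic code that expects negative errnos.
 */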
  278. STATIC int
  279. xfs_map_blocks(
  280. struct inode *inode,
  281. loff_t offset,
  282. ssize_t count,
  283. xfs_iomap_t *mapp,
  284. int flags)
  285. {
  286. xfs_inode_t *ip = XFS_I(inode);
  287. int error, nmaps = 1;
  288. error = xfs_iomap(ip, offset, count,
  289. flags, mapp, &nmaps);
  290. if (!error && (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)))
  291. xfs_iflags_set(ip, XFS_IMODIFIED);
  292. return -error;
  293. }
  294. STATIC_INLINE int
  295. xfs_iomap_valid(
  296. xfs_iomap_t *iomapp,
  297. loff_t offset)
  298. {
  299. return offset >= iomapp->iomap_offset &&
  300. offset < iomapp->iomap_offset + iomapp->iomap_bsize;
  301. }
  302. /*
  303. * BIO completion handler for buffered IO.
  304. */
  305. STATIC void
  306. xfs_end_bio(
  307. struct bio *bio,
  308. int error)
  309. {
  310. xfs_ioend_t *ioend = bio->bi_private;
  311. ASSERT(atomic_read(&bio->bi_cnt) >= 1);
  312. ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
  313. /* Toss bio and pass work off to an xfsdatad thread */
  314. bio->bi_private = NULL;
  315. bio->bi_end_io = NULL;
  316. bio_put(bio);
  317. xfs_finish_ioend(ioend, 0);
  318. }
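/*
 * Attach a bio to its ioend, take another I/O reference on the ioend and
 * hand the bio to the block layer.  The extra bio reference taken in
 * xfs_alloc_ioend_bio() allows the BIO_EOPNOTSUPP check after submit_bio()
 * before that reference is dropped here.
 */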
  319. STATIC void
  320. xfs_submit_ioend_bio(
  321. xfs_ioend_t *ioend,
  322. struct bio *bio)
  323. {
  324. atomic_inc(&ioend->io_remaining);
  325. bio->bi_private = ioend;
  326. bio->bi_end_io = xfs_end_bio;
  327. submit_bio(WRITE, bio);
  328. ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
  329. bio_put(bio);
  330. }
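/*
 * Allocate a bio sized for the device limits, halving the vector count
 * and retrying until the allocation succeeds.
 */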
  331. STATIC struct bio *
  332. xfs_alloc_ioend_bio(
  333. struct buffer_head *bh)
  334. {
  335. struct bio *bio;
  336. int nvecs = bio_get_nr_vecs(bh->b_bdev);
  337. do {
  338. bio = bio_alloc(GFP_NOIO, nvecs);
  339. nvecs >>= 1;
  340. } while (!bio);
  341. ASSERT(bio->bi_private == NULL);
  342. bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
  343. bio->bi_bdev = bh->b_bdev;
  344. bio_get(bio);
  345. return bio;
  346. }
  347. STATIC void
  348. xfs_start_buffer_writeback(
  349. struct buffer_head *bh)
  350. {
  351. ASSERT(buffer_mapped(bh));
  352. ASSERT(buffer_locked(bh));
  353. ASSERT(!buffer_delay(bh));
  354. ASSERT(!buffer_unwritten(bh));
  355. mark_buffer_async_write(bh);
  356. set_buffer_uptodate(bh);
  357. clear_buffer_dirty(bh);
  358. }
  359. STATIC void
  360. xfs_start_page_writeback(
  361. struct page *page,
  362. int clear_dirty,
  363. int buffers)
  364. {
  365. ASSERT(PageLocked(page));
  366. ASSERT(!PageWriteback(page));
  367. if (clear_dirty)
  368. clear_page_dirty_for_io(page);
  369. set_page_writeback(page);
  370. unlock_page(page);
  371. /* If no buffers on the page are to be written, finish it here */
  372. if (!buffers)
  373. end_page_writeback(page);
  374. }
  375. static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
  376. {
  377. return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
  378. }
  379. /*
  380. * Submit all of the bios for all of the ioends we have saved up, covering the
  381. * initial writepage page and also any probed pages.
  382. *
  383. * Because we may have multiple ioends spanning a page, we need to start
  384. * writeback on all the buffers before we submit them for I/O. If we mark the
  385. * buffers as we go, then we can end up with a page that only has some
  386. * buffers marked async write, and I/O completion on it can occur before
  387. * we mark the other buffers async write.
  388. *
  389. * The end result of this is that we trip a bug in end_page_writeback() because
  390. * we call it twice for the one page as the code in end_buffer_async_write()
  391. * assumes that all buffers on the page are started at the same time.
  392. *
  393. * The fix is two passes across the ioend list - one to start writeback on the
  394. * buffer_heads, and then submit them for I/O on the second pass.
  395. */
  396. STATIC void
  397. xfs_submit_ioend(
  398. xfs_ioend_t *ioend)
  399. {
  400. xfs_ioend_t *head = ioend;
  401. xfs_ioend_t *next;
  402. struct buffer_head *bh;
  403. struct bio *bio;
  404. sector_t lastblock = 0;
  405. /* Pass 1 - start writeback */
  406. do {
  407. next = ioend->io_list;
  408. for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
  409. xfs_start_buffer_writeback(bh);
  410. }
  411. } while ((ioend = next) != NULL);
  412. /* Pass 2 - submit I/O */
  413. ioend = head;
  414. do {
  415. next = ioend->io_list;
  416. bio = NULL;
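/*
 * Add buffers to the current bio while they stay physically
 * contiguous; start a new bio whenever a buffer is not adjacent
 * to the previous one on disk or the current bio is full.
 */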
  417. for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
  418. if (!bio) {
  419. retry:
  420. bio = xfs_alloc_ioend_bio(bh);
  421. } else if (bh->b_blocknr != lastblock + 1) {
  422. xfs_submit_ioend_bio(ioend, bio);
  423. goto retry;
  424. }
  425. if (bio_add_buffer(bio, bh) != bh->b_size) {
  426. xfs_submit_ioend_bio(ioend, bio);
  427. goto retry;
  428. }
  429. lastblock = bh->b_blocknr;
  430. }
  431. if (bio)
  432. xfs_submit_ioend_bio(ioend, bio);
  433. xfs_finish_ioend(ioend, 0);
  434. } while ((ioend = next) != NULL);
  435. }
  436. /*
  437. * Cancel submission of all buffer_heads so far in this ioend.
  438. * Toss the ioend too. Only ever called for the initial page
  439. * in a writepage request, so only ever one page.
  440. */
  441. STATIC void
  442. xfs_cancel_ioend(
  443. xfs_ioend_t *ioend)
  444. {
  445. xfs_ioend_t *next;
  446. struct buffer_head *bh, *next_bh;
  447. do {
  448. next = ioend->io_list;
  449. bh = ioend->io_buffer_head;
  450. do {
  451. next_bh = bh->b_private;
  452. clear_buffer_async_write(bh);
  453. unlock_buffer(bh);
  454. } while ((bh = next_bh) != NULL);
  455. vn_iowake(XFS_I(ioend->io_inode));
  456. mempool_free(ioend, xfs_ioend_pool);
  457. } while ((ioend = next) != NULL);
  458. }
  459. /*
  460. * Test to see if we've been building up a completion structure for
  461. * earlier buffers -- if so, we try to append to this ioend if we
  462. * can, otherwise we finish off any current ioend and start another.
  463. * The ioend being built is returned via *result.
  464. */
  465. STATIC void
  466. xfs_add_to_ioend(
  467. struct inode *inode,
  468. struct buffer_head *bh,
  469. xfs_off_t offset,
  470. unsigned int type,
  471. xfs_ioend_t **result,
  472. int need_ioend)
  473. {
  474. xfs_ioend_t *ioend = *result;
  475. if (!ioend || need_ioend || type != ioend->io_type) {
  476. xfs_ioend_t *previous = *result;
  477. ioend = xfs_alloc_ioend(inode, type);
  478. ioend->io_offset = offset;
  479. ioend->io_buffer_head = bh;
  480. ioend->io_buffer_tail = bh;
  481. if (previous)
  482. previous->io_list = ioend;
  483. *result = ioend;
  484. } else {
  485. ioend->io_buffer_tail->b_private = bh;
  486. ioend->io_buffer_tail = bh;
  487. }
  488. bh->b_private = NULL;
  489. ioend->io_size += bh->b_size;
  490. }
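/*
 * Fill in the buffer_head's disk mapping from the iomap.  iomap_bn is in
 * 512-byte basic blocks (BBSHIFT), so convert it to the buffer's block
 * size and add the block offset of @offset within the mapping.
 */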
  491. STATIC void
  492. xfs_map_buffer(
  493. struct buffer_head *bh,
  494. xfs_iomap_t *mp,
  495. xfs_off_t offset,
  496. uint block_bits)
  497. {
  498. sector_t bn;
  499. ASSERT(mp->iomap_bn != IOMAP_DADDR_NULL);
  500. bn = (mp->iomap_bn >> (block_bits - BBSHIFT)) +
  501. ((offset - mp->iomap_offset) >> block_bits);
  502. ASSERT(bn || (mp->iomap_flags & IOMAP_REALTIME));
  503. bh->b_blocknr = bn;
  504. set_buffer_mapped(bh);
  505. }
  506. STATIC void
  507. xfs_map_at_offset(
  508. struct buffer_head *bh,
  509. loff_t offset,
  510. int block_bits,
  511. xfs_iomap_t *iomapp)
  512. {
  513. ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE));
  514. ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY));
  515. lock_buffer(bh);
  516. xfs_map_buffer(bh, iomapp, offset, block_bits);
  517. bh->b_bdev = iomapp->iomap_target->bt_bdev;
  518. set_buffer_mapped(bh);
  519. clear_buffer_delay(bh);
  520. clear_buffer_unwritten(bh);
  521. }
  522. /*
  523. * Look for a page at index that is suitable for clustering.
  524. */
  525. STATIC unsigned int
  526. xfs_probe_page(
  527. struct page *page,
  528. unsigned int pg_offset,
  529. int mapped)
  530. {
  531. int ret = 0;
  532. if (PageWriteback(page))
  533. return 0;
  534. if (page->mapping && PageDirty(page)) {
  535. if (page_has_buffers(page)) {
  536. struct buffer_head *bh, *head;
  537. bh = head = page_buffers(page);
  538. do {
  539. if (!buffer_uptodate(bh))
  540. break;
  541. if (mapped != buffer_mapped(bh))
  542. break;
  543. ret += bh->b_size;
  544. if (ret >= pg_offset)
  545. break;
  546. } while ((bh = bh->b_this_page) != head);
  547. } else
  548. ret = mapped ? 0 : PAGE_CACHE_SIZE;
  549. }
  550. return ret;
  551. }
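/*
 * Count how many bytes, starting at @bh and continuing into up to 64
 * following pages, are in the same (mapped or unmapped) state and ready
 * for writeout.  The caller uses the result to size a single mapping
 * request instead of mapping block by block.
 */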
  552. STATIC size_t
  553. xfs_probe_cluster(
  554. struct inode *inode,
  555. struct page *startpage,
  556. struct buffer_head *bh,
  557. struct buffer_head *head,
  558. int mapped)
  559. {
  560. struct pagevec pvec;
  561. pgoff_t tindex, tlast, tloff;
  562. size_t total = 0;
  563. int done = 0, i;
  564. /* First sum forwards in this page */
  565. do {
  566. if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh)))
  567. return total;
  568. total += bh->b_size;
  569. } while ((bh = bh->b_this_page) != head);
  570. /* if we reached the end of the page, sum forwards in following pages */
  571. tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
  572. tindex = startpage->index + 1;
  573. /* Prune this back to avoid pathological behavior */
  574. tloff = min(tlast, startpage->index + 64);
  575. pagevec_init(&pvec, 0);
  576. while (!done && tindex <= tloff) {
  577. unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
  578. if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
  579. break;
  580. for (i = 0; i < pagevec_count(&pvec); i++) {
  581. struct page *page = pvec.pages[i];
  582. size_t pg_offset, pg_len = 0;
  583. if (tindex == tlast) {
  584. pg_offset =
  585. i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
  586. if (!pg_offset) {
  587. done = 1;
  588. break;
  589. }
  590. } else
  591. pg_offset = PAGE_CACHE_SIZE;
  592. if (page->index == tindex && trylock_page(page)) {
  593. pg_len = xfs_probe_page(page, pg_offset, mapped);
  594. unlock_page(page);
  595. }
  596. if (!pg_len) {
  597. done = 1;
  598. break;
  599. }
  600. total += pg_len;
  601. tindex++;
  602. }
  603. pagevec_release(&pvec);
  604. cond_resched();
  605. }
  606. return total;
  607. }
  608. /*
  609. * Test if a given page is suitable for writing as part of an unwritten
  610. * or delayed allocate extent.
  611. */
  612. STATIC int
  613. xfs_is_delayed_page(
  614. struct page *page,
  615. unsigned int type)
  616. {
  617. if (PageWriteback(page))
  618. return 0;
  619. if (page->mapping && page_has_buffers(page)) {
  620. struct buffer_head *bh, *head;
  621. int acceptable = 0;
  622. bh = head = page_buffers(page);
  623. do {
  624. if (buffer_unwritten(bh))
  625. acceptable = (type == IOMAP_UNWRITTEN);
  626. else if (buffer_delay(bh))
  627. acceptable = (type == IOMAP_DELAY);
  628. else if (buffer_dirty(bh) && buffer_mapped(bh))
  629. acceptable = (type == IOMAP_NEW);
  630. else
  631. break;
  632. } while ((bh = bh->b_this_page) != head);
  633. if (acceptable)
  634. return 1;
  635. }
  636. return 0;
  637. }
  638. /*
  639. * Allocate & map buffers for page given the extent map, then write it out.
  640. * Except for the original page of a writepage, this is called on
  641. * delalloc/unwritten pages only; for the original page it is possible
  642. * that the page has no mapping at all.
  643. */
  644. STATIC int
  645. xfs_convert_page(
  646. struct inode *inode,
  647. struct page *page,
  648. loff_t tindex,
  649. xfs_iomap_t *mp,
  650. xfs_ioend_t **ioendp,
  651. struct writeback_control *wbc,
  652. int startio,
  653. int all_bh)
  654. {
  655. struct buffer_head *bh, *head;
  656. xfs_off_t end_offset;
  657. unsigned long p_offset;
  658. unsigned int type;
  659. int bbits = inode->i_blkbits;
  660. int len, page_dirty;
  661. int count = 0, done = 0, uptodate = 1;
  662. xfs_off_t offset = page_offset(page);
  663. if (page->index != tindex)
  664. goto fail;
  665. if (!trylock_page(page))
  666. goto fail;
  667. if (PageWriteback(page))
  668. goto fail_unlock_page;
  669. if (page->mapping != inode->i_mapping)
  670. goto fail_unlock_page;
  671. if (!xfs_is_delayed_page(page, (*ioendp)->io_type))
  672. goto fail_unlock_page;
  673. /*
  674. * page_dirty is initially a count of buffers on the page before
  675. * EOF and is decremented as we move each into a cleanable state.
  676. *
  677. * Derivation:
  678. *
  679. * End offset is the highest offset that this page should represent.
  680. * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
  681. * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
  682. * hence give us the correct page_dirty count. On any other page,
  683. * it will be zero and in that case we need page_dirty to be the
  684. * count of buffers on the page.
  685. */
  686. end_offset = min_t(unsigned long long,
  687. (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
  688. i_size_read(inode));
  689. len = 1 << inode->i_blkbits;
  690. p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
  691. PAGE_CACHE_SIZE);
  692. p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
  693. page_dirty = p_offset / len;
  694. bh = head = page_buffers(page);
  695. do {
  696. if (offset >= end_offset)
  697. break;
  698. if (!buffer_uptodate(bh))
  699. uptodate = 0;
  700. if (!(PageUptodate(page) || buffer_uptodate(bh))) {
  701. done = 1;
  702. continue;
  703. }
  704. if (buffer_unwritten(bh) || buffer_delay(bh)) {
  705. if (buffer_unwritten(bh))
  706. type = IOMAP_UNWRITTEN;
  707. else
  708. type = IOMAP_DELAY;
  709. if (!xfs_iomap_valid(mp, offset)) {
  710. done = 1;
  711. continue;
  712. }
  713. ASSERT(!(mp->iomap_flags & IOMAP_HOLE));
  714. ASSERT(!(mp->iomap_flags & IOMAP_DELAY));
  715. xfs_map_at_offset(bh, offset, bbits, mp);
  716. if (startio) {
  717. xfs_add_to_ioend(inode, bh, offset,
  718. type, ioendp, done);
  719. } else {
  720. set_buffer_dirty(bh);
  721. unlock_buffer(bh);
  722. mark_buffer_dirty(bh);
  723. }
  724. page_dirty--;
  725. count++;
  726. } else {
  727. type = IOMAP_NEW;
  728. if (buffer_mapped(bh) && all_bh && startio) {
  729. lock_buffer(bh);
  730. xfs_add_to_ioend(inode, bh, offset,
  731. type, ioendp, done);
  732. count++;
  733. page_dirty--;
  734. } else {
  735. done = 1;
  736. }
  737. }
  738. } while (offset += len, (bh = bh->b_this_page) != head);
  739. if (uptodate && bh == head)
  740. SetPageUptodate(page);
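/*
 * Account this page against the writeback control and stop
 * clustering further pages if the backing device is congested or
 * we have used up our writeback quota.
 */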
  741. if (startio) {
  742. if (count) {
  743. struct backing_dev_info *bdi;
  744. bdi = inode->i_mapping->backing_dev_info;
  745. wbc->nr_to_write--;
  746. if (bdi_write_congested(bdi)) {
  747. wbc->encountered_congestion = 1;
  748. done = 1;
  749. } else if (wbc->nr_to_write <= 0) {
  750. done = 1;
  751. }
  752. }
  753. xfs_start_page_writeback(page, !page_dirty, count);
  754. }
  755. return done;
  756. fail_unlock_page:
  757. unlock_page(page);
  758. fail:
  759. return 1;
  760. }
  761. /*
  762. * Convert & write out a cluster of pages in the same extent as defined
  763. * by mp and following the start page.
  764. */
  765. STATIC void
  766. xfs_cluster_write(
  767. struct inode *inode,
  768. pgoff_t tindex,
  769. xfs_iomap_t *iomapp,
  770. xfs_ioend_t **ioendp,
  771. struct writeback_control *wbc,
  772. int startio,
  773. int all_bh,
  774. pgoff_t tlast)
  775. {
  776. struct pagevec pvec;
  777. int done = 0, i;
  778. pagevec_init(&pvec, 0);
  779. while (!done && tindex <= tlast) {
  780. unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
  781. if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
  782. break;
  783. for (i = 0; i < pagevec_count(&pvec); i++) {
  784. done = xfs_convert_page(inode, pvec.pages[i], tindex++,
  785. iomapp, ioendp, wbc, startio, all_bh);
  786. if (done)
  787. break;
  788. }
  789. pagevec_release(&pvec);
  790. cond_resched();
  791. }
  792. }
  793. /*
  794. * Calling this without startio set means we are being asked to make a dirty
  795. * page ready for freeing its buffers. When called with startio set then
  796. * we are coming from writepage.
  797. *
  798. * When called with startio set it is important that we write the WHOLE
  799. * page if possible.
  800. * The bh->b_state flags cannot tell us which blocks, if any, are dirty
  801. * due to mmap writes, and therefore buffer uptodate state is only valid
  802. * if the page itself isn't completely uptodate. Some layers may clear
  803. * the page dirty flag prior to calling writepage, under the assumption
  804. * that the entire page will be written out; by not writing out the whole
  805. * page, the page can be reused before all valid dirty data is written
  806. * out. Note: in the case of a page that has been dirtied by mmap write
  807. * but only partially set up by block_prepare_write, the bh->b_state
  808. * flags will not agree, and only the buffers set up by block_prepare_write/
  809. * block_commit_write will have valid state; hence the whole page must be written out.
  810. */
  811. STATIC int
  812. xfs_page_state_convert(
  813. struct inode *inode,
  814. struct page *page,
  815. struct writeback_control *wbc,
  816. int startio,
  817. int unmapped) /* also implies page uptodate */
  818. {
  819. struct buffer_head *bh, *head;
  820. xfs_iomap_t iomap;
  821. xfs_ioend_t *ioend = NULL, *iohead = NULL;
  822. loff_t offset;
  823. unsigned long p_offset = 0;
  824. unsigned int type;
  825. __uint64_t end_offset;
  826. pgoff_t end_index, last_index, tlast;
  827. ssize_t size, len;
  828. int flags, err, iomap_valid = 0, uptodate = 1;
  829. int page_dirty, count = 0;
  830. int trylock = 0;
  831. int all_bh = unmapped;
  832. if (startio) {
  833. if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
  834. trylock |= BMAPI_TRYLOCK;
  835. }
  836. /* Is this page beyond the end of the file? */
  837. offset = i_size_read(inode);
  838. end_index = offset >> PAGE_CACHE_SHIFT;
  839. last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
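/*
 * There is nothing to write if the page lies entirely beyond EOF:
 * either it is past the page containing EOF, or it is that page but
 * the file size ends exactly on the preceding page boundary.
 */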
  840. if (page->index >= end_index) {
  841. if ((page->index >= end_index + 1) ||
  842. !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
  843. if (startio)
  844. unlock_page(page);
  845. return 0;
  846. }
  847. }
  848. /*
  849. * page_dirty is initially a count of buffers on the page before
  850. * EOF and is decremented as we move each into a cleanable state.
  851. *
  852. * Derivation:
  853. *
  854. * End offset is the highest offset that this page should represent.
  855. * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
  856. * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
  857. * hence give us the correct page_dirty count. On any other page,
  858. * it will be zero and in that case we need page_dirty to be the
  859. * count of buffers on the page.
  860. */
  861. end_offset = min_t(unsigned long long,
  862. (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset);
  863. len = 1 << inode->i_blkbits;
  864. p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
  865. PAGE_CACHE_SIZE);
  866. p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
  867. page_dirty = p_offset / len;
  868. bh = head = page_buffers(page);
  869. offset = page_offset(page);
  870. flags = BMAPI_READ;
  871. type = IOMAP_NEW;
  872. /* TODO: cleanup count and page_dirty */
  873. do {
  874. if (offset >= end_offset)
  875. break;
  876. if (!buffer_uptodate(bh))
  877. uptodate = 0;
  878. if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) {
  879. /*
  880. * the iomap is actually still valid, but the ioend
  881. * isn't. shouldn't happen too often.
  882. */
  883. iomap_valid = 0;
  884. continue;
  885. }
  886. if (iomap_valid)
  887. iomap_valid = xfs_iomap_valid(&iomap, offset);
  888. /*
  889. * First case, map an unwritten extent and prepare for
  890. * extent state conversion transaction on completion.
  891. *
  892. * Second case, allocate space for a delalloc buffer.
  893. * We can return EAGAIN here in the release page case.
  894. *
  895. * Third case, an unmapped buffer was found, and we are
  896. * in a path where we need to write the whole page out.
  897. */
  898. if (buffer_unwritten(bh) || buffer_delay(bh) ||
  899. ((buffer_uptodate(bh) || PageUptodate(page)) &&
  900. !buffer_mapped(bh) && (unmapped || startio))) {
  901. int new_ioend = 0;
  902. /*
  903. * Make sure we don't use a read-only iomap
  904. */
  905. if (flags == BMAPI_READ)
  906. iomap_valid = 0;
  907. if (buffer_unwritten(bh)) {
  908. type = IOMAP_UNWRITTEN;
  909. flags = BMAPI_WRITE | BMAPI_IGNSTATE;
  910. } else if (buffer_delay(bh)) {
  911. type = IOMAP_DELAY;
  912. flags = BMAPI_ALLOCATE | trylock;
  913. } else {
  914. type = IOMAP_NEW;
  915. flags = BMAPI_WRITE | BMAPI_MMAP;
  916. }
  917. if (!iomap_valid) {
  918. /*
  919. * if we didn't have a valid mapping then we
  920. * need to ensure that we put the new mapping
  921. * in a new ioend structure. This needs to be
  922. * done to ensure that the ioends correctly
  923. * reflect the block mappings at io completion
  924. * for unwritten extent conversion.
  925. */
  926. new_ioend = 1;
  927. if (type == IOMAP_NEW) {
  928. size = xfs_probe_cluster(inode,
  929. page, bh, head, 0);
  930. } else {
  931. size = len;
  932. }
  933. err = xfs_map_blocks(inode, offset, size,
  934. &iomap, flags);
  935. if (err)
  936. goto error;
  937. iomap_valid = xfs_iomap_valid(&iomap, offset);
  938. }
  939. if (iomap_valid) {
  940. xfs_map_at_offset(bh, offset,
  941. inode->i_blkbits, &iomap);
  942. if (startio) {
  943. xfs_add_to_ioend(inode, bh, offset,
  944. type, &ioend,
  945. new_ioend);
  946. } else {
  947. set_buffer_dirty(bh);
  948. unlock_buffer(bh);
  949. mark_buffer_dirty(bh);
  950. }
  951. page_dirty--;
  952. count++;
  953. }
  954. } else if (buffer_uptodate(bh) && startio) {
  955. /*
  956. * we got here because the buffer is already mapped.
  957. * That means it must already have extents allocated
  958. * underneath it. Map the extent by reading it.
  959. */
  960. if (!iomap_valid || flags != BMAPI_READ) {
  961. flags = BMAPI_READ;
  962. size = xfs_probe_cluster(inode, page, bh,
  963. head, 1);
  964. err = xfs_map_blocks(inode, offset, size,
  965. &iomap, flags);
  966. if (err)
  967. goto error;
  968. iomap_valid = xfs_iomap_valid(&iomap, offset);
  969. }
  970. /*
  971. * We set the type to IOMAP_NEW in case we are doing a
  972. * small write at EOF that is extending the file but
  973. * without needing an allocation. We need to update the
  974. * file size on I/O completion in this case so it is
  975. * the same case as having just allocated a new extent
  976. * that we are writing into for the first time.
  977. */
  978. type = IOMAP_NEW;
  979. if (trylock_buffer(bh)) {
  980. ASSERT(buffer_mapped(bh));
  981. if (iomap_valid)
  982. all_bh = 1;
  983. xfs_add_to_ioend(inode, bh, offset, type,
  984. &ioend, !iomap_valid);
  985. page_dirty--;
  986. count++;
  987. } else {
  988. iomap_valid = 0;
  989. }
  990. } else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
  991. (unmapped || startio)) {
  992. iomap_valid = 0;
  993. }
  994. if (!iohead)
  995. iohead = ioend;
  996. } while (offset += len, ((bh = bh->b_this_page) != head));
  997. if (uptodate && bh == head)
  998. SetPageUptodate(page);
  999. if (startio)
  1000. xfs_start_page_writeback(page, 1, count);
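/*
 * If the last mapping we built extends beyond this page, push out
 * the remaining pages it covers as well, up to the last page of
 * the file.
 */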
  1001. if (ioend && iomap_valid) {
  1002. offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >>
  1003. PAGE_CACHE_SHIFT;
  1004. tlast = min_t(pgoff_t, offset, last_index);
  1005. xfs_cluster_write(inode, page->index + 1, &iomap, &ioend,
  1006. wbc, startio, all_bh, tlast);
  1007. }
  1008. if (iohead)
  1009. xfs_submit_ioend(iohead);
  1010. return page_dirty;
  1011. error:
  1012. if (iohead)
  1013. xfs_cancel_ioend(iohead);
  1014. /*
  1015. * If it's delalloc and we have nowhere to put it,
  1016. * throw it away, unless the lower layers told
  1017. * us to try again.
  1018. */
  1019. if (err != -EAGAIN) {
  1020. if (!unmapped)
  1021. block_invalidatepage(page, 0);
  1022. ClearPageUptodate(page);
  1023. }
  1024. return err;
  1025. }
  1026. /*
  1027. * writepage: Called from one of two places:
  1028. *
  1029. * 1. we are flushing a delalloc buffer head.
  1030. *
  1031. * 2. we are writing out a dirty page. Typically the page dirty
  1032. * state is cleared before we get here. In this case it is
  1033. * conceivable we have no buffer heads.
  1034. *
  1035. * For delalloc space on the page we need to allocate space and
  1036. * flush it. For unmapped buffer heads on the page we should
  1037. * allocate space if the page is uptodate. For any other dirty
  1038. * buffer heads on the page we should flush them.
  1039. *
  1040. * If we detect that a transaction would be required to flush
  1041. * the page, we have to check the process flags first: if we
  1042. * are already in a transaction, or disk I/O during allocation
  1043. * is not allowed, we need to fail the writepage and redirty the page.
  1044. */
  1045. STATIC int
  1046. xfs_vm_writepage(
  1047. struct page *page,
  1048. struct writeback_control *wbc)
  1049. {
  1050. int error;
  1051. int need_trans;
  1052. int delalloc, unmapped, unwritten;
  1053. struct inode *inode = page->mapping->host;
  1054. xfs_page_trace(XFS_WRITEPAGE_ENTER, inode, page, 0);
  1055. /*
  1056. * We need a transaction if:
  1057. * 1. There are delalloc buffers on the page
  1058. * 2. The page is uptodate and we have unmapped buffers
  1059. * 3. The page is uptodate and we have no buffers
  1060. * 4. There are unwritten buffers on the page
  1061. */
  1062. if (!page_has_buffers(page)) {
  1063. unmapped = 1;
  1064. need_trans = 1;
  1065. } else {
  1066. xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
  1067. if (!PageUptodate(page))
  1068. unmapped = 0;
  1069. need_trans = delalloc + unmapped + unwritten;
  1070. }
  1071. /*
  1072. * If we need a transaction and the process flags say
  1073. * we are already in a transaction, or no IO is allowed
  1074. * then mark the page dirty again and leave the page
  1075. * as is.
  1076. */
  1077. if (current_test_flags(PF_FSTRANS) && need_trans)
  1078. goto out_fail;
  1079. /*
  1080. * Delay hooking up buffer heads until we have
  1081. * made our go/no-go decision.
  1082. */
  1083. if (!page_has_buffers(page))
  1084. create_empty_buffers(page, 1 << inode->i_blkbits, 0);
  1085. /*
  1086. * Convert delayed allocate, unwritten or unmapped space
  1087. * to real space and flush out to disk.
  1088. */
  1089. error = xfs_page_state_convert(inode, page, wbc, 1, unmapped);
  1090. if (error == -EAGAIN)
  1091. goto out_fail;
  1092. if (unlikely(error < 0))
  1093. goto out_unlock;
  1094. return 0;
  1095. out_fail:
  1096. redirty_page_for_writepage(wbc, page);
  1097. unlock_page(page);
  1098. return 0;
  1099. out_unlock:
  1100. unlock_page(page);
  1101. return error;
  1102. }
  1103. STATIC int
  1104. xfs_vm_writepages(
  1105. struct address_space *mapping,
  1106. struct writeback_control *wbc)
  1107. {
  1108. xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
  1109. return generic_writepages(mapping, wbc);
  1110. }
  1111. /*
  1112. * Called to move a page into cleanable state - and from there
  1113. * to be released. Possibly the page is already clean. We always
  1114. * have buffer heads in this call.
  1115. *
  1116. * Returns 0 if the page is ok to release, 1 otherwise.
  1117. *
  1118. * Possible scenarios are:
  1119. *
  1120. * 1. We are being called to release a page which has been written
  1121. * to via regular I/O. buffer heads will be dirty and possibly
  1122. * delalloc. If no delalloc buffer heads in this case then we
  1123. * can just return zero.
  1124. *
  1125. * 2. We are called to release a page which has been written via
  1126. * mmap; all we need to do is ensure there is no delalloc
  1127. * state in the buffer heads. If there is none, we can let the
  1128. * caller free them and we will come back later via writepage.
  1129. */
  1130. STATIC int
  1131. xfs_vm_releasepage(
  1132. struct page *page,
  1133. gfp_t gfp_mask)
  1134. {
  1135. struct inode *inode = page->mapping->host;
  1136. int dirty, delalloc, unmapped, unwritten;
  1137. struct writeback_control wbc = {
  1138. .sync_mode = WB_SYNC_ALL,
  1139. .nr_to_write = 1,
  1140. };
  1141. xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, 0);
  1142. if (!page_has_buffers(page))
  1143. return 0;
  1144. xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
  1145. if (!delalloc && !unwritten)
  1146. goto free_buffers;
  1147. if (!(gfp_mask & __GFP_FS))
  1148. return 0;
  1149. /* If we are already inside a transaction or the thread cannot
  1150. * do I/O, we cannot release this page.
  1151. */
  1152. if (current_test_flags(PF_FSTRANS))
  1153. return 0;
  1154. /*
  1155. * Convert delalloc space to real space, do not flush the
  1156. * data out to disk, that will be done by the caller.
  1157. * Never need to allocate space here - we will always
  1158. * come back to writepage in that case.
  1159. */
  1160. dirty = xfs_page_state_convert(inode, page, &wbc, 0, 0);
  1161. if (dirty == 0 && !unwritten)
  1162. goto free_buffers;
  1163. return 0;
  1164. free_buffers:
  1165. return try_to_free_buffers(page);
  1166. }
  1167. STATIC int
  1168. __xfs_get_blocks(
  1169. struct inode *inode,
  1170. sector_t iblock,
  1171. struct buffer_head *bh_result,
  1172. int create,
  1173. int direct,
  1174. bmapi_flags_t flags)
  1175. {
  1176. xfs_iomap_t iomap;
  1177. xfs_off_t offset;
  1178. ssize_t size;
  1179. int niomap = 1;
  1180. int error;
  1181. offset = (xfs_off_t)iblock << inode->i_blkbits;
  1182. ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
  1183. size = bh_result->b_size;
  1184. if (!create && direct && offset >= i_size_read(inode))
  1185. return 0;
  1186. error = xfs_iomap(XFS_I(inode), offset, size,
  1187. create ? flags : BMAPI_READ, &iomap, &niomap);
  1188. if (error)
  1189. return -error;
  1190. if (niomap == 0)
  1191. return 0;
  1192. if (iomap.iomap_bn != IOMAP_DADDR_NULL) {
  1193. /*
  1194. * For unwritten extents do not report a disk address in
  1195. * the read case (treat as if we're reading into a hole).
  1196. */
  1197. if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) {
  1198. xfs_map_buffer(bh_result, &iomap, offset,
  1199. inode->i_blkbits);
  1200. }
  1201. if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) {
  1202. if (direct)
  1203. bh_result->b_private = inode;
  1204. set_buffer_unwritten(bh_result);
  1205. }
  1206. }
  1207. /*
  1208. * If this is a realtime file, data may be on a different device
  1209. * to that currently pointed to by the buffer_head's b_bdev.
  1210. */
  1211. bh_result->b_bdev = iomap.iomap_target->bt_bdev;
  1212. /*
  1213. * If we previously allocated a block out beyond eof and we are now
  1214. * coming back to use it then we will need to flag it as new even if it
  1215. * has a disk address.
  1216. *
  1217. * With sub-block writes into unwritten extents we also need to mark
  1218. * the buffer as new so that the unwritten parts of the buffer get
  1219. * correctly zeroed.
  1220. */
  1221. if (create &&
  1222. ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
  1223. (offset >= i_size_read(inode)) ||
  1224. (iomap.iomap_flags & (IOMAP_NEW|IOMAP_UNWRITTEN))))
  1225. set_buffer_new(bh_result);
  1226. if (iomap.iomap_flags & IOMAP_DELAY) {
  1227. BUG_ON(direct);
  1228. if (create) {
  1229. set_buffer_uptodate(bh_result);
  1230. set_buffer_mapped(bh_result);
  1231. set_buffer_delay(bh_result);
  1232. }
  1233. }
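/*
 * For direct I/O, or when more than one block was requested, trim
 * b_size to the smaller of the requested size and what remains of
 * this mapping past @offset (clamped to LONG_MAX), so the caller can
 * map a larger range in one call.
 */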
  1234. if (direct || size > (1 << inode->i_blkbits)) {
  1235. ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0);
  1236. offset = min_t(xfs_off_t,
  1237. iomap.iomap_bsize - iomap.iomap_delta, size);
  1238. bh_result->b_size = (ssize_t)min_t(xfs_off_t, LONG_MAX, offset);
  1239. }
  1240. return 0;
  1241. }
  1242. int
  1243. xfs_get_blocks(
  1244. struct inode *inode,
  1245. sector_t iblock,
  1246. struct buffer_head *bh_result,
  1247. int create)
  1248. {
  1249. return __xfs_get_blocks(inode, iblock,
  1250. bh_result, create, 0, BMAPI_WRITE);
  1251. }
  1252. STATIC int
  1253. xfs_get_blocks_direct(
  1254. struct inode *inode,
  1255. sector_t iblock,
  1256. struct buffer_head *bh_result,
  1257. int create)
  1258. {
  1259. return __xfs_get_blocks(inode, iblock,
  1260. bh_result, create, 1, BMAPI_WRITE|BMAPI_DIRECT);
  1261. }
  1262. STATIC void
  1263. xfs_end_io_direct(
  1264. struct kiocb *iocb,
  1265. loff_t offset,
  1266. ssize_t size,
  1267. void *private)
  1268. {
  1269. xfs_ioend_t *ioend = iocb->private;
  1270. /*
  1271. * Non-NULL private data means we need to issue a transaction to
  1272. * convert a range from unwritten to written extents. This needs
  1273. * to happen from process context but aio+dio I/O completion
  1274. * happens from irq context so we need to defer it to a workqueue.
  1275. * This is not necessary for synchronous direct I/O, but we do
  1276. * it anyway to keep the code uniform and simpler.
  1277. *
  1278. * Well, if only it were that simple. Because synchronous direct I/O
  1279. * requires extent conversion to occur *before* we return to userspace,
  1280. * we have to wait for extent conversion to complete. Look at the
  1281. * iocb that has been passed to us to determine if this is AIO or
  1282. * not. If it is synchronous, tell xfs_finish_ioend() to kick the
  1283. * workqueue and wait for it to complete.
  1284. *
  1285. * The core direct I/O code might be changed to always call the
  1286. * completion handler in the future, in which case all this can
  1287. * go away.
  1288. */
  1289. ioend->io_offset = offset;
  1290. ioend->io_size = size;
  1291. if (ioend->io_type == IOMAP_READ) {
  1292. xfs_finish_ioend(ioend, 0);
  1293. } else if (private && size > 0) {
  1294. xfs_finish_ioend(ioend, is_sync_kiocb(iocb));
  1295. } else {
  1296. /*
  1297. * A direct I/O write ioend starts its life in unwritten
  1298. * state in case it maps an unwritten extent. This write
  1299. * didn't map an unwritten extent, so switch its completion
  1300. * handler.
  1301. */
  1302. INIT_WORK(&ioend->io_work, xfs_end_bio_written);
  1303. xfs_finish_ioend(ioend, 0);
  1304. }
  1305. /*
  1306. * blockdev_direct_IO can return an error even after the I/O
  1307. * completion handler was called. Thus we need to protect
  1308. * against double-freeing.
  1309. */
  1310. iocb->private = NULL;
  1311. }
  1312. STATIC ssize_t
  1313. xfs_vm_direct_IO(
  1314. int rw,
  1315. struct kiocb *iocb,
  1316. const struct iovec *iov,
  1317. loff_t offset,
  1318. unsigned long nr_segs)
  1319. {
  1320. struct file *file = iocb->ki_filp;
  1321. struct inode *inode = file->f_mapping->host;
  1322. struct block_device *bdev;
  1323. ssize_t ret;
  1324. bdev = xfs_find_bdev_for_inode(XFS_I(inode));
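/*
 * Direct I/O writes start with an IOMAP_UNWRITTEN ioend in case they
 * touch unwritten extents; xfs_end_io_direct() switches the completion
 * handler if no unwritten extents were involved.  Reads get a plain
 * IOMAP_READ ioend.
 */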
  1325. if (rw == WRITE) {
  1326. iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
  1327. ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
  1328. bdev, iov, offset, nr_segs,
  1329. xfs_get_blocks_direct,
  1330. xfs_end_io_direct);
  1331. } else {
  1332. iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
  1333. ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
  1334. bdev, iov, offset, nr_segs,
  1335. xfs_get_blocks_direct,
  1336. xfs_end_io_direct);
  1337. }
  1338. if (unlikely(ret != -EIOCBQUEUED && iocb->private))
  1339. xfs_destroy_ioend(iocb->private);
  1340. return ret;
  1341. }
  1342. STATIC int
  1343. xfs_vm_write_begin(
  1344. struct file *file,
  1345. struct address_space *mapping,
  1346. loff_t pos,
  1347. unsigned len,
  1348. unsigned flags,
  1349. struct page **pagep,
  1350. void **fsdata)
  1351. {
  1352. *pagep = NULL;
  1353. return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
  1354. xfs_get_blocks);
  1355. }
  1356. STATIC sector_t
  1357. xfs_vm_bmap(
  1358. struct address_space *mapping,
  1359. sector_t block)
  1360. {
  1361. struct inode *inode = (struct inode *)mapping->host;
  1362. struct xfs_inode *ip = XFS_I(inode);
  1363. xfs_itrace_entry(XFS_I(inode));
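/*
 * Flush dirty and delayed-allocation data before mapping, so that
 * generic_block_bmap() reports real, allocated disk blocks rather
 * than delalloc state.
 */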
  1364. xfs_ilock(ip, XFS_IOLOCK_SHARED);
  1365. xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
  1366. xfs_iunlock(ip, XFS_IOLOCK_SHARED);
  1367. return generic_block_bmap(mapping, block, xfs_get_blocks);
  1368. }
  1369. STATIC int
  1370. xfs_vm_readpage(
  1371. struct file *unused,
  1372. struct page *page)
  1373. {
  1374. return mpage_readpage(page, xfs_get_blocks);
  1375. }
  1376. STATIC int
  1377. xfs_vm_readpages(
  1378. struct file *unused,
  1379. struct address_space *mapping,
  1380. struct list_head *pages,
  1381. unsigned nr_pages)
  1382. {
  1383. return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
  1384. }
  1385. STATIC void
  1386. xfs_vm_invalidatepage(
  1387. struct page *page,
  1388. unsigned long offset)
  1389. {
  1390. xfs_page_trace(XFS_INVALIDPAGE_ENTER,
  1391. page->mapping->host, page, offset);
  1392. block_invalidatepage(page, offset);
  1393. }
  1394. const struct address_space_operations xfs_address_space_operations = {
  1395. .readpage = xfs_vm_readpage,
  1396. .readpages = xfs_vm_readpages,
  1397. .writepage = xfs_vm_writepage,
  1398. .writepages = xfs_vm_writepages,
  1399. .sync_page = block_sync_page,
  1400. .releasepage = xfs_vm_releasepage,
  1401. .invalidatepage = xfs_vm_invalidatepage,
  1402. .write_begin = xfs_vm_write_begin,
  1403. .write_end = generic_write_end,
  1404. .bmap = xfs_vm_bmap,
  1405. .direct_IO = xfs_vm_direct_IO,
  1406. .migratepage = buffer_migrate_page,
  1407. };