xfs_file.c

/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
#include "xfs_trans.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
#include "xfs_attr_sf.h"
#include "xfs_dir2_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
#include "xfs_rw.h"
#include "xfs_vnodeops.h"
#include "xfs_da_btree.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"

#include <linux/dcache.h>

static const struct vm_operations_struct xfs_file_vm_ops;

/*
 * xfs_iozero
 *
 * xfs_iozero clears the specified range of the buffer supplied,
 * and marks all the affected blocks as valid and modified.  If
 * an affected block is not allocated, it will be allocated.  If
 * an affected block is not completely overwritten, and is not
 * valid before the operation, it will be read from disk before
 * being partially zeroed.
 */
STATIC int
xfs_iozero(
	struct xfs_inode	*ip,	/* inode			*/
	loff_t			pos,	/* offset in file		*/
	size_t			count)	/* size of data to zero		*/
{
	struct page		*page;
	struct address_space	*mapping;
	int			status;

	mapping = VFS_I(ip)->i_mapping;
	do {
		unsigned offset, bytes;
		void *fsdata;

		offset = (pos & (PAGE_CACHE_SIZE - 1)); /* Within page */
		bytes = PAGE_CACHE_SIZE - offset;
		if (bytes > count)
			bytes = count;

		status = pagecache_write_begin(NULL, mapping, pos, bytes,
					AOP_FLAG_UNINTERRUPTIBLE,
					&page, &fsdata);
		if (status)
			break;

		zero_user(page, offset, bytes);

		status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
					page, fsdata);
		WARN_ON(status <= 0); /* can't return less than zero! */
		pos += bytes;
		count -= bytes;
		status = 0;
	} while (count);

	return (-status);
}
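
/*
 * Editor's note: the #if 0 block below is an illustrative, stand-alone
 * sketch, not part of the original file.  It reproduces only the
 * per-page chunking arithmetic of the xfs_iozero() loop above (split
 * [pos, pos + count) at page boundaries) so the boundary handling can
 * be verified in user space.  DEMO_PAGE_SIZE and demo_chunks() are
 * invented names; 4096 stands in for PAGE_CACHE_SIZE.
 */
#if 0
#include <stdio.h>

#define DEMO_PAGE_SIZE	4096UL		/* stand-in for PAGE_CACHE_SIZE */

static void demo_chunks(unsigned long pos, unsigned long count)
{
	while (count) {
		/* offset of pos within its page, as in xfs_iozero() */
		unsigned long offset = pos & (DEMO_PAGE_SIZE - 1);
		unsigned long bytes = DEMO_PAGE_SIZE - offset;

		if (bytes > count)
			bytes = count;
		printf("zero %5lu bytes at %5lu (page %lu)\n",
		       bytes, pos, pos / DEMO_PAGE_SIZE);
		pos += bytes;
		count -= bytes;
	}
}

int main(void)
{
	demo_chunks(4000, 9000);	/* starts and ends mid-page */
	return 0;
}
#endif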

/*
 * We ignore the datasync flag here because a datasync is effectively
 * identical to an fsync.  That is, datasync implies that we need to write
 * only the metadata needed to be able to access the data that is written
 * if we crash after the call completes.  Hence if we are writing beyond
 * EOF we have to log the inode size change as well, which makes it a
 * full fsync.  If we don't write beyond EOF, the inode core will be
 * clean in memory and so we don't need to log the inode, just like
 * fsync.
 */
STATIC int
xfs_file_fsync(
	struct file		*file,
	struct dentry		*dentry,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(dentry->d_inode);
	struct xfs_trans	*tp;
	int			error = 0;
	int			log_flushed = 0;

	xfs_itrace_entry(ip);

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -XFS_ERROR(EIO);

	xfs_iflags_clear(ip, XFS_ITRUNCATED);

	/*
	 * We always need to make sure that the required inode state is safe on
	 * disk.  The inode might be clean but we still might need to force the
	 * log because of committed transactions that haven't hit the disk yet.
	 * Likewise, there could be unflushed non-transactional changes to the
	 * inode core that have to go to disk and this requires us to issue
	 * a synchronous transaction to capture these changes correctly.
	 *
	 * This code relies on the assumption that if the i_update_core field
	 * of the inode is clear and the inode is unpinned then it is clean
	 * and no action is required.
	 */
	xfs_ilock(ip, XFS_ILOCK_SHARED);

	if (ip->i_update_core) {
		/*
		 * Kick off a transaction to log the inode core to get the
		 * updates.  The sync transaction will also force the log.
		 */
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
		error = xfs_trans_reserve(tp, 0,
				XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
		if (error) {
			xfs_trans_cancel(tp, 0);
			return -error;
		}
		xfs_ilock(ip, XFS_ILOCK_EXCL);

		/*
		 * Note - it's possible that we might have pushed ourselves out
		 * of the way during trans_reserve which would flush the inode.
		 * But there's no guarantee that the inode buffer has actually
		 * gone out yet (it's delwri).  Plus the buffer could be pinned
		 * anyway if it's part of an inode in another recent
		 * transaction.  So we play it safe and fire off the
		 * transaction anyway.
		 */
		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);
		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		xfs_trans_set_sync(tp);
		error = _xfs_trans_commit(tp, 0, &log_flushed);

		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	} else {
		/*
		 * Timestamps/size haven't changed since last inode flush or
		 * inode transaction commit.  That means either nothing got
		 * written or a transaction committed which caught the updates.
		 * If the latter happened and the transaction hasn't hit the
		 * disk yet, the inode will still be pinned.  If it is,
		 * force the log.
		 */
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		if (xfs_ipincount(ip)) {
			if (ip->i_itemp->ili_last_lsn) {
				error = _xfs_log_force_lsn(ip->i_mount,
						ip->i_itemp->ili_last_lsn,
						XFS_LOG_SYNC, &log_flushed);
			} else {
				error = _xfs_log_force(ip->i_mount,
						XFS_LOG_SYNC, &log_flushed);
			}
		}
	}

	if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
		/*
		 * If the log write didn't issue an ordered tag we need
		 * to flush the disk cache for the data device now.
		 */
		if (!log_flushed)
			xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);

		/*
		 * If this inode is on the RT dev we need to flush that
		 * cache as well.
		 */
		if (XFS_IS_REALTIME_INODE(ip))
			xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
	}

	return -error;
}
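
/*
 * Editor's note: illustrative user-space sketch, not part of the
 * original file.  As the comment above xfs_file_fsync() explains, XFS
 * treats fdatasync() the same as fsync(); both system calls end up in
 * this handler and only the datasync flag differs.  The demo file path
 * is hypothetical.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/xfs-fsync-demo", O_WRONLY | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	if (write(fd, "data", 4) != 4)
		perror("write");
	/* both calls reach xfs_file_fsync(); only 'datasync' differs */
	if (fsync(fd) < 0)
		perror("fsync");
	if (fdatasync(fd) < 0)
		perror("fdatasync");
	close(fd);
	return 0;
}
#endif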

STATIC ssize_t
xfs_file_aio_read(
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned long		nr_segs,
	loff_t			pos)
{
	struct file		*file = iocb->ki_filp;
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	size_t			size = 0;
	ssize_t			ret = 0;
	int			ioflags = 0;
	xfs_fsize_t		n;
	unsigned long		seg;

	XFS_STATS_INC(xs_read_calls);

	BUG_ON(iocb->ki_pos != pos);

	if (unlikely(file->f_flags & O_DIRECT))
		ioflags |= IO_ISDIRECT;
	if (file->f_mode & FMODE_NOCMTIME)
		ioflags |= IO_INVIS;

	/* START copy & waste from filemap.c */
	for (seg = 0; seg < nr_segs; seg++) {
		const struct iovec *iv = &iovp[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		size += iv->iov_len;
		if (unlikely((ssize_t)(size|iv->iov_len) < 0))
			return XFS_ERROR(-EINVAL);
	}
	/* END copy & waste from filemap.c */

	if (unlikely(ioflags & IO_ISDIRECT)) {
		xfs_buftarg_t	*target =
			XFS_IS_REALTIME_INODE(ip) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;
		if ((iocb->ki_pos & target->bt_smask) ||
		    (size & target->bt_smask)) {
			if (iocb->ki_pos == ip->i_size)
				return 0;
			return -XFS_ERROR(EINVAL);
		}
	}

	n = XFS_MAXIOFFSET(mp) - iocb->ki_pos;
	if (n <= 0 || size == 0)
		return 0;

	if (n < size)
		size = n;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	if (unlikely(ioflags & IO_ISDIRECT))
		mutex_lock(&inode->i_mutex);
	xfs_ilock(ip, XFS_IOLOCK_SHARED);

	if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
		int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
		int iolock = XFS_IOLOCK_SHARED;

		ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, iocb->ki_pos, size,
					dmflags, &iolock);
		if (ret) {
			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
			if (unlikely(ioflags & IO_ISDIRECT))
				mutex_unlock(&inode->i_mutex);
			return ret;
		}
	}

	if (unlikely(ioflags & IO_ISDIRECT)) {
		if (inode->i_mapping->nrpages) {
			ret = -xfs_flushinval_pages(ip,
					(iocb->ki_pos & PAGE_CACHE_MASK),
					-1, FI_REMAPF_LOCKED);
		}
		mutex_unlock(&inode->i_mutex);
		if (ret) {
			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
			return ret;
		}
	}

	trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);

	ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
	if (ret > 0)
		XFS_STATS_ADD(xs_read_bytes, ret);

	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return ret;
}
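
/*
 * Editor's note: illustrative sketch, not part of the original file.
 * The direct I/O path above rejects reads whose offset or total length
 * is not sector aligned by testing against bt_smask (sector size minus
 * one).  This stand-alone demo mirrors just that mask test;
 * DEMO_SECTOR_MASK and demo_aligned() are invented names and 512 is an
 * assumed sector size.
 */
#if 0
#include <stdio.h>

#define DEMO_SECTOR_MASK	(512UL - 1)	/* stand-in for bt_smask */

static int demo_aligned(unsigned long pos, unsigned long len)
{
	/* mirrors the (ki_pos & bt_smask) || (size & bt_smask) test */
	return !((pos & DEMO_SECTOR_MASK) || (len & DEMO_SECTOR_MASK));
}

int main(void)
{
	printf("%d\n", demo_aligned(0, 4096));	/* 1: aligned    */
	printf("%d\n", demo_aligned(512, 300));	/* 0: bad length */
	printf("%d\n", demo_aligned(100, 512));	/* 0: bad offset */
	return 0;
}
#endif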

STATIC ssize_t
xfs_file_splice_read(
	struct file		*infilp,
	loff_t			*ppos,
	struct pipe_inode_info	*pipe,
	size_t			count,
	unsigned int		flags)
{
	struct xfs_inode	*ip = XFS_I(infilp->f_mapping->host);
	struct xfs_mount	*mp = ip->i_mount;
	int			ioflags = 0;
	ssize_t			ret;

	XFS_STATS_INC(xs_read_calls);

	if (infilp->f_mode & FMODE_NOCMTIME)
		ioflags |= IO_INVIS;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EIO;

	xfs_ilock(ip, XFS_IOLOCK_SHARED);

	if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
		int iolock = XFS_IOLOCK_SHARED;
		int error;

		error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
					FILP_DELAY_FLAG(infilp), &iolock);
		if (error) {
			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
			return -error;
		}
	}

	trace_xfs_file_splice_read(ip, count, *ppos, ioflags);

	ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
	if (ret > 0)
		XFS_STATS_ADD(xs_read_bytes, ret);

	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return ret;
}

STATIC ssize_t
xfs_file_splice_write(
	struct pipe_inode_info	*pipe,
	struct file		*outfilp,
	loff_t			*ppos,
	size_t			count,
	unsigned int		flags)
{
	struct inode		*inode = outfilp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fsize_t		isize, new_size;
	int			ioflags = 0;
	ssize_t			ret;

	XFS_STATS_INC(xs_write_calls);

	if (outfilp->f_mode & FMODE_NOCMTIME)
		ioflags |= IO_INVIS;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EIO;

	xfs_ilock(ip, XFS_IOLOCK_EXCL);

	if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
		int iolock = XFS_IOLOCK_EXCL;
		int error;

		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
					FILP_DELAY_FLAG(outfilp), &iolock);
		if (error) {
			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
			return -error;
		}
	}

	new_size = *ppos + count;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if (new_size > ip->i_size)
		ip->i_new_size = new_size;
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	trace_xfs_file_splice_write(ip, count, *ppos, ioflags);

	ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
	if (ret > 0)
		XFS_STATS_ADD(xs_write_bytes, ret);

	isize = i_size_read(inode);
	if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
		*ppos = isize;

	if (*ppos > ip->i_size) {
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		if (*ppos > ip->i_size)
			ip->i_size = *ppos;
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}

	if (ip->i_new_size) {
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		ip->i_new_size = 0;
		if (ip->i_d.di_size > ip->i_size)
			ip->i_d.di_size = ip->i_size;
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	return ret;
}
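
/*
 * Editor's note: illustrative sketch, not part of the original file.
 * The *ppos > ip->i_size update above uses a classic double-checked
 * pattern: test without the lock for the common case, then re-test
 * under XFS_ILOCK_EXCL before storing.  This user-space analogue uses
 * a pthread mutex; demo_isize, demo_ilock and demo_update_isize() are
 * invented names.  Build with: cc demo.c -lpthread
 */
#if 0
#include <pthread.h>
#include <stdio.h>

static long		demo_isize;	/* stand-in for ip->i_size */
static pthread_mutex_t	demo_ilock = PTHREAD_MUTEX_INITIALIZER;

static void demo_update_isize(long new_pos)
{
	/* cheap unlocked check first ... */
	if (new_pos > demo_isize) {
		pthread_mutex_lock(&demo_ilock);
		/* ... re-checked under the lock before the store */
		if (new_pos > demo_isize)
			demo_isize = new_pos;
		pthread_mutex_unlock(&demo_ilock);
	}
}

int main(void)
{
	demo_update_isize(8192);
	printf("size is now %ld\n", demo_isize);
	return 0;
}
#endif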

/*
 * This routine is called to handle zeroing any space in the last
 * block of the file that is beyond the EOF.  We do this since the
 * size is being increased without writing anything to that block
 * and we don't want anyone to read the garbage on the disk.
 */
STATIC int				/* error (positive) */
xfs_zero_last_block(
	xfs_inode_t	*ip,
	xfs_fsize_t	offset,
	xfs_fsize_t	isize)
{
	xfs_fileoff_t	last_fsb;
	xfs_mount_t	*mp = ip->i_mount;
	int		nimaps;
	int		zero_offset;
	int		zero_len;
	int		error = 0;
	xfs_bmbt_irec_t	imap;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));

	zero_offset = XFS_B_FSB_OFFSET(mp, isize);
	if (zero_offset == 0) {
		/*
		 * There are no extra bytes in the last block on disk to
		 * zero, so return.
		 */
		return 0;
	}

	last_fsb = XFS_B_TO_FSBT(mp, isize);
	nimaps = 1;
	error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
			  &nimaps, NULL, NULL);
	if (error) {
		return error;
	}
	ASSERT(nimaps > 0);
	/*
	 * If the block underlying isize is just a hole, then there
	 * is nothing to zero.
	 */
	if (imap.br_startblock == HOLESTARTBLOCK) {
		return 0;
	}
	/*
	 * Zero the part of the last block beyond the EOF, and write it
	 * out sync.  We need to drop the ilock while we do this so we
	 * don't deadlock when the buffer cache calls back to us.
	 */
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	zero_len = mp->m_sb.sb_blocksize - zero_offset;
	if (isize + zero_len > offset)
		zero_len = offset - isize;
	error = xfs_iozero(ip, isize, zero_len);

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	ASSERT(error >= 0);
	return error;
}
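
/*
 * Editor's note: illustrative sketch, not part of the original file.
 * It replays the zero_offset/zero_len arithmetic of
 * xfs_zero_last_block() with a hypothetical 4096-byte block so the two
 * interesting cases (EOF mid-block, EOF on a block boundary) are easy
 * to see.  DEMO_BLOCKSIZE and demo_last_block() are invented names.
 */
#if 0
#include <stdio.h>

#define DEMO_BLOCKSIZE	4096UL	/* stand-in for mp->m_sb.sb_blocksize */

static void demo_last_block(unsigned long isize, unsigned long offset)
{
	unsigned long zero_offset = isize & (DEMO_BLOCKSIZE - 1);
	unsigned long zero_len;

	if (zero_offset == 0) {
		printf("isize %lu: block aligned, nothing to zero\n", isize);
		return;
	}
	zero_len = DEMO_BLOCKSIZE - zero_offset;
	if (isize + zero_len > offset)	/* don't zero past the new offset */
		zero_len = offset - isize;
	printf("isize %lu: zero %lu bytes at %lu\n", isize, zero_len, isize);
}

int main(void)
{
	demo_last_block(6000, 20000);	/* EOF lands mid-block     */
	demo_last_block(8192, 20000);	/* EOF on a block boundary */
	return 0;
}
#endif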

/*
 * Zero any on disk space between the current EOF and the new,
 * larger EOF.  This handles the normal case of zeroing the remainder
 * of the last block in the file and the unusual case of zeroing blocks
 * out beyond the size of the file.  This second case only happens
 * with fixed size extents and when the system crashes before the inode
 * size was updated but after blocks were allocated.  If fill is set,
 * then any holes in the range are filled and zeroed.  If not, the holes
 * are left alone as holes.
 */
int					/* error (positive) */
xfs_zero_eof(
	xfs_inode_t	*ip,
	xfs_off_t	offset,		/* starting I/O offset */
	xfs_fsize_t	isize)		/* current inode size */
{
	xfs_mount_t	*mp = ip->i_mount;
	xfs_fileoff_t	start_zero_fsb;
	xfs_fileoff_t	end_zero_fsb;
	xfs_fileoff_t	zero_count_fsb;
	xfs_fileoff_t	last_fsb;
	xfs_fileoff_t	zero_off;
	xfs_fsize_t	zero_len;
	int		nimaps;
	int		error = 0;
	xfs_bmbt_irec_t	imap;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
	ASSERT(offset > isize);

	/*
	 * First handle zeroing the block on which isize resides.
	 * We only zero a part of that block so it is handled specially.
	 */
	error = xfs_zero_last_block(ip, offset, isize);
	if (error) {
		ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
		return error;
	}

	/*
	 * Calculate the range between the new size and the old
	 * where blocks needing to be zeroed may exist.  To get the
	 * block where the last byte in the file currently resides,
	 * we need to subtract one from the size and truncate back
	 * to a block boundary.  We subtract 1 in case the size is
	 * exactly on a block boundary.
	 */
	last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
	start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
	end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
	ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
	if (last_fsb == end_zero_fsb) {
		/*
		 * The size was only incremented on its last block.
		 * We took care of that above, so just return.
		 */
		return 0;
	}

	ASSERT(start_zero_fsb <= end_zero_fsb);
	while (start_zero_fsb <= end_zero_fsb) {
		nimaps = 1;
		zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
		error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
				  0, NULL, 0, &imap, &nimaps, NULL, NULL);
		if (error) {
			ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
			return error;
		}
		ASSERT(nimaps > 0);

		if (imap.br_state == XFS_EXT_UNWRITTEN ||
		    imap.br_startblock == HOLESTARTBLOCK) {
			/*
			 * This loop handles initializing pages that were
			 * partially initialized by the code below this loop.
			 * It basically zeroes the part of the page that sits
			 * on a hole and sets the page as P_HOLE and calls
			 * remapf if it is a mapped file.
			 */
			start_zero_fsb = imap.br_startoff + imap.br_blockcount;
			ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
			continue;
		}

		/*
		 * There are blocks we need to zero.
		 * Drop the inode lock while we're doing the I/O.
		 * We'll still have the iolock to protect us.
		 */
		xfs_iunlock(ip, XFS_ILOCK_EXCL);

		zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
		zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);

		if ((zero_off + zero_len) > offset)
			zero_len = offset - zero_off;

		error = xfs_iozero(ip, zero_off, zero_len);
		if (error) {
			goto out_lock;
		}

		start_zero_fsb = imap.br_startoff + imap.br_blockcount;
		ASSERT(start_zero_fsb <= (end_zero_fsb + 1));

		xfs_ilock(ip, XFS_ILOCK_EXCL);
	}

	return 0;

out_lock:
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	ASSERT(error >= 0);
	return error;
}
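
/*
 * Editor's note: illustrative sketch, not part of the original file.
 * xfs_zero_eof() above brackets the range of blocks that may need
 * zeroing with a round-down conversion (XFS_B_TO_FSBT) for the last
 * byte currently in the file and a round-up conversion (XFS_B_TO_FSB)
 * for the first block past isize.  The demo reimplements both
 * conversions for an assumed 4 KiB block; all names are invented.
 */
#if 0
#include <stdio.h>

#define DEMO_BLKSHIFT	12	/* assumed 4096-byte filesystem blocks */

/* round-down byte-to-block, like XFS_B_TO_FSBT() */
static unsigned long b_to_fsbt(unsigned long b)
{
	return b >> DEMO_BLKSHIFT;
}

/* round-up byte-to-block, like XFS_B_TO_FSB() */
static unsigned long b_to_fsb(unsigned long b)
{
	return (b + (1UL << DEMO_BLKSHIFT) - 1) >> DEMO_BLKSHIFT;
}

int main(void)
{
	unsigned long isize = 6000, offset = 20000;

	printf("last_fsb       = %lu\n", b_to_fsbt(isize - 1));	/* 1 */
	printf("start_zero_fsb = %lu\n", b_to_fsb(isize));	/* 2 */
	printf("end_zero_fsb   = %lu\n", b_to_fsbt(offset - 1));/* 4 */
	return 0;
}
#endif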

STATIC ssize_t
xfs_file_aio_write(
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned long		nr_segs,
	loff_t			pos)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			ret = 0, error = 0;
	int			ioflags = 0;
	xfs_fsize_t		isize, new_size;
	int			iolock;
	int			eventsent = 0;
	size_t			ocount = 0, count;
	int			need_i_mutex;

	XFS_STATS_INC(xs_write_calls);

	BUG_ON(iocb->ki_pos != pos);

	if (unlikely(file->f_flags & O_DIRECT))
		ioflags |= IO_ISDIRECT;
	if (file->f_mode & FMODE_NOCMTIME)
		ioflags |= IO_INVIS;

	error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
	if (error)
		return error;

	count = ocount;
	if (count == 0)
		return 0;

	xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

relock:
	if (ioflags & IO_ISDIRECT) {
		iolock = XFS_IOLOCK_SHARED;
		need_i_mutex = 0;
	} else {
		iolock = XFS_IOLOCK_EXCL;
		need_i_mutex = 1;
		mutex_lock(&inode->i_mutex);
	}

	xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);

start:
	error = -generic_write_checks(file, &pos, &count,
					S_ISBLK(inode->i_mode));
	if (error) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
		goto out_unlock_mutex;
	}

	if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) &&
	    !(ioflags & IO_INVIS) && !eventsent)) {
		int		dmflags = FILP_DELAY_FLAG(file);

		if (need_i_mutex)
			dmflags |= DM_FLAGS_IMUX;

		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip,
				      pos, count, dmflags, &iolock);
		if (error) {
			goto out_unlock_internal;
		}
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		eventsent = 1;

		/*
		 * The iolock was dropped and reacquired in XFS_SEND_DATA
		 * so we have to recheck the size when appending.
		 * We will only "goto start;" once, since having sent the
		 * event prevents another call to XFS_SEND_DATA, which is
		 * what allows the size to change in the first place.
		 */
		if ((file->f_flags & O_APPEND) && pos != ip->i_size)
			goto start;
	}

	if (ioflags & IO_ISDIRECT) {
		xfs_buftarg_t	*target =
			XFS_IS_REALTIME_INODE(ip) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;

		if ((pos & target->bt_smask) || (count & target->bt_smask)) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
			return XFS_ERROR(-EINVAL);
		}

		if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
			iolock = XFS_IOLOCK_EXCL;
			need_i_mutex = 1;
			mutex_lock(&inode->i_mutex);
			xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
			goto start;
		}
	}

	new_size = pos + count;
	if (new_size > ip->i_size)
		ip->i_new_size = new_size;

	if (likely(!(ioflags & IO_INVIS)))
		file_update_time(file);

	/*
	 * If the offset is beyond the size of the file, we have a couple
	 * of things to do. First, if there is already space allocated
	 * we need to either create holes or zero the disk or ...
	 *
	 * If there is a page where the previous size lands, we need
	 * to zero it out up to the new size.
	 */
	if (pos > ip->i_size) {
		error = xfs_zero_eof(ip, pos, ip->i_size);
		if (error) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
			goto out_unlock_internal;
		}
	}
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	/*
	 * If we're writing the file then make sure to clear the
	 * setuid and setgid bits if the process is not being run
	 * by root.  This keeps people from modifying setuid and
	 * setgid binaries.
	 */
	error = -file_remove_suid(file);
	if (unlikely(error))
		goto out_unlock_internal;

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = mapping->backing_dev_info;

	if ((ioflags & IO_ISDIRECT)) {
		if (mapping->nrpages) {
			WARN_ON(need_i_mutex == 0);
			error = xfs_flushinval_pages(ip,
					(pos & PAGE_CACHE_MASK),
					-1, FI_REMAPF_LOCKED);
			if (error)
				goto out_unlock_internal;
		}

		if (need_i_mutex) {
			/* demote the lock now the cached pages are gone */
			xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
			mutex_unlock(&inode->i_mutex);
			iolock = XFS_IOLOCK_SHARED;
			need_i_mutex = 0;
		}

		trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags);
		ret = generic_file_direct_write(iocb, iovp,
				&nr_segs, pos, &iocb->ki_pos, count, ocount);

		/*
		 * direct-io write to a hole: fall through to buffered I/O
		 * for completing the rest of the request.
		 */
		if (ret >= 0 && ret != count) {
			XFS_STATS_ADD(xs_write_bytes, ret);

			pos += ret;
			count -= ret;

			ioflags &= ~IO_ISDIRECT;
			xfs_iunlock(ip, iolock);
			goto relock;
		}
	} else {
		int enospc = 0;
		ssize_t ret2 = 0;

write_retry:
		trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags);
		ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
				pos, &iocb->ki_pos, count, ret);
		/*
		 * if we just got an ENOSPC, flush the inode now we
		 * aren't holding any page locks and retry *once*
		 */
		if (ret2 == -ENOSPC && !enospc) {
			error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
			if (error)
				goto out_unlock_internal;
			enospc = 1;
			goto write_retry;
		}
		ret = ret2;
	}

	current->backing_dev_info = NULL;

	isize = i_size_read(inode);
	if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize))
		iocb->ki_pos = isize;

	if (iocb->ki_pos > ip->i_size) {
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		if (iocb->ki_pos > ip->i_size)
			ip->i_size = iocb->ki_pos;
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}

	if (ret == -ENOSPC &&
	    DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
		xfs_iunlock(ip, iolock);
		if (need_i_mutex)
			mutex_unlock(&inode->i_mutex);
		error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip,
				DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL,
				0, 0, 0); /* Delay flag intentionally unused */
		if (need_i_mutex)
			mutex_lock(&inode->i_mutex);
		xfs_ilock(ip, iolock);
		if (error)
			goto out_unlock_internal;
		goto start;
	}

	error = -ret;
	if (ret <= 0)
		goto out_unlock_internal;

	XFS_STATS_ADD(xs_write_bytes, ret);

	/* Handle various SYNC-type writes */
	if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
		loff_t end = pos + ret - 1;
		int error2;

		xfs_iunlock(ip, iolock);
		if (need_i_mutex)
			mutex_unlock(&inode->i_mutex);

		error2 = filemap_write_and_wait_range(mapping, pos, end);
		if (!error)
			error = error2;
		if (need_i_mutex)
			mutex_lock(&inode->i_mutex);
		xfs_ilock(ip, iolock);

		error2 = -xfs_file_fsync(file, file->f_path.dentry,
					(file->f_flags & __O_SYNC) ? 0 : 1);
		if (!error)
			error = error2;
	}

 out_unlock_internal:
	if (ip->i_new_size) {
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		ip->i_new_size = 0;
		/*
		 * If this was a direct or synchronous I/O that failed (such
		 * as ENOSPC) then part of the I/O may have been written to
		 * disk before the error occurred.  In this case the on-disk
		 * file size may have been adjusted beyond the in-memory file
		 * size and now needs to be truncated back.
		 */
		if (ip->i_d.di_size > ip->i_size)
			ip->i_d.di_size = ip->i_size;
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}
	xfs_iunlock(ip, iolock);
 out_unlock_mutex:
	if (need_i_mutex)
		mutex_unlock(&inode->i_mutex);
	return -error;
}
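
/*
 * Editor's note: illustrative sketch, not part of the original file.
 * The buffered path of xfs_file_aio_write() retries exactly once on
 * ENOSPC after flushing dirty pages, since the flush may free delalloc
 * reservations.  This stand-alone demo mirrors only that retry-once
 * control flow; demo_try_write() and demo_flush() are invented stubs.
 */
#if 0
#include <errno.h>
#include <stdio.h>

static int demo_try_write(int attempt)
{
	/* pretend the first attempt hits ENOSPC, the retry succeeds */
	return attempt ? 42 : -ENOSPC;
}

static int demo_flush(void)
{
	return 0;	/* pretend flushing dirty pages succeeded */
}

static int demo_write(void)
{
	int enospc = 0, ret;

retry:
	ret = demo_try_write(enospc);
	if (ret == -ENOSPC && !enospc) {
		if (demo_flush())
			return -EIO;
		enospc = 1;	/* flush and retry exactly once */
		goto retry;
	}
	return ret;
}

int main(void)
{
	printf("wrote %d bytes\n", demo_write());
	return 0;
}
#endif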

STATIC int
xfs_file_open(
	struct inode	*inode,
	struct file	*file)
{
	if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
		return -EFBIG;
	if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
		return -EIO;
	return 0;
}

STATIC int
xfs_dir_open(
	struct inode	*inode,
	struct file	*file)
{
	struct xfs_inode *ip = XFS_I(inode);
	int		mode;
	int		error;

	error = xfs_file_open(inode, file);
	if (error)
		return error;

	/*
	 * If there are any blocks, read-ahead block 0 as we're almost
	 * certain to have the next operation be a read there.
	 */
	mode = xfs_ilock_map_shared(ip);
	if (ip->i_d.di_nextents > 0)
		xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
	xfs_iunlock(ip, mode);
	return 0;
}

STATIC int
xfs_file_release(
	struct inode	*inode,
	struct file	*filp)
{
	return -xfs_release(XFS_I(inode));
}

STATIC int
xfs_file_readdir(
	struct file	*filp,
	void		*dirent,
	filldir_t	filldir)
{
	struct inode	*inode = filp->f_path.dentry->d_inode;
	xfs_inode_t	*ip = XFS_I(inode);
	int		error;
	size_t		bufsize;

	/*
	 * The Linux API doesn't pass the total size of the buffer
	 * we read into down to the filesystem.  With the filldir concept
	 * it's not needed for correct information, but the XFS dir2 leaf
	 * code wants an estimate of the buffer size to calculate its
	 * readahead window and size the buffers used for mapping to
	 * physical blocks.
	 *
	 * Try to give it an estimate that's good enough, maybe at some
	 * point we can change the ->readdir prototype to include the
	 * buffer size.  For now we use the current glibc buffer size.
	 */
	bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);

	error = xfs_readdir(ip, dirent, bufsize,
				(xfs_off_t *)&filp->f_pos, filldir);
	if (error)
		return -error;
	return 0;
}
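
/*
 * Editor's note: illustrative sketch, not part of the original file.
 * It restates the bufsize heuristic above: estimate the readdir buffer
 * as the directory's own size, capped at the 32 KiB glibc buffer size.
 * demo_bufsize() is an invented name.
 */
#if 0
#include <stdio.h>

static unsigned long demo_bufsize(unsigned long long di_size)
{
	/* min(32768, di_size), as the min_t() above computes */
	return di_size < 32768 ? (unsigned long)di_size : 32768;
}

int main(void)
{
	printf("%lu\n", demo_bufsize(1200));	/* small dir: its size */
	printf("%lu\n", demo_bufsize(1 << 20));	/* large dir: capped   */
	return 0;
}
#endif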

STATIC int
xfs_file_mmap(
	struct file	*filp,
	struct vm_area_struct *vma)
{
	vma->vm_ops = &xfs_file_vm_ops;
	vma->vm_flags |= VM_CAN_NONLINEAR;

	file_accessed(filp);
	return 0;
}

/*
 * mmap()d file has taken write protection fault and is being made
 * writable.  We can set the page state up correctly for a writable
 * page, which means we can do correct delalloc accounting (ENOSPC
 * checking!) and unwritten extent mapping.
 */
STATIC int
xfs_vm_page_mkwrite(
	struct vm_area_struct	*vma,
	struct vm_fault		*vmf)
{
	return block_page_mkwrite(vma, vmf, xfs_get_blocks);
}

const struct file_operations xfs_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.write		= do_sync_write,
	.aio_read	= xfs_file_aio_read,
	.aio_write	= xfs_file_aio_write,
	.splice_read	= xfs_file_splice_read,
	.splice_write	= xfs_file_splice_write,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.mmap		= xfs_file_mmap,
	.open		= xfs_file_open,
	.release	= xfs_file_release,
	.fsync		= xfs_file_fsync,
#ifdef HAVE_FOP_OPEN_EXEC
	.open_exec	= xfs_file_open_exec,
#endif
};

const struct file_operations xfs_dir_file_operations = {
	.open		= xfs_dir_open,
	.read		= generic_read_dir,
	.readdir	= xfs_file_readdir,
	.llseek		= generic_file_llseek,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.fsync		= xfs_file_fsync,
};

static const struct vm_operations_struct xfs_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= xfs_vm_page_mkwrite,
};