read_write.c 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270
  1. /*
  2. * linux/fs/read_write.c
  3. *
  4. * Copyright (C) 1991, 1992 Linus Torvalds
  5. */
  6. #include <linux/slab.h>
  7. #include <linux/stat.h>
  8. #include <linux/fcntl.h>
  9. #include <linux/file.h>
  10. #include <linux/uio.h>
  11. #include <linux/aio.h>
  12. #include <linux/fsnotify.h>
  13. #include <linux/security.h>
  14. #include <linux/export.h>
  15. #include <linux/syscalls.h>
  16. #include <linux/pagemap.h>
  17. #include <linux/splice.h>
  18. #include <linux/compat.h>
  19. #include "internal.h"
  20. #include <asm/uaccess.h>
  21. #include <asm/unistd.h>
  22. typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
  23. typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
  24. unsigned long, loff_t);
  25. const struct file_operations generic_ro_fops = {
  26. .llseek = generic_file_llseek,
  27. .read = do_sync_read,
  28. .aio_read = generic_file_aio_read,
  29. .mmap = generic_file_readonly_mmap,
  30. .splice_read = generic_file_splice_read,
  31. };
  32. EXPORT_SYMBOL(generic_ro_fops);
  33. static inline int unsigned_offsets(struct file *file)
  34. {
  35. return file->f_mode & FMODE_UNSIGNED_OFFSET;
  36. }
  37. /**
  38. * vfs_setpos - update the file offset for lseek
  39. * @file: file structure in question
  40. * @offset: file offset to seek to
  41. * @maxsize: maximum file size
  42. *
  43. * This is a low-level filesystem helper for updating the file offset to
  44. * the value specified by @offset if the given offset is valid and it is
  45. * not equal to the current file offset.
  46. *
  47. * Return the specified offset on success and -EINVAL on invalid offset.
  48. */
  49. loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
  50. {
  51. if (offset < 0 && !unsigned_offsets(file))
  52. return -EINVAL;
  53. if (offset > maxsize)
  54. return -EINVAL;
  55. if (offset != file->f_pos) {
  56. file->f_pos = offset;
  57. file->f_version = 0;
  58. }
  59. return offset;
  60. }
  61. EXPORT_SYMBOL(vfs_setpos);
  62. /**
  63. * generic_file_llseek_size - generic llseek implementation for regular files
  64. * @file: file structure to seek on
  65. * @offset: file offset to seek to
  66. * @whence: type of seek
  67. * @size: max size of this file in file system
  68. * @eof: offset used for SEEK_END position
  69. *
  70. * This is a variant of generic_file_llseek that allows passing in a custom
  71. * maximum file size and a custom EOF position, for e.g. hashed directories
  72. *
  73. * Synchronization:
  74. * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  75. * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  76. * read/writes behave like SEEK_SET against seeks.
  77. */
  78. loff_t
  79. generic_file_llseek_size(struct file *file, loff_t offset, int whence,
  80. loff_t maxsize, loff_t eof)
  81. {
  82. switch (whence) {
  83. case SEEK_END:
  84. offset += eof;
  85. break;
  86. case SEEK_CUR:
  87. /*
  88. * Here we special-case the lseek(fd, 0, SEEK_CUR)
  89. * position-querying operation. Avoid rewriting the "same"
  90. * f_pos value back to the file because a concurrent read(),
  91. * write() or lseek() might have altered it
  92. */
  93. if (offset == 0)
  94. return file->f_pos;
  95. /*
  96. * f_lock protects against read/modify/write race with other
  97. * SEEK_CURs. Note that parallel writes and reads behave
  98. * like SEEK_SET.
  99. */
  100. spin_lock(&file->f_lock);
  101. offset = vfs_setpos(file, file->f_pos + offset, maxsize);
  102. spin_unlock(&file->f_lock);
  103. return offset;
  104. case SEEK_DATA:
  105. /*
  106. * In the generic case the entire file is data, so as long as
  107. * offset isn't at the end of the file then the offset is data.
  108. */
  109. if (offset >= eof)
  110. return -ENXIO;
  111. break;
  112. case SEEK_HOLE:
  113. /*
  114. * There is a virtual hole at the end of the file, so as long as
  115. * offset isn't i_size or larger, return i_size.
  116. */
  117. if (offset >= eof)
  118. return -ENXIO;
  119. offset = eof;
  120. break;
  121. }
  122. return vfs_setpos(file, offset, maxsize);
  123. }
  124. EXPORT_SYMBOL(generic_file_llseek_size);
  125. /**
  126. * generic_file_llseek - generic llseek implementation for regular files
  127. * @file: file structure to seek on
  128. * @offset: file offset to seek to
  129. * @whence: type of seek
  130. *
  131. * This is a generic implemenation of ->llseek useable for all normal local
  132. * filesystems. It just updates the file offset to the value specified by
  133. * @offset and @whence.
  134. */
  135. loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
  136. {
  137. struct inode *inode = file->f_mapping->host;
  138. return generic_file_llseek_size(file, offset, whence,
  139. inode->i_sb->s_maxbytes,
  140. i_size_read(inode));
  141. }
  142. EXPORT_SYMBOL(generic_file_llseek);
  143. /**
  144. * fixed_size_llseek - llseek implementation for fixed-sized devices
  145. * @file: file structure to seek on
  146. * @offset: file offset to seek to
  147. * @whence: type of seek
  148. * @size: size of the file
  149. *
  150. */
  151. loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
  152. {
  153. switch (whence) {
  154. case SEEK_SET: case SEEK_CUR: case SEEK_END:
  155. return generic_file_llseek_size(file, offset, whence,
  156. size, size);
  157. default:
  158. return -EINVAL;
  159. }
  160. }
  161. EXPORT_SYMBOL(fixed_size_llseek);
  162. /**
  163. * noop_llseek - No Operation Performed llseek implementation
  164. * @file: file structure to seek on
  165. * @offset: file offset to seek to
  166. * @whence: type of seek
  167. *
  168. * This is an implementation of ->llseek useable for the rare special case when
  169. * userspace expects the seek to succeed but the (device) file is actually not
  170. * able to perform the seek. In this case you use noop_llseek() instead of
  171. * falling back to the default implementation of ->llseek.
  172. */
  173. loff_t noop_llseek(struct file *file, loff_t offset, int whence)
  174. {
  175. return file->f_pos;
  176. }
  177. EXPORT_SYMBOL(noop_llseek);
  178. loff_t no_llseek(struct file *file, loff_t offset, int whence)
  179. {
  180. return -ESPIPE;
  181. }
  182. EXPORT_SYMBOL(no_llseek);
  183. loff_t default_llseek(struct file *file, loff_t offset, int whence)
  184. {
  185. struct inode *inode = file_inode(file);
  186. loff_t retval;
  187. mutex_lock(&inode->i_mutex);
  188. switch (whence) {
  189. case SEEK_END:
  190. offset += i_size_read(inode);
  191. break;
  192. case SEEK_CUR:
  193. if (offset == 0) {
  194. retval = file->f_pos;
  195. goto out;
  196. }
  197. offset += file->f_pos;
  198. break;
  199. case SEEK_DATA:
  200. /*
  201. * In the generic case the entire file is data, so as
  202. * long as offset isn't at the end of the file then the
  203. * offset is data.
  204. */
  205. if (offset >= inode->i_size) {
  206. retval = -ENXIO;
  207. goto out;
  208. }
  209. break;
  210. case SEEK_HOLE:
  211. /*
  212. * There is a virtual hole at the end of the file, so
  213. * as long as offset isn't i_size or larger, return
  214. * i_size.
  215. */
  216. if (offset >= inode->i_size) {
  217. retval = -ENXIO;
  218. goto out;
  219. }
  220. offset = inode->i_size;
  221. break;
  222. }
  223. retval = -EINVAL;
  224. if (offset >= 0 || unsigned_offsets(file)) {
  225. if (offset != file->f_pos) {
  226. file->f_pos = offset;
  227. file->f_version = 0;
  228. }
  229. retval = offset;
  230. }
  231. out:
  232. mutex_unlock(&inode->i_mutex);
  233. return retval;
  234. }
  235. EXPORT_SYMBOL(default_llseek);
  236. loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
  237. {
  238. loff_t (*fn)(struct file *, loff_t, int);
  239. fn = no_llseek;
  240. if (file->f_mode & FMODE_LSEEK) {
  241. if (file->f_op && file->f_op->llseek)
  242. fn = file->f_op->llseek;
  243. }
  244. return fn(file, offset, whence);
  245. }
  246. EXPORT_SYMBOL(vfs_llseek);
  247. SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
  248. {
  249. off_t retval;
  250. struct fd f = fdget(fd);
  251. if (!f.file)
  252. return -EBADF;
  253. retval = -EINVAL;
  254. if (whence <= SEEK_MAX) {
  255. loff_t res = vfs_llseek(f.file, offset, whence);
  256. retval = res;
  257. if (res != (loff_t)retval)
  258. retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
  259. }
  260. fdput(f);
  261. return retval;
  262. }
  #ifdef CONFIG_COMPAT
  /*
   * Compat entry point for lseek: the compat_off_t offset is passed on to
   * sys_lseek() and its result is returned unchanged.
   */
  COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset,
                         unsigned int, whence)
  {
          return sys_lseek(fd, offset, whence);
  }
  #endif
  #ifdef __ARCH_WANT_SYS_LLSEEK
  /*
   * llseek: seek using an offset supplied as two halves and report the new
   * position through the user pointer @result.
   *
   * The target offset is (@offset_high << 32) | @offset_low.  Returns 0 on
   * success, -EBADF for an invalid descriptor, -EINVAL when @whence exceeds
   * SEEK_MAX, the (int-truncated) error from vfs_llseek(), or -EFAULT when
   * the position cannot be copied to @result.
   */
  SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
                  unsigned long, offset_low, loff_t __user *, result,
                  unsigned int, whence)
  {
          struct fd f = fdget(fd);
          loff_t target, newpos;
          int retval;

          if (!f.file)
                  return -EBADF;

          if (whence > SEEK_MAX) {
                  retval = -EINVAL;
                  goto out_putf;
          }

          target = ((loff_t) offset_high << 32) | offset_low;
          newpos = vfs_llseek(f.file, target, whence);
          if (newpos < 0) {
                  retval = (int)newpos;
                  goto out_putf;
          }

          retval = copy_to_user(result, &newpos, sizeof(newpos)) ? -EFAULT : 0;
  out_putf:
          fdput(f);
          return retval;
  }
  #endif
  295. /*
  296. * rw_verify_area doesn't like huge counts. We limit
  297. * them to something that fits in "int" so that others
  298. * won't have to do range checks all the time.
  299. */
  300. int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
  301. {
  302. struct inode *inode;
  303. loff_t pos;
  304. int retval = -EINVAL;
  305. inode = file_inode(file);
  306. if (unlikely((ssize_t) count < 0))
  307. return retval;
  308. pos = *ppos;
  309. if (unlikely(pos < 0)) {
  310. if (!unsigned_offsets(file))
  311. return retval;
  312. if (count >= -pos) /* both values are in 0..LLONG_MAX */
  313. return -EOVERFLOW;
  314. } else if (unlikely((loff_t) (pos + count) < 0)) {
  315. if (!unsigned_offsets(file))
  316. return retval;
  317. }
  318. if (unlikely(inode->i_flock && mandatory_lock(inode))) {
  319. retval = locks_mandatory_area(
  320. read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
  321. inode, file, pos, count);
  322. if (retval < 0)
  323. return retval;
  324. }
  325. retval = security_file_permission(file,
  326. read_write == READ ? MAY_READ : MAY_WRITE);
  327. if (retval)
  328. return retval;
  329. return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
  330. }
  331. ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
  332. {
  333. struct iovec iov = { .iov_base = buf, .iov_len = len };
  334. struct kiocb kiocb;
  335. ssize_t ret;
  336. init_sync_kiocb(&kiocb, filp);
  337. kiocb.ki_pos = *ppos;
  338. kiocb.ki_left = len;
  339. kiocb.ki_nbytes = len;
  340. ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
  341. if (-EIOCBQUEUED == ret)
  342. ret = wait_on_sync_kiocb(&kiocb);
  343. *ppos = kiocb.ki_pos;
  344. return ret;
  345. }
  346. EXPORT_SYMBOL(do_sync_read);
  347. ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
  348. {
  349. ssize_t ret;
  350. if (!(file->f_mode & FMODE_READ))
  351. return -EBADF;
  352. if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
  353. return -EINVAL;
  354. if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
  355. return -EFAULT;
  356. ret = rw_verify_area(READ, file, pos, count);
  357. if (ret >= 0) {
  358. count = ret;
  359. if (file->f_op->read)
  360. ret = file->f_op->read(file, buf, count, pos);
  361. else
  362. ret = do_sync_read(file, buf, count, pos);
  363. if (ret > 0) {
  364. fsnotify_access(file);
  365. add_rchar(current, ret);
  366. }
  367. inc_syscr(current);
  368. }
  369. return ret;
  370. }
  371. EXPORT_SYMBOL(vfs_read);
  372. ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
  373. {
  374. struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
  375. struct kiocb kiocb;
  376. ssize_t ret;
  377. init_sync_kiocb(&kiocb, filp);
  378. kiocb.ki_pos = *ppos;
  379. kiocb.ki_left = len;
  380. kiocb.ki_nbytes = len;
  381. ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
  382. if (-EIOCBQUEUED == ret)
  383. ret = wait_on_sync_kiocb(&kiocb);
  384. *ppos = kiocb.ki_pos;
  385. return ret;
  386. }
  387. EXPORT_SYMBOL(do_sync_write);
  388. ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
  389. {
  390. mm_segment_t old_fs;
  391. const char __user *p;
  392. ssize_t ret;
  393. if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
  394. return -EINVAL;
  395. old_fs = get_fs();
  396. set_fs(get_ds());
  397. p = (__force const char __user *)buf;
  398. if (count > MAX_RW_COUNT)
  399. count = MAX_RW_COUNT;
  400. if (file->f_op->write)
  401. ret = file->f_op->write(file, p, count, pos);
  402. else
  403. ret = do_sync_write(file, p, count, pos);
  404. set_fs(old_fs);
  405. if (ret > 0) {
  406. fsnotify_modify(file);
  407. add_wchar(current, ret);
  408. }
  409. inc_syscw(current);
  410. return ret;
  411. }
  412. ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
  413. {
  414. ssize_t ret;
  415. if (!(file->f_mode & FMODE_WRITE))
  416. return -EBADF;
  417. if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
  418. return -EINVAL;
  419. if (unlikely(!access_ok(VERIFY_READ, buf, count)))
  420. return -EFAULT;
  421. ret = rw_verify_area(WRITE, file, pos, count);
  422. if (ret >= 0) {
  423. count = ret;
  424. file_start_write(file);
  425. if (file->f_op->write)
  426. ret = file->f_op->write(file, buf, count, pos);
  427. else
  428. ret = do_sync_write(file, buf, count, pos);
  429. if (ret > 0) {
  430. fsnotify_modify(file);
  431. add_wchar(current, ret);
  432. }
  433. inc_syscw(current);
  434. file_end_write(file);
  435. }
  436. return ret;
  437. }
  438. EXPORT_SYMBOL(vfs_write);
  439. static inline loff_t file_pos_read(struct file *file)
  440. {
  441. return file->f_pos;
  442. }
  443. static inline void file_pos_write(struct file *file, loff_t pos)
  444. {
  445. file->f_pos = pos;
  446. }
  447. SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
  448. {
  449. struct fd f = fdget(fd);
  450. ssize_t ret = -EBADF;
  451. if (f.file) {
  452. loff_t pos = file_pos_read(f.file);
  453. ret = vfs_read(f.file, buf, count, &pos);
  454. if (ret >= 0)
  455. file_pos_write(f.file, pos);
  456. fdput(f);
  457. }
  458. return ret;
  459. }
  460. SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
  461. size_t, count)
  462. {
  463. struct fd f = fdget(fd);
  464. ssize_t ret = -EBADF;
  465. if (f.file) {
  466. loff_t pos = file_pos_read(f.file);
  467. ret = vfs_write(f.file, buf, count, &pos);
  468. if (ret >= 0)
  469. file_pos_write(f.file, pos);
  470. fdput(f);
  471. }
  472. return ret;
  473. }
  474. SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
  475. size_t, count, loff_t, pos)
  476. {
  477. struct fd f;
  478. ssize_t ret = -EBADF;
  479. if (pos < 0)
  480. return -EINVAL;
  481. f = fdget(fd);
  482. if (f.file) {
  483. ret = -ESPIPE;
  484. if (f.file->f_mode & FMODE_PREAD)
  485. ret = vfs_read(f.file, buf, count, &pos);
  486. fdput(f);
  487. }
  488. return ret;
  489. }
  490. SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
  491. size_t, count, loff_t, pos)
  492. {
  493. struct fd f;
  494. ssize_t ret = -EBADF;
  495. if (pos < 0)
  496. return -EINVAL;
  497. f = fdget(fd);
  498. if (f.file) {
  499. ret = -ESPIPE;
  500. if (f.file->f_mode & FMODE_PWRITE)
  501. ret = vfs_write(f.file, buf, count, &pos);
  502. fdput(f);
  503. }
  504. return ret;
  505. }
  /*
   * Reduce an iovec's length in-place. Return the resulting number of segments
   *
   * @iov:      array of @nr_segs segments, modified in place
   * @nr_segs:  number of segments in @iov
   * @to:       total number of bytes the vector should be cut down to
   *
   * Walks the segments in order and trims the segment in which the running
   * total reaches @to; segments after it are left untouched but are no
   * longer counted.  When the whole vector is shorter than @to nothing is
   * modified and @nr_segs is returned.
   *
   * The running total "len" is always below @to while the loop continues,
   * so the remaining budget "to - len" cannot underflow.  Comparing the
   * segment length against that budget (rather than testing
   * "len + iov_len >= to") keeps the check correct even when a segment
   * length is so large that the sum would wrap around size_t.
   */
  unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
  {
          unsigned long seg = 0;
          size_t len = 0;

          while (seg < nr_segs) {
                  size_t remaining = to - len;

                  seg++;
                  if (iov->iov_len >= remaining) {
                          iov->iov_len = remaining;
                          break;
                  }
                  len += iov->iov_len;
                  iov++;
          }
          return seg;
  }
  524. EXPORT_SYMBOL(iov_shorten);
  525. static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
  526. unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
  527. {
  528. struct kiocb kiocb;
  529. ssize_t ret;
  530. init_sync_kiocb(&kiocb, filp);
  531. kiocb.ki_pos = *ppos;
  532. kiocb.ki_left = len;
  533. kiocb.ki_nbytes = len;
  534. ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
  535. if (ret == -EIOCBQUEUED)
  536. ret = wait_on_sync_kiocb(&kiocb);
  537. *ppos = kiocb.ki_pos;
  538. return ret;
  539. }
  540. /* Do it by hand, with file-ops */
  541. static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
  542. unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
  543. {
  544. struct iovec *vector = iov;
  545. ssize_t ret = 0;
  546. while (nr_segs > 0) {
  547. void __user *base;
  548. size_t len;
  549. ssize_t nr;
  550. base = vector->iov_base;
  551. len = vector->iov_len;
  552. vector++;
  553. nr_segs--;
  554. nr = fn(filp, base, len, ppos);
  555. if (nr < 0) {
  556. if (!ret)
  557. ret = nr;
  558. break;
  559. }
  560. ret += nr;
  561. if (nr != len)
  562. break;
  563. }
  564. return ret;
  565. }
  /*
   * A write operation does a read from user space and vice versa, so the
   * access_ok() direction is the opposite of the I/O direction.
   */
  #define vrfy_dir(type) ((type) != READ ? VERIFY_READ : VERIFY_WRITE)
  568. ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
  569. unsigned long nr_segs, unsigned long fast_segs,
  570. struct iovec *fast_pointer,
  571. struct iovec **ret_pointer)
  572. {
  573. unsigned long seg;
  574. ssize_t ret;
  575. struct iovec *iov = fast_pointer;
  576. /*
  577. * SuS says "The readv() function *may* fail if the iovcnt argument
  578. * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
  579. * traditionally returned zero for zero segments, so...
  580. */
  581. if (nr_segs == 0) {
  582. ret = 0;
  583. goto out;
  584. }
  585. /*
  586. * First get the "struct iovec" from user memory and
  587. * verify all the pointers
  588. */
  589. if (nr_segs > UIO_MAXIOV) {
  590. ret = -EINVAL;
  591. goto out;
  592. }
  593. if (nr_segs > fast_segs) {
  594. iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
  595. if (iov == NULL) {
  596. ret = -ENOMEM;
  597. goto out;
  598. }
  599. }
  600. if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
  601. ret = -EFAULT;
  602. goto out;
  603. }
  604. /*
  605. * According to the Single Unix Specification we should return EINVAL
  606. * if an element length is < 0 when cast to ssize_t or if the
  607. * total length would overflow the ssize_t return value of the
  608. * system call.
  609. *
  610. * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
  611. * overflow case.
  612. */
  613. ret = 0;
  614. for (seg = 0; seg < nr_segs; seg++) {
  615. void __user *buf = iov[seg].iov_base;
  616. ssize_t len = (ssize_t)iov[seg].iov_len;
  617. /* see if we we're about to use an invalid len or if
  618. * it's about to overflow ssize_t */
  619. if (len < 0) {
  620. ret = -EINVAL;
  621. goto out;
  622. }
  623. if (type >= 0
  624. && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
  625. ret = -EFAULT;
  626. goto out;
  627. }
  628. if (len > MAX_RW_COUNT - ret) {
  629. len = MAX_RW_COUNT - ret;
  630. iov[seg].iov_len = len;
  631. }
  632. ret += len;
  633. }
  634. out:
  635. *ret_pointer = iov;
  636. return ret;
  637. }
  638. static ssize_t do_readv_writev(int type, struct file *file,
  639. const struct iovec __user * uvector,
  640. unsigned long nr_segs, loff_t *pos)
  641. {
  642. size_t tot_len;
  643. struct iovec iovstack[UIO_FASTIOV];
  644. struct iovec *iov = iovstack;
  645. ssize_t ret;
  646. io_fn_t fn;
  647. iov_fn_t fnv;
  648. if (!file->f_op) {
  649. ret = -EINVAL;
  650. goto out;
  651. }
  652. ret = rw_copy_check_uvector(type, uvector, nr_segs,
  653. ARRAY_SIZE(iovstack), iovstack, &iov);
  654. if (ret <= 0)
  655. goto out;
  656. tot_len = ret;
  657. ret = rw_verify_area(type, file, pos, tot_len);
  658. if (ret < 0)
  659. goto out;
  660. fnv = NULL;
  661. if (type == READ) {
  662. fn = file->f_op->read;
  663. fnv = file->f_op->aio_read;
  664. } else {
  665. fn = (io_fn_t)file->f_op->write;
  666. fnv = file->f_op->aio_write;
  667. file_start_write(file);
  668. }
  669. if (fnv)
  670. ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
  671. pos, fnv);
  672. else
  673. ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
  674. if (type != READ)
  675. file_end_write(file);
  676. out:
  677. if (iov != iovstack)
  678. kfree(iov);
  679. if ((ret + (type == READ)) > 0) {
  680. if (type == READ)
  681. fsnotify_access(file);
  682. else
  683. fsnotify_modify(file);
  684. }
  685. return ret;
  686. }
  687. ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
  688. unsigned long vlen, loff_t *pos)
  689. {
  690. if (!(file->f_mode & FMODE_READ))
  691. return -EBADF;
  692. if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
  693. return -EINVAL;
  694. return do_readv_writev(READ, file, vec, vlen, pos);
  695. }
  696. EXPORT_SYMBOL(vfs_readv);
  697. ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
  698. unsigned long vlen, loff_t *pos)
  699. {
  700. if (!(file->f_mode & FMODE_WRITE))
  701. return -EBADF;
  702. if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
  703. return -EINVAL;
  704. return do_readv_writev(WRITE, file, vec, vlen, pos);
  705. }
  706. EXPORT_SYMBOL(vfs_writev);
  707. SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
  708. unsigned long, vlen)
  709. {
  710. struct fd f = fdget(fd);
  711. ssize_t ret = -EBADF;
  712. if (f.file) {
  713. loff_t pos = file_pos_read(f.file);
  714. ret = vfs_readv(f.file, vec, vlen, &pos);
  715. if (ret >= 0)
  716. file_pos_write(f.file, pos);
  717. fdput(f);
  718. }
  719. if (ret > 0)
  720. add_rchar(current, ret);
  721. inc_syscr(current);
  722. return ret;
  723. }
  724. SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
  725. unsigned long, vlen)
  726. {
  727. struct fd f = fdget(fd);
  728. ssize_t ret = -EBADF;
  729. if (f.file) {
  730. loff_t pos = file_pos_read(f.file);
  731. ret = vfs_writev(f.file, vec, vlen, &pos);
  732. if (ret >= 0)
  733. file_pos_write(f.file, pos);
  734. fdput(f);
  735. }
  736. if (ret > 0)
  737. add_wchar(current, ret);
  738. inc_syscw(current);
  739. return ret;
  740. }
  741. static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
  742. {
  743. #define HALF_LONG_BITS (BITS_PER_LONG / 2)
  744. return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
  745. }
  746. SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
  747. unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
  748. {
  749. loff_t pos = pos_from_hilo(pos_h, pos_l);
  750. struct fd f;
  751. ssize_t ret = -EBADF;
  752. if (pos < 0)
  753. return -EINVAL;
  754. f = fdget(fd);
  755. if (f.file) {
  756. ret = -ESPIPE;
  757. if (f.file->f_mode & FMODE_PREAD)
  758. ret = vfs_readv(f.file, vec, vlen, &pos);
  759. fdput(f);
  760. }
  761. if (ret > 0)
  762. add_rchar(current, ret);
  763. inc_syscr(current);
  764. return ret;
  765. }
  766. SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
  767. unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
  768. {
  769. loff_t pos = pos_from_hilo(pos_h, pos_l);
  770. struct fd f;
  771. ssize_t ret = -EBADF;
  772. if (pos < 0)
  773. return -EINVAL;
  774. f = fdget(fd);
  775. if (f.file) {
  776. ret = -ESPIPE;
  777. if (f.file->f_mode & FMODE_PWRITE)
  778. ret = vfs_writev(f.file, vec, vlen, &pos);
  779. fdput(f);
  780. }
  781. if (ret > 0)
  782. add_wchar(current, ret);
  783. inc_syscw(current);
  784. return ret;
  785. }
  786. #ifdef CONFIG_COMPAT
  787. static ssize_t compat_do_readv_writev(int type, struct file *file,
  788. const struct compat_iovec __user *uvector,
  789. unsigned long nr_segs, loff_t *pos)
  790. {
  791. compat_ssize_t tot_len;
  792. struct iovec iovstack[UIO_FASTIOV];
  793. struct iovec *iov = iovstack;
  794. ssize_t ret;
  795. io_fn_t fn;
  796. iov_fn_t fnv;
  797. ret = -EINVAL;
  798. if (!file->f_op)
  799. goto out;
  800. ret = -EFAULT;
  801. if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
  802. goto out;
  803. ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
  804. UIO_FASTIOV, iovstack, &iov);
  805. if (ret <= 0)
  806. goto out;
  807. tot_len = ret;
  808. ret = rw_verify_area(type, file, pos, tot_len);
  809. if (ret < 0)
  810. goto out;
  811. fnv = NULL;
  812. if (type == READ) {
  813. fn = file->f_op->read;
  814. fnv = file->f_op->aio_read;
  815. } else {
  816. fn = (io_fn_t)file->f_op->write;
  817. fnv = file->f_op->aio_write;
  818. file_start_write(file);
  819. }
  820. if (fnv)
  821. ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
  822. pos, fnv);
  823. else
  824. ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
  825. if (type != READ)
  826. file_end_write(file);
  827. out:
  828. if (iov != iovstack)
  829. kfree(iov);
  830. if ((ret + (type == READ)) > 0) {
  831. if (type == READ)
  832. fsnotify_access(file);
  833. else
  834. fsnotify_modify(file);
  835. }
  836. return ret;
  837. }
  838. static size_t compat_readv(struct file *file,
  839. const struct compat_iovec __user *vec,
  840. unsigned long vlen, loff_t *pos)
  841. {
  842. ssize_t ret = -EBADF;
  843. if (!(file->f_mode & FMODE_READ))
  844. goto out;
  845. ret = -EINVAL;
  846. if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
  847. goto out;
  848. ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
  849. out:
  850. if (ret > 0)
  851. add_rchar(current, ret);
  852. inc_syscr(current);
  853. return ret;
  854. }
  855. COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd,
  856. const struct compat_iovec __user *,vec,
  857. unsigned long, vlen)
  858. {
  859. struct fd f = fdget(fd);
  860. ssize_t ret;
  861. loff_t pos;
  862. if (!f.file)
  863. return -EBADF;
  864. pos = f.file->f_pos;
  865. ret = compat_readv(f.file, vec, vlen, &pos);
  866. if (ret >= 0)
  867. f.file->f_pos = pos;
  868. fdput(f);
  869. return ret;
  870. }
  871. COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
  872. const struct compat_iovec __user *,vec,
  873. unsigned long, vlen, loff_t, pos)
  874. {
  875. struct fd f;
  876. ssize_t ret;
  877. if (pos < 0)
  878. return -EINVAL;
  879. f = fdget(fd);
  880. if (!f.file)
  881. return -EBADF;
  882. ret = -ESPIPE;
  883. if (f.file->f_mode & FMODE_PREAD)
  884. ret = compat_readv(f.file, vec, vlen, &pos);
  885. fdput(f);
  886. return ret;
  887. }
  888. COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd,
  889. const struct compat_iovec __user *,vec,
  890. unsigned long, vlen, u32, pos_low, u32, pos_high)
  891. {
  892. loff_t pos = ((loff_t)pos_high << 32) | pos_low;
  893. return compat_sys_preadv64(fd, vec, vlen, pos);
  894. }
  895. static size_t compat_writev(struct file *file,
  896. const struct compat_iovec __user *vec,
  897. unsigned long vlen, loff_t *pos)
  898. {
  899. ssize_t ret = -EBADF;
  900. if (!(file->f_mode & FMODE_WRITE))
  901. goto out;
  902. ret = -EINVAL;
  903. if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
  904. goto out;
  905. ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
  906. out:
  907. if (ret > 0)
  908. add_wchar(current, ret);
  909. inc_syscw(current);
  910. return ret;
  911. }
  912. COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd,
  913. const struct compat_iovec __user *, vec,
  914. unsigned long, vlen)
  915. {
  916. struct fd f = fdget(fd);
  917. ssize_t ret;
  918. loff_t pos;
  919. if (!f.file)
  920. return -EBADF;
  921. pos = f.file->f_pos;
  922. ret = compat_writev(f.file, vec, vlen, &pos);
  923. if (ret >= 0)
  924. f.file->f_pos = pos;
  925. fdput(f);
  926. return ret;
  927. }
  928. COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
  929. const struct compat_iovec __user *,vec,
  930. unsigned long, vlen, loff_t, pos)
  931. {
  932. struct fd f;
  933. ssize_t ret;
  934. if (pos < 0)
  935. return -EINVAL;
  936. f = fdget(fd);
  937. if (!f.file)
  938. return -EBADF;
  939. ret = -ESPIPE;
  940. if (f.file->f_mode & FMODE_PWRITE)
  941. ret = compat_writev(f.file, vec, vlen, &pos);
  942. fdput(f);
  943. return ret;
  944. }
  945. COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd,
  946. const struct compat_iovec __user *,vec,
  947. unsigned long, vlen, u32, pos_low, u32, pos_high)
  948. {
  949. loff_t pos = ((loff_t)pos_high << 32) | pos_low;
  950. return compat_sys_pwritev64(fd, vec, vlen, pos);
  951. }
  952. #endif
  953. static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
  954. size_t count, loff_t max)
  955. {
  956. struct fd in, out;
  957. struct inode *in_inode, *out_inode;
  958. loff_t pos;
  959. loff_t out_pos;
  960. ssize_t retval;
  961. int fl;
  962. /*
  963. * Get input file, and verify that it is ok..
  964. */
  965. retval = -EBADF;
  966. in = fdget(in_fd);
  967. if (!in.file)
  968. goto out;
  969. if (!(in.file->f_mode & FMODE_READ))
  970. goto fput_in;
  971. retval = -ESPIPE;
  972. if (!ppos) {
  973. pos = in.file->f_pos;
  974. } else {
  975. pos = *ppos;
  976. if (!(in.file->f_mode & FMODE_PREAD))
  977. goto fput_in;
  978. }
  979. retval = rw_verify_area(READ, in.file, &pos, count);
  980. if (retval < 0)
  981. goto fput_in;
  982. count = retval;
  983. /*
  984. * Get output file, and verify that it is ok..
  985. */
  986. retval = -EBADF;
  987. out = fdget(out_fd);
  988. if (!out.file)
  989. goto fput_in;
  990. if (!(out.file->f_mode & FMODE_WRITE))
  991. goto fput_out;
  992. retval = -EINVAL;
  993. in_inode = file_inode(in.file);
  994. out_inode = file_inode(out.file);
  995. out_pos = out.file->f_pos;
  996. retval = rw_verify_area(WRITE, out.file, &out_pos, count);
  997. if (retval < 0)
  998. goto fput_out;
  999. count = retval;
  1000. if (!max)
  1001. max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
  1002. if (unlikely(pos + count > max)) {
  1003. retval = -EOVERFLOW;
  1004. if (pos >= max)
  1005. goto fput_out;
  1006. count = max - pos;
  1007. }
  1008. fl = 0;
  1009. #if 0
  1010. /*
  1011. * We need to debate whether we can enable this or not. The
  1012. * man page documents EAGAIN return for the output at least,
  1013. * and the application is arguably buggy if it doesn't expect
  1014. * EAGAIN on a non-blocking file descriptor.
  1015. */
  1016. if (in.file->f_flags & O_NONBLOCK)
  1017. fl = SPLICE_F_NONBLOCK;
  1018. #endif
  1019. file_start_write(out.file);
  1020. retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
  1021. file_end_write(out.file);
  1022. if (retval > 0) {
  1023. add_rchar(current, retval);
  1024. add_wchar(current, retval);
  1025. fsnotify_access(in.file);
  1026. fsnotify_modify(out.file);
  1027. out.file->f_pos = out_pos;
  1028. if (ppos)
  1029. *ppos = pos;
  1030. else
  1031. in.file->f_pos = pos;
  1032. }
  1033. inc_syscr(current);
  1034. inc_syscw(current);
  1035. if (pos > max)
  1036. retval = -EOVERFLOW;
  1037. fput_out:
  1038. fdput(out);
  1039. fput_in:
  1040. fdput(in);
  1041. out:
  1042. return retval;
  1043. }
  1044. SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
  1045. {
  1046. loff_t pos;
  1047. off_t off;
  1048. ssize_t ret;
  1049. if (offset) {
  1050. if (unlikely(get_user(off, offset)))
  1051. return -EFAULT;
  1052. pos = off;
  1053. ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
  1054. if (unlikely(put_user(pos, offset)))
  1055. return -EFAULT;
  1056. return ret;
  1057. }
  1058. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1059. }
  1060. SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
  1061. {
  1062. loff_t pos;
  1063. ssize_t ret;
  1064. if (offset) {
  1065. if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
  1066. return -EFAULT;
  1067. ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
  1068. if (unlikely(put_user(pos, offset)))
  1069. return -EFAULT;
  1070. return ret;
  1071. }
  1072. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1073. }
  1074. #ifdef CONFIG_COMPAT
  1075. COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
  1076. compat_off_t __user *, offset, compat_size_t, count)
  1077. {
  1078. loff_t pos;
  1079. off_t off;
  1080. ssize_t ret;
  1081. if (offset) {
  1082. if (unlikely(get_user(off, offset)))
  1083. return -EFAULT;
  1084. pos = off;
  1085. ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
  1086. if (unlikely(put_user(pos, offset)))
  1087. return -EFAULT;
  1088. return ret;
  1089. }
  1090. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1091. }
  1092. COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
  1093. compat_loff_t __user *, offset, compat_size_t, count)
  1094. {
  1095. loff_t pos;
  1096. ssize_t ret;
  1097. if (offset) {
  1098. if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
  1099. return -EFAULT;
  1100. ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
  1101. if (unlikely(put_user(pos, offset)))
  1102. return -EFAULT;
  1103. return ret;
  1104. }
  1105. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1106. }
  1107. #endif