/*
 *  linux/fs/read_write.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include "read_write.h"

#include <asm/uaccess.h>
#include <asm/unistd.h>
/*
 * Default file operations for read-only regular files: generic seeking,
 * synchronous reads emulated over ->aio_read, read-only mmap and splice
 * reads.  No write methods are set, so writes fail at the VFS layer.
 */
const struct file_operations generic_ro_fops = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.aio_read	= generic_file_aio_read,
	.mmap		= generic_file_readonly_mmap,
	.splice_read	= generic_file_splice_read,
};

EXPORT_SYMBOL(generic_ro_fops);
/*
 * Non-zero when this file treats its offset as unsigned, i.e. positions
 * that look negative as a signed loff_t are still legal (FMODE_UNSIGNED_OFFSET).
 */
static inline int unsigned_offsets(struct file *file)
{
	return file->f_mode & FMODE_UNSIGNED_OFFSET;
}
  33. static loff_t lseek_execute(struct file *file, struct inode *inode,
  34. loff_t offset, loff_t maxsize)
  35. {
  36. if (offset < 0 && !unsigned_offsets(file))
  37. return -EINVAL;
  38. if (offset > maxsize)
  39. return -EINVAL;
  40. if (offset != file->f_pos) {
  41. file->f_pos = offset;
  42. file->f_version = 0;
  43. }
  44. return offset;
  45. }
/**
 * generic_file_llseek_size - generic llseek implementation for regular files
 * @file:	file structure to seek on
 * @offset:	file offset to seek to
 * @whence:	type of seek
 * @maxsize:	max size of this file in file system
 * @eof:	offset used for SEEK_END position
 *
 * This is a variant of generic_file_llseek that allows passing in a custom
 * maximum file size and a custom EOF position, for e.g. hashed directories
 *
 * Synchronization:
 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
 * read/writes behave like SEEK_SET against seeks.
 */
loff_t
generic_file_llseek_size(struct file *file, loff_t offset, int whence,
		loff_t maxsize, loff_t eof)
{
	struct inode *inode = file->f_mapping->host;

	switch (whence) {
	case SEEK_END:
		/* Seek relative to the caller-supplied end-of-file. */
		offset += eof;
		break;
	case SEEK_CUR:
		/*
		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
		 * position-querying operation.  Avoid rewriting the "same"
		 * f_pos value back to the file because a concurrent read(),
		 * write() or lseek() might have altered it
		 */
		if (offset == 0)
			return file->f_pos;
		/*
		 * f_lock protects against read/modify/write race with other
		 * SEEK_CURs. Note that parallel writes and reads behave
		 * like SEEK_SET.
		 */
		spin_lock(&file->f_lock);
		offset = lseek_execute(file, inode, file->f_pos + offset,
				       maxsize);
		spin_unlock(&file->f_lock);
		return offset;
	case SEEK_DATA:
		/*
		 * In the generic case the entire file is data, so as long as
		 * offset isn't at the end of the file then the offset is data.
		 */
		if (offset >= eof)
			return -ENXIO;
		break;
	case SEEK_HOLE:
		/*
		 * There is a virtual hole at the end of the file, so as long as
		 * offset isn't i_size or larger, return i_size.
		 */
		if (offset >= eof)
			return -ENXIO;
		offset = eof;
		break;
	}

	/* Range-check the final offset and commit it to f_pos. */
	return lseek_execute(file, inode, offset, maxsize);
}
EXPORT_SYMBOL(generic_file_llseek_size);
  111. /**
  112. * generic_file_llseek - generic llseek implementation for regular files
  113. * @file: file structure to seek on
  114. * @offset: file offset to seek to
  115. * @whence: type of seek
  116. *
  117. * This is a generic implemenation of ->llseek useable for all normal local
  118. * filesystems. It just updates the file offset to the value specified by
  119. * @offset and @whence under i_mutex.
  120. */
  121. loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
  122. {
  123. struct inode *inode = file->f_mapping->host;
  124. return generic_file_llseek_size(file, offset, whence,
  125. inode->i_sb->s_maxbytes,
  126. i_size_read(inode));
  127. }
  128. EXPORT_SYMBOL(generic_file_llseek);
/**
 * noop_llseek - No Operation Performed llseek implementation
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 *
 * This is an implementation of ->llseek useable for the rare special case when
 * userspace expects the seek to succeed but the (device) file is actually not
 * able to perform the seek. In this case you use noop_llseek() instead of
 * falling back to the default implementation of ->llseek.
 */
loff_t noop_llseek(struct file *file, loff_t offset, int whence)
{
	/* Report the current position unchanged; never fails. */
	return file->f_pos;
}
EXPORT_SYMBOL(noop_llseek);
/*
 * ->llseek for objects with no seekable position (e.g. set alongside
 * FMODE_LSEEK being clear): always fails with -ESPIPE.
 */
loff_t no_llseek(struct file *file, loff_t offset, int whence)
{
	return -ESPIPE;
}
EXPORT_SYMBOL(no_llseek);
/*
 * Default ->llseek: like the generic variant but fully serialized under
 * i_mutex, including the final f_pos update.
 */
loff_t default_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);
	loff_t retval;

	mutex_lock(&inode->i_mutex);
	switch (whence) {
	case SEEK_END:
		offset += i_size_read(inode);
		break;
	case SEEK_CUR:
		/* lseek(fd, 0, SEEK_CUR) is a pure position query. */
		if (offset == 0) {
			retval = file->f_pos;
			goto out;
		}
		offset += file->f_pos;
		break;
	case SEEK_DATA:
		/*
		 * In the generic case the entire file is data, so as
		 * long as offset isn't at the end of the file then the
		 * offset is data.
		 */
		if (offset >= inode->i_size) {
			retval = -ENXIO;
			goto out;
		}
		break;
	case SEEK_HOLE:
		/*
		 * There is a virtual hole at the end of the file, so
		 * as long as offset isn't i_size or larger, return
		 * i_size.
		 */
		if (offset >= inode->i_size) {
			retval = -ENXIO;
			goto out;
		}
		offset = inode->i_size;
		break;
	}
	retval = -EINVAL;
	/* Negative offsets are only legal on unsigned-offset files. */
	if (offset >= 0 || unsigned_offsets(file)) {
		/* Skip the store (and f_version reset) for no-op seeks. */
		if (offset != file->f_pos) {
			file->f_pos = offset;
			file->f_version = 0;
		}
		retval = offset;
	}
out:
	mutex_unlock(&inode->i_mutex);
	return retval;
}
EXPORT_SYMBOL(default_llseek);
  203. loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
  204. {
  205. loff_t (*fn)(struct file *, loff_t, int);
  206. fn = no_llseek;
  207. if (file->f_mode & FMODE_LSEEK) {
  208. if (file->f_op && file->f_op->llseek)
  209. fn = file->f_op->llseek;
  210. }
  211. return fn(file, offset, whence);
  212. }
  213. EXPORT_SYMBOL(vfs_llseek);
/*
 * lseek(2): reposition the file offset.  Returns the new position as an
 * off_t, -EBADF/-EINVAL on bad fd or whence, or -EOVERFLOW when the
 * 64-bit result does not fit in the syscall's off_t return type.
 */
SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
	off_t retval;
	struct fd f = fdget(fd);

	if (!f.file)
		return -EBADF;

	retval = -EINVAL;
	if (whence <= SEEK_MAX) {
		loff_t res = vfs_llseek(f.file, offset, whence);
		/* Narrow to off_t and detect loss via the round-trip compare. */
		retval = res;
		if (res != (loff_t)retval)
			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
	}
	fdput(f);
	return retval;
}
#ifdef CONFIG_COMPAT
/* 32-bit compat entry point: compat_off_t widens cleanly, so just forward. */
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
	return sys_lseek(fd, offset, whence);
}
#endif
#ifdef __ARCH_WANT_SYS_LLSEEK
/*
 * _llseek(2) for 32-bit ABIs: the 64-bit offset arrives as two long
 * halves and the 64-bit result is written back through @result.
 */
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
		unsigned long, offset_low, loff_t __user *, result,
		unsigned int, whence)
{
	int retval;
	struct fd f = fdget(fd);
	loff_t offset;

	if (!f.file)
		return -EBADF;

	retval = -EINVAL;
	if (whence > SEEK_MAX)
		goto out_putf;

	/* Reassemble the 64-bit offset from its high and low halves. */
	offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
			whence);

	retval = (int)offset;
	if (offset >= 0) {
		/* Seek succeeded: copy the new position out to userspace. */
		retval = -EFAULT;
		if (!copy_to_user(result, &offset, sizeof(offset)))
			retval = 0;
	}
out_putf:
	fdput(f);
	return retval;
}
#endif
/*
 * rw_verify_area doesn't like huge counts. We limit
 * them to something that fits in "int" so that others
 * won't have to do range checks all the time.
 *
 * Returns the (possibly clamped) byte count on success, or a negative
 * errno if the position/count is invalid, mandatory locking forbids the
 * access, or the security hook rejects it.
 */
int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
{
	struct inode *inode;
	loff_t pos;
	int retval = -EINVAL;

	inode = file_inode(file);
	if (unlikely((ssize_t) count < 0))
		return retval;
	pos = *ppos;
	if (unlikely(pos < 0)) {
		/* Negative start positions require FMODE_UNSIGNED_OFFSET. */
		if (!unsigned_offsets(file))
			return retval;
		if (count >= -pos) /* both values are in 0..LLONG_MAX */
			return -EOVERFLOW;
	} else if (unlikely((loff_t) (pos + count) < 0)) {
		/* pos + count wrapped past LLONG_MAX. */
		if (!unsigned_offsets(file))
			return retval;
	}

	/* Honour mandatory locks held on this range, if any. */
	if (unlikely(inode->i_flock && mandatory_lock(inode))) {
		retval = locks_mandatory_area(
			read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
			inode, file, pos, count);
		if (retval < 0)
			return retval;
	}
	retval = security_file_permission(file,
				read_write == READ ? MAY_READ : MAY_WRITE);
	if (retval)
		return retval;
	/* Cap huge requests so callers can treat the count as an int. */
	return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
}
/*
 * Sleep until a retried kiocb is kicked.  If the kick already arrived,
 * consume it instead of sleeping; either way return with the task runnable.
 */
static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
{
	set_current_state(TASK_UNINTERRUPTIBLE);
	if (!kiocbIsKicked(iocb))
		schedule();
	else
		kiocbClearKicked(iocb);
	__set_current_state(TASK_RUNNING);
}
/*
 * Emulate a synchronous read on top of ->aio_read: build a single-segment
 * iovec and a synchronous kiocb, retry on -EIOCBRETRY, and wait if the
 * iocb was queued asynchronously.  Updates *ppos on return.
 */
ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct kiocb kiocb;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;
	kiocb.ki_left = len;
	kiocb.ki_nbytes = len;

	for (;;) {
		ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
		if (ret != -EIOCBRETRY)
			break;
		wait_on_retry_sync_kiocb(&kiocb);
	}

	if (-EIOCBQUEUED == ret)
		ret = wait_on_sync_kiocb(&kiocb);
	*ppos = kiocb.ki_pos;
	return ret;
}
EXPORT_SYMBOL(do_sync_read);
  328. ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
  329. {
  330. ssize_t ret;
  331. if (!(file->f_mode & FMODE_READ))
  332. return -EBADF;
  333. if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
  334. return -EINVAL;
  335. if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
  336. return -EFAULT;
  337. ret = rw_verify_area(READ, file, pos, count);
  338. if (ret >= 0) {
  339. count = ret;
  340. if (file->f_op->read)
  341. ret = file->f_op->read(file, buf, count, pos);
  342. else
  343. ret = do_sync_read(file, buf, count, pos);
  344. if (ret > 0) {
  345. fsnotify_access(file);
  346. add_rchar(current, ret);
  347. }
  348. inc_syscr(current);
  349. }
  350. return ret;
  351. }
  352. EXPORT_SYMBOL(vfs_read);
/*
 * Emulate a synchronous write on top of ->aio_write: build a one-segment
 * iovec and a synchronous kiocb, retry on -EIOCBRETRY, and wait if the
 * iocb was queued asynchronously.  Updates *ppos on return.
 */
ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
	/* Cast away const: the buffer is only read from for a write. */
	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
	struct kiocb kiocb;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;
	kiocb.ki_left = len;
	kiocb.ki_nbytes = len;

	for (;;) {
		ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
		if (ret != -EIOCBRETRY)
			break;
		wait_on_retry_sync_kiocb(&kiocb);
	}

	if (-EIOCBQUEUED == ret)
		ret = wait_on_sync_kiocb(&kiocb);
	*ppos = kiocb.ki_pos;
	return ret;
}
EXPORT_SYMBOL(do_sync_write);
  374. ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
  375. {
  376. ssize_t ret;
  377. if (!(file->f_mode & FMODE_WRITE))
  378. return -EBADF;
  379. if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
  380. return -EINVAL;
  381. if (unlikely(!access_ok(VERIFY_READ, buf, count)))
  382. return -EFAULT;
  383. ret = rw_verify_area(WRITE, file, pos, count);
  384. if (ret >= 0) {
  385. count = ret;
  386. if (file->f_op->write)
  387. ret = file->f_op->write(file, buf, count, pos);
  388. else
  389. ret = do_sync_write(file, buf, count, pos);
  390. if (ret > 0) {
  391. fsnotify_modify(file);
  392. add_wchar(current, ret);
  393. }
  394. inc_syscw(current);
  395. }
  396. return ret;
  397. }
  398. EXPORT_SYMBOL(vfs_write);
/* Snapshot the file's current position (no synchronization is done here). */
static inline loff_t file_pos_read(struct file *file)
{
	return file->f_pos;
}
/* Store an updated position back into the struct file. */
static inline void file_pos_write(struct file *file, loff_t pos)
{
	file->f_pos = pos;
}
  407. SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
  408. {
  409. struct fd f = fdget(fd);
  410. ssize_t ret = -EBADF;
  411. if (f.file) {
  412. loff_t pos = file_pos_read(f.file);
  413. ret = vfs_read(f.file, buf, count, &pos);
  414. file_pos_write(f.file, pos);
  415. fdput(f);
  416. }
  417. return ret;
  418. }
  419. SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
  420. size_t, count)
  421. {
  422. struct fd f = fdget(fd);
  423. ssize_t ret = -EBADF;
  424. if (f.file) {
  425. loff_t pos = file_pos_read(f.file);
  426. ret = vfs_write(f.file, buf, count, &pos);
  427. file_pos_write(f.file, pos);
  428. fdput(f);
  429. }
  430. return ret;
  431. }
  432. SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,
  433. size_t count, loff_t pos)
  434. {
  435. struct fd f;
  436. ssize_t ret = -EBADF;
  437. if (pos < 0)
  438. return -EINVAL;
  439. f = fdget(fd);
  440. if (f.file) {
  441. ret = -ESPIPE;
  442. if (f.file->f_mode & FMODE_PREAD)
  443. ret = vfs_read(f.file, buf, count, &pos);
  444. fdput(f);
  445. }
  446. return ret;
  447. }
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
/*
 * Syscall-wrapper stub: cast the register-sized arguments back to their
 * real types and forward to the pread64 implementation.
 */
asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos)
{
	return SYSC_pread64((unsigned int) fd, (char __user *) buf,
			    (size_t) count, pos);
}
SYSCALL_ALIAS(sys_pread64, SyS_pread64);
#endif
  456. SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
  457. size_t count, loff_t pos)
  458. {
  459. struct fd f;
  460. ssize_t ret = -EBADF;
  461. if (pos < 0)
  462. return -EINVAL;
  463. f = fdget(fd);
  464. if (f.file) {
  465. ret = -ESPIPE;
  466. if (f.file->f_mode & FMODE_PWRITE)
  467. ret = vfs_write(f.file, buf, count, &pos);
  468. fdput(f);
  469. }
  470. return ret;
  471. }
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
/*
 * Syscall-wrapper stub: cast the register-sized arguments back to their
 * real types and forward to the pwrite64 implementation.
 */
asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos)
{
	return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf,
			     (size_t) count, pos);
}
SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64);
#endif
  480. /*
  481. * Reduce an iovec's length in-place. Return the resulting number of segments
  482. */
  483. unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
  484. {
  485. unsigned long seg = 0;
  486. size_t len = 0;
  487. while (seg < nr_segs) {
  488. seg++;
  489. if (len + iov->iov_len >= to) {
  490. iov->iov_len = to - len;
  491. break;
  492. }
  493. len += iov->iov_len;
  494. iov++;
  495. }
  496. return seg;
  497. }
EXPORT_SYMBOL(iov_shorten);
/*
 * Synchronous vectored I/O over an async method @fn (->aio_read or
 * ->aio_write): retries on -EIOCBRETRY, waits for queued iocbs, and
 * updates *ppos from the kiocb on return.
 */
ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
		unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
{
	struct kiocb kiocb;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;
	kiocb.ki_left = len;
	kiocb.ki_nbytes = len;

	for (;;) {
		ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
		if (ret != -EIOCBRETRY)
			break;
		wait_on_retry_sync_kiocb(&kiocb);
	}

	if (ret == -EIOCBQUEUED)
		ret = wait_on_sync_kiocb(&kiocb);
	*ppos = kiocb.ki_pos;
	return ret;
}
  519. /* Do it by hand, with file-ops */
  520. ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
  521. unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
  522. {
  523. struct iovec *vector = iov;
  524. ssize_t ret = 0;
  525. while (nr_segs > 0) {
  526. void __user *base;
  527. size_t len;
  528. ssize_t nr;
  529. base = vector->iov_base;
  530. len = vector->iov_len;
  531. vector++;
  532. nr_segs--;
  533. nr = fn(filp, base, len, ppos);
  534. if (nr < 0) {
  535. if (!ret)
  536. ret = nr;
  537. break;
  538. }
  539. ret += nr;
  540. if (nr != len)
  541. break;
  542. }
  543. return ret;
  544. }
/* A write operation does a read from user space and vice versa */
#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)

/*
 * Copy an iovec array in from user space and validate it.  On success the
 * total byte count (clamped to MAX_RW_COUNT) is returned and *ret_pointer
 * refers to the kernel copy: either @fast_pointer or a kmalloc'ed array,
 * which the caller must kfree when it differs from @fast_pointer.
 */
ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
			      unsigned long nr_segs, unsigned long fast_segs,
			      struct iovec *fast_pointer,
			      struct iovec **ret_pointer)
{
	unsigned long seg;
	ssize_t ret;
	struct iovec *iov = fast_pointer;

	/*
	 * SuS says "The readv() function *may* fail if the iovcnt argument
	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
	 * traditionally returned zero for zero segments, so...
	 */
	if (nr_segs == 0) {
		ret = 0;
		goto out;
	}

	/*
	 * First get the "struct iovec" from user memory and
	 * verify all the pointers
	 */
	if (nr_segs > UIO_MAXIOV) {
		ret = -EINVAL;
		goto out;
	}
	/* Fall back to the heap when the caller's stack array is too small. */
	if (nr_segs > fast_segs) {
		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
		if (iov == NULL) {
			ret = -ENOMEM;
			goto out;
		}
	}
	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
		ret = -EFAULT;
		goto out;
	}

	/*
	 * According to the Single Unix Specification we should return EINVAL
	 * if an element length is < 0 when cast to ssize_t or if the
	 * total length would overflow the ssize_t return value of the
	 * system call.
	 *
	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
	 * overflow case.
	 */
	ret = 0;
	for (seg = 0; seg < nr_segs; seg++) {
		void __user *buf = iov[seg].iov_base;
		ssize_t len = (ssize_t)iov[seg].iov_len;

		/* see if we we're about to use an invalid len or if
		 * it's about to overflow ssize_t */
		if (len < 0) {
			ret = -EINVAL;
			goto out;
		}
		/* Callers may pass a negative type to skip the access check. */
		if (type >= 0
		    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
			ret = -EFAULT;
			goto out;
		}
		/* Clamp the running total so it never exceeds MAX_RW_COUNT. */
		if (len > MAX_RW_COUNT - ret) {
			len = MAX_RW_COUNT - ret;
			iov[seg].iov_len = len;
		}
		ret += len;
	}
out:
	*ret_pointer = iov;
	return ret;
}
/*
 * Common worker for readv/writev: copy in the user iovec, verify the
 * range, then dispatch to the vectored aio method when available or fall
 * back to a per-segment loop.  Sends the fsnotify event on success.
 */
static ssize_t do_readv_writev(int type, struct file *file,
			       const struct iovec __user * uvector,
			       unsigned long nr_segs, loff_t *pos)
{
	size_t tot_len;
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	ssize_t ret;
	io_fn_t fn;
	iov_fn_t fnv;

	if (!file->f_op) {
		ret = -EINVAL;
		goto out;
	}

	/* Copy in and validate the user iovec; may kmalloc a larger array. */
	ret = rw_copy_check_uvector(type, uvector, nr_segs,
				    ARRAY_SIZE(iovstack), iovstack, &iov);
	if (ret <= 0)
		goto out;

	tot_len = ret;
	ret = rw_verify_area(type, file, pos, tot_len);
	if (ret < 0)
		goto out;

	/* Prefer the vectored aio method; otherwise loop over segments. */
	fnv = NULL;
	if (type == READ) {
		fn = file->f_op->read;
		fnv = file->f_op->aio_read;
	} else {
		fn = (io_fn_t)file->f_op->write;
		fnv = file->f_op->aio_write;
	}

	if (fnv)
		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
						pos, fnv);
	else
		ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);

out:
	if (iov != iovstack)
		kfree(iov);
	/*
	 * Notify on success; for READ a zero-byte result also counts,
	 * hence the (type == READ) adjustment before the comparison.
	 */
	if ((ret + (type == READ)) > 0) {
		if (type == READ)
			fsnotify_access(file);
		else
			fsnotify_modify(file);
	}
	return ret;
}
  663. ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
  664. unsigned long vlen, loff_t *pos)
  665. {
  666. if (!(file->f_mode & FMODE_READ))
  667. return -EBADF;
  668. if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
  669. return -EINVAL;
  670. return do_readv_writev(READ, file, vec, vlen, pos);
  671. }
  672. EXPORT_SYMBOL(vfs_readv);
  673. ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
  674. unsigned long vlen, loff_t *pos)
  675. {
  676. if (!(file->f_mode & FMODE_WRITE))
  677. return -EBADF;
  678. if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
  679. return -EINVAL;
  680. return do_readv_writev(WRITE, file, vec, vlen, pos);
  681. }
  682. EXPORT_SYMBOL(vfs_writev);
/*
 * readv(2): vectored read at the current position, advancing f_pos.
 */
SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen)
{
	struct fd f = fdget(fd);
	ssize_t ret = -EBADF;

	if (f.file) {
		loff_t pos = file_pos_read(f.file);
		ret = vfs_readv(f.file, vec, vlen, &pos);
		file_pos_write(f.file, pos);
		fdput(f);
	}

	/* Account I/O stats; the syscall counter bumps even on failure. */
	if (ret > 0)
		add_rchar(current, ret);
	inc_syscr(current);
	return ret;
}
/*
 * writev(2): vectored write at the current position, advancing f_pos.
 */
SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen)
{
	struct fd f = fdget(fd);
	ssize_t ret = -EBADF;

	if (f.file) {
		loff_t pos = file_pos_read(f.file);
		ret = vfs_writev(f.file, vec, vlen, &pos);
		file_pos_write(f.file, pos);
		fdput(f);
	}

	/* Account I/O stats; the syscall counter bumps even on failure. */
	if (ret > 0)
		add_wchar(current, ret);
	inc_syscw(current);
	return ret;
}
/*
 * Combine the high and low halves of a position into a loff_t.  The shift
 * is split in two so that on 64-bit (where "low" already holds the whole
 * value) "high" is discarded without ever shifting by BITS_PER_LONG,
 * which would be undefined behaviour.
 */
static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
{
#define HALF_LONG_BITS (BITS_PER_LONG / 2)
	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
}
/*
 * preadv(2): positional vectored read; f_pos is left untouched.  The
 * 64-bit position arrives split across two long arguments.
 */
SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
	loff_t pos = pos_from_hilo(pos_h, pos_l);
	struct fd f;
	ssize_t ret = -EBADF;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (f.file) {
		ret = -ESPIPE;
		/* Positional I/O requires pread support (e.g. not a pipe). */
		if (f.file->f_mode & FMODE_PREAD)
			ret = vfs_readv(f.file, vec, vlen, &pos);
		fdput(f);
	}

	if (ret > 0)
		add_rchar(current, ret);
	inc_syscr(current);
	return ret;
}
/*
 * pwritev(2): positional vectored write; f_pos is left untouched.  The
 * 64-bit position arrives split across two long arguments.
 */
SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
	loff_t pos = pos_from_hilo(pos_h, pos_l);
	struct fd f;
	ssize_t ret = -EBADF;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (f.file) {
		ret = -ESPIPE;
		/* Positional I/O requires pwrite support (e.g. not a pipe). */
		if (f.file->f_mode & FMODE_PWRITE)
			ret = vfs_writev(f.file, vec, vlen, &pos);
		fdput(f);
	}

	if (ret > 0)
		add_wchar(current, ret);
	inc_syscw(current);
	return ret;
}
/*
 * do_sendfile - in-kernel copy from one fd to another via direct splice
 * @out_fd:	destination descriptor, must be open for writing
 * @in_fd:	source descriptor, must be open for reading
 * @ppos:	input position, or NULL to use (and update) in_fd's f_pos
 * @count:	number of bytes to transfer
 * @max:	upper bound on file offsets; 0 means derive it from both
 *		filesystems' s_maxbytes
 */
ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
		    loff_t max)
{
	struct fd in, out;
	struct inode *in_inode, *out_inode;
	loff_t pos;
	ssize_t retval;
	int fl;

	/*
	 * Get input file, and verify that it is ok..
	 */
	retval = -EBADF;
	in = fdget(in_fd);
	if (!in.file)
		goto out;
	if (!(in.file->f_mode & FMODE_READ))
		goto fput_in;
	retval = -ESPIPE;
	if (!ppos)
		ppos = &in.file->f_pos;
	else
		/* An explicit position requires pread support. */
		if (!(in.file->f_mode & FMODE_PREAD))
			goto fput_in;
	retval = rw_verify_area(READ, in.file, ppos, count);
	if (retval < 0)
		goto fput_in;
	count = retval;

	/*
	 * Get output file, and verify that it is ok..
	 */
	retval = -EBADF;
	out = fdget(out_fd);
	if (!out.file)
		goto fput_in;
	if (!(out.file->f_mode & FMODE_WRITE))
		goto fput_out;
	retval = -EINVAL;
	in_inode = file_inode(in.file);
	out_inode = file_inode(out.file);
	retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count);
	if (retval < 0)
		goto fput_out;
	count = retval;

	if (!max)
		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);

	pos = *ppos;
	/* Clamp the transfer so it cannot start at or extend past @max. */
	if (unlikely(pos + count > max)) {
		retval = -EOVERFLOW;
		if (pos >= max)
			goto fput_out;
		count = max - pos;
	}

	fl = 0;
#if 0
	/*
	 * We need to debate whether we can enable this or not. The
	 * man page documents EAGAIN return for the output at least,
	 * and the application is arguably buggy if it doesn't expect
	 * EAGAIN on a non-blocking file descriptor.
	 */
	if (in.file->f_flags & O_NONBLOCK)
		fl = SPLICE_F_NONBLOCK;
#endif
	retval = do_splice_direct(in.file, ppos, out.file, count, fl);

	if (retval > 0) {
		add_rchar(current, retval);
		add_wchar(current, retval);
		fsnotify_access(in.file);
		fsnotify_modify(out.file);
	}

	inc_syscr(current);
	inc_syscw(current);
	/* The splice may have advanced *ppos beyond @max: report overflow. */
	if (*ppos > max)
		retval = -EOVERFLOW;

fput_out:
	fdput(out);
fput_in:
	fdput(in);
out:
	return retval;
}
  841. SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
  842. {
  843. loff_t pos;
  844. off_t off;
  845. ssize_t ret;
  846. if (offset) {
  847. if (unlikely(get_user(off, offset)))
  848. return -EFAULT;
  849. pos = off;
  850. ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
  851. if (unlikely(put_user(pos, offset)))
  852. return -EFAULT;
  853. return ret;
  854. }
  855. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  856. }
  857. SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
  858. {
  859. loff_t pos;
  860. ssize_t ret;
  861. if (offset) {
  862. if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
  863. return -EFAULT;
  864. ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
  865. if (unlikely(put_user(pos, offset)))
  866. return -EFAULT;
  867. return ret;
  868. }
  869. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  870. }