read_write.c 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225
  1. /*
  2. * linux/fs/read_write.c
  3. *
  4. * Copyright (C) 1991, 1992 Linus Torvalds
  5. */
  6. #include <linux/slab.h>
  7. #include <linux/stat.h>
  8. #include <linux/fcntl.h>
  9. #include <linux/file.h>
  10. #include <linux/uio.h>
  11. #include <linux/aio.h>
  12. #include <linux/fsnotify.h>
  13. #include <linux/security.h>
  14. #include <linux/export.h>
  15. #include <linux/syscalls.h>
  16. #include <linux/pagemap.h>
  17. #include <linux/splice.h>
  18. #include <linux/compat.h>
  19. #include "internal.h"
  20. #include <asm/uaccess.h>
  21. #include <asm/unistd.h>
  22. typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
  23. typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
  24. unsigned long, loff_t);
  25. const struct file_operations generic_ro_fops = {
  26. .llseek = generic_file_llseek,
  27. .read = do_sync_read,
  28. .aio_read = generic_file_aio_read,
  29. .mmap = generic_file_readonly_mmap,
  30. .splice_read = generic_file_splice_read,
  31. };
  32. EXPORT_SYMBOL(generic_ro_fops);
  33. static inline int unsigned_offsets(struct file *file)
  34. {
  35. return file->f_mode & FMODE_UNSIGNED_OFFSET;
  36. }
  37. static loff_t lseek_execute(struct file *file, struct inode *inode,
  38. loff_t offset, loff_t maxsize)
  39. {
  40. if (offset < 0 && !unsigned_offsets(file))
  41. return -EINVAL;
  42. if (offset > maxsize)
  43. return -EINVAL;
  44. if (offset != file->f_pos) {
  45. file->f_pos = offset;
  46. file->f_version = 0;
  47. }
  48. return offset;
  49. }
  50. /**
  51. * generic_file_llseek_size - generic llseek implementation for regular files
  52. * @file: file structure to seek on
  53. * @offset: file offset to seek to
  54. * @whence: type of seek
  55. * @size: max size of this file in file system
  56. * @eof: offset used for SEEK_END position
  57. *
  58. * This is a variant of generic_file_llseek that allows passing in a custom
  59. * maximum file size and a custom EOF position, for e.g. hashed directories
  60. *
  61. * Synchronization:
  62. * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  63. * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  64. * read/writes behave like SEEK_SET against seeks.
  65. */
  66. loff_t
  67. generic_file_llseek_size(struct file *file, loff_t offset, int whence,
  68. loff_t maxsize, loff_t eof)
  69. {
  70. struct inode *inode = file->f_mapping->host;
  71. switch (whence) {
  72. case SEEK_END:
  73. offset += eof;
  74. break;
  75. case SEEK_CUR:
  76. /*
  77. * Here we special-case the lseek(fd, 0, SEEK_CUR)
  78. * position-querying operation. Avoid rewriting the "same"
  79. * f_pos value back to the file because a concurrent read(),
  80. * write() or lseek() might have altered it
  81. */
  82. if (offset == 0)
  83. return file->f_pos;
  84. /*
  85. * f_lock protects against read/modify/write race with other
  86. * SEEK_CURs. Note that parallel writes and reads behave
  87. * like SEEK_SET.
  88. */
  89. spin_lock(&file->f_lock);
  90. offset = lseek_execute(file, inode, file->f_pos + offset,
  91. maxsize);
  92. spin_unlock(&file->f_lock);
  93. return offset;
  94. case SEEK_DATA:
  95. /*
  96. * In the generic case the entire file is data, so as long as
  97. * offset isn't at the end of the file then the offset is data.
  98. */
  99. if (offset >= eof)
  100. return -ENXIO;
  101. break;
  102. case SEEK_HOLE:
  103. /*
  104. * There is a virtual hole at the end of the file, so as long as
  105. * offset isn't i_size or larger, return i_size.
  106. */
  107. if (offset >= eof)
  108. return -ENXIO;
  109. offset = eof;
  110. break;
  111. }
  112. return lseek_execute(file, inode, offset, maxsize);
  113. }
  114. EXPORT_SYMBOL(generic_file_llseek_size);
  115. /**
  116. * generic_file_llseek - generic llseek implementation for regular files
  117. * @file: file structure to seek on
  118. * @offset: file offset to seek to
  119. * @whence: type of seek
  120. *
  121. * This is a generic implemenation of ->llseek useable for all normal local
  122. * filesystems. It just updates the file offset to the value specified by
  123. * @offset and @whence.
  124. */
  125. loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
  126. {
  127. struct inode *inode = file->f_mapping->host;
  128. return generic_file_llseek_size(file, offset, whence,
  129. inode->i_sb->s_maxbytes,
  130. i_size_read(inode));
  131. }
  132. EXPORT_SYMBOL(generic_file_llseek);
  133. /**
  134. * noop_llseek - No Operation Performed llseek implementation
  135. * @file: file structure to seek on
  136. * @offset: file offset to seek to
  137. * @whence: type of seek
  138. *
  139. * This is an implementation of ->llseek useable for the rare special case when
  140. * userspace expects the seek to succeed but the (device) file is actually not
  141. * able to perform the seek. In this case you use noop_llseek() instead of
  142. * falling back to the default implementation of ->llseek.
  143. */
  144. loff_t noop_llseek(struct file *file, loff_t offset, int whence)
  145. {
  146. return file->f_pos;
  147. }
  148. EXPORT_SYMBOL(noop_llseek);
  149. loff_t no_llseek(struct file *file, loff_t offset, int whence)
  150. {
  151. return -ESPIPE;
  152. }
  153. EXPORT_SYMBOL(no_llseek);
  154. loff_t default_llseek(struct file *file, loff_t offset, int whence)
  155. {
  156. struct inode *inode = file_inode(file);
  157. loff_t retval;
  158. mutex_lock(&inode->i_mutex);
  159. switch (whence) {
  160. case SEEK_END:
  161. offset += i_size_read(inode);
  162. break;
  163. case SEEK_CUR:
  164. if (offset == 0) {
  165. retval = file->f_pos;
  166. goto out;
  167. }
  168. offset += file->f_pos;
  169. break;
  170. case SEEK_DATA:
  171. /*
  172. * In the generic case the entire file is data, so as
  173. * long as offset isn't at the end of the file then the
  174. * offset is data.
  175. */
  176. if (offset >= inode->i_size) {
  177. retval = -ENXIO;
  178. goto out;
  179. }
  180. break;
  181. case SEEK_HOLE:
  182. /*
  183. * There is a virtual hole at the end of the file, so
  184. * as long as offset isn't i_size or larger, return
  185. * i_size.
  186. */
  187. if (offset >= inode->i_size) {
  188. retval = -ENXIO;
  189. goto out;
  190. }
  191. offset = inode->i_size;
  192. break;
  193. }
  194. retval = -EINVAL;
  195. if (offset >= 0 || unsigned_offsets(file)) {
  196. if (offset != file->f_pos) {
  197. file->f_pos = offset;
  198. file->f_version = 0;
  199. }
  200. retval = offset;
  201. }
  202. out:
  203. mutex_unlock(&inode->i_mutex);
  204. return retval;
  205. }
  206. EXPORT_SYMBOL(default_llseek);
  207. loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
  208. {
  209. loff_t (*fn)(struct file *, loff_t, int);
  210. fn = no_llseek;
  211. if (file->f_mode & FMODE_LSEEK) {
  212. if (file->f_op && file->f_op->llseek)
  213. fn = file->f_op->llseek;
  214. }
  215. return fn(file, offset, whence);
  216. }
  217. EXPORT_SYMBOL(vfs_llseek);
  218. SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
  219. {
  220. off_t retval;
  221. struct fd f = fdget(fd);
  222. if (!f.file)
  223. return -EBADF;
  224. retval = -EINVAL;
  225. if (whence <= SEEK_MAX) {
  226. loff_t res = vfs_llseek(f.file, offset, whence);
  227. retval = res;
  228. if (res != (loff_t)retval)
  229. retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
  230. }
  231. fdput(f);
  232. return retval;
  233. }
  #ifdef CONFIG_COMPAT
  /* Compat lseek entry point: forwards its arguments unchanged to sys_lseek(). */
  COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
  {
      /* compat_off_t offset is converted to off_t by the call itself. */
      return sys_lseek(fd, offset, whence);
  }
  #endif
  #ifdef __ARCH_WANT_SYS_LLSEEK
  /*
   * llseek system call: the 64-bit offset arrives split into @offset_high
   * and @offset_low, and the resulting position is stored through @result.
   *
   * Returns 0 on success, -EBADF for an invalid descriptor, -EINVAL when
   * @whence is above SEEK_MAX, the (negative) error from vfs_llseek(), or
   * -EFAULT when the result cannot be copied to user space.
   */
  SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
                  unsigned long, offset_low, loff_t __user *, result,
                  unsigned int, whence)
  {
      struct fd f = fdget(fd);
      loff_t offset;
      int retval;

      if (!f.file)
          return -EBADF;

      if (whence > SEEK_MAX) {
          retval = -EINVAL;
          goto out_putf;
      }

      offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
                          whence);
      if (offset < 0)
          retval = (int)offset;
      else if (copy_to_user(result, &offset, sizeof(offset)))
          retval = -EFAULT;
      else
          retval = 0;

  out_putf:
      fdput(f);
      return retval;
  }
  #endif
  266. /*
  267. * rw_verify_area doesn't like huge counts. We limit
  268. * them to something that fits in "int" so that others
  269. * won't have to do range checks all the time.
  270. */
  271. int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
  272. {
  273. struct inode *inode;
  274. loff_t pos;
  275. int retval = -EINVAL;
  276. inode = file_inode(file);
  277. if (unlikely((ssize_t) count < 0))
  278. return retval;
  279. pos = *ppos;
  280. if (unlikely(pos < 0)) {
  281. if (!unsigned_offsets(file))
  282. return retval;
  283. if (count >= -pos) /* both values are in 0..LLONG_MAX */
  284. return -EOVERFLOW;
  285. } else if (unlikely((loff_t) (pos + count) < 0)) {
  286. if (!unsigned_offsets(file))
  287. return retval;
  288. }
  289. if (unlikely(inode->i_flock && mandatory_lock(inode))) {
  290. retval = locks_mandatory_area(
  291. read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
  292. inode, file, pos, count);
  293. if (retval < 0)
  294. return retval;
  295. }
  296. retval = security_file_permission(file,
  297. read_write == READ ? MAY_READ : MAY_WRITE);
  298. if (retval)
  299. return retval;
  300. return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
  301. }
  302. ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
  303. {
  304. struct iovec iov = { .iov_base = buf, .iov_len = len };
  305. struct kiocb kiocb;
  306. ssize_t ret;
  307. init_sync_kiocb(&kiocb, filp);
  308. kiocb.ki_pos = *ppos;
  309. kiocb.ki_left = len;
  310. kiocb.ki_nbytes = len;
  311. ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
  312. if (-EIOCBQUEUED == ret)
  313. ret = wait_on_sync_kiocb(&kiocb);
  314. *ppos = kiocb.ki_pos;
  315. return ret;
  316. }
  317. EXPORT_SYMBOL(do_sync_read);
  318. ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
  319. {
  320. ssize_t ret;
  321. if (!(file->f_mode & FMODE_READ))
  322. return -EBADF;
  323. if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
  324. return -EINVAL;
  325. if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
  326. return -EFAULT;
  327. ret = rw_verify_area(READ, file, pos, count);
  328. if (ret >= 0) {
  329. count = ret;
  330. if (file->f_op->read)
  331. ret = file->f_op->read(file, buf, count, pos);
  332. else
  333. ret = do_sync_read(file, buf, count, pos);
  334. if (ret > 0) {
  335. fsnotify_access(file);
  336. add_rchar(current, ret);
  337. }
  338. inc_syscr(current);
  339. }
  340. return ret;
  341. }
  342. EXPORT_SYMBOL(vfs_read);
  343. ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
  344. {
  345. struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
  346. struct kiocb kiocb;
  347. ssize_t ret;
  348. init_sync_kiocb(&kiocb, filp);
  349. kiocb.ki_pos = *ppos;
  350. kiocb.ki_left = len;
  351. kiocb.ki_nbytes = len;
  352. ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
  353. if (-EIOCBQUEUED == ret)
  354. ret = wait_on_sync_kiocb(&kiocb);
  355. *ppos = kiocb.ki_pos;
  356. return ret;
  357. }
  358. EXPORT_SYMBOL(do_sync_write);
  359. ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
  360. {
  361. mm_segment_t old_fs;
  362. const char __user *p;
  363. ssize_t ret;
  364. if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
  365. return -EINVAL;
  366. old_fs = get_fs();
  367. set_fs(get_ds());
  368. p = (__force const char __user *)buf;
  369. if (count > MAX_RW_COUNT)
  370. count = MAX_RW_COUNT;
  371. if (file->f_op->write)
  372. ret = file->f_op->write(file, p, count, pos);
  373. else
  374. ret = do_sync_write(file, p, count, pos);
  375. set_fs(old_fs);
  376. if (ret > 0) {
  377. fsnotify_modify(file);
  378. add_wchar(current, ret);
  379. }
  380. inc_syscw(current);
  381. return ret;
  382. }
  383. ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
  384. {
  385. ssize_t ret;
  386. if (!(file->f_mode & FMODE_WRITE))
  387. return -EBADF;
  388. if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
  389. return -EINVAL;
  390. if (unlikely(!access_ok(VERIFY_READ, buf, count)))
  391. return -EFAULT;
  392. ret = rw_verify_area(WRITE, file, pos, count);
  393. if (ret >= 0) {
  394. count = ret;
  395. file_start_write(file);
  396. if (file->f_op->write)
  397. ret = file->f_op->write(file, buf, count, pos);
  398. else
  399. ret = do_sync_write(file, buf, count, pos);
  400. if (ret > 0) {
  401. fsnotify_modify(file);
  402. add_wchar(current, ret);
  403. }
  404. inc_syscw(current);
  405. file_end_write(file);
  406. }
  407. return ret;
  408. }
  409. EXPORT_SYMBOL(vfs_write);
  410. static inline loff_t file_pos_read(struct file *file)
  411. {
  412. return file->f_pos;
  413. }
  414. static inline void file_pos_write(struct file *file, loff_t pos)
  415. {
  416. file->f_pos = pos;
  417. }
  418. SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
  419. {
  420. struct fd f = fdget(fd);
  421. ssize_t ret = -EBADF;
  422. if (f.file) {
  423. loff_t pos = file_pos_read(f.file);
  424. ret = vfs_read(f.file, buf, count, &pos);
  425. file_pos_write(f.file, pos);
  426. fdput(f);
  427. }
  428. return ret;
  429. }
  430. SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
  431. size_t, count)
  432. {
  433. struct fd f = fdget(fd);
  434. ssize_t ret = -EBADF;
  435. if (f.file) {
  436. loff_t pos = file_pos_read(f.file);
  437. ret = vfs_write(f.file, buf, count, &pos);
  438. file_pos_write(f.file, pos);
  439. fdput(f);
  440. }
  441. return ret;
  442. }
  443. SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
  444. size_t, count, loff_t, pos)
  445. {
  446. struct fd f;
  447. ssize_t ret = -EBADF;
  448. if (pos < 0)
  449. return -EINVAL;
  450. f = fdget(fd);
  451. if (f.file) {
  452. ret = -ESPIPE;
  453. if (f.file->f_mode & FMODE_PREAD)
  454. ret = vfs_read(f.file, buf, count, &pos);
  455. fdput(f);
  456. }
  457. return ret;
  458. }
  459. SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
  460. size_t, count, loff_t, pos)
  461. {
  462. struct fd f;
  463. ssize_t ret = -EBADF;
  464. if (pos < 0)
  465. return -EINVAL;
  466. f = fdget(fd);
  467. if (f.file) {
  468. ret = -ESPIPE;
  469. if (f.file->f_mode & FMODE_PWRITE)
  470. ret = vfs_write(f.file, buf, count, &pos);
  471. fdput(f);
  472. }
  473. return ret;
  474. }
  /*
   * Reduce an iovec's length in-place so that it describes at most @to
   * bytes. The segment in which the running total reaches @to is trimmed
   * and becomes the last one counted. Returns the resulting number of
   * segments; when the whole vector is shorter than @to it is left as is
   * and @nr_segs is returned.
   */
  unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
  {
      unsigned long used;
      size_t consumed = 0;

      for (used = 0; used < nr_segs; used++) {
          struct iovec *cur = &iov[used];

          if (consumed + cur->iov_len >= to) {
              /* Trim this segment and stop: it is the final one. */
              cur->iov_len = to - consumed;
              return used + 1;
          }
          consumed += cur->iov_len;
      }
      return used;
  }
  493. EXPORT_SYMBOL(iov_shorten);
  494. static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
  495. unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
  496. {
  497. struct kiocb kiocb;
  498. ssize_t ret;
  499. init_sync_kiocb(&kiocb, filp);
  500. kiocb.ki_pos = *ppos;
  501. kiocb.ki_left = len;
  502. kiocb.ki_nbytes = len;
  503. ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
  504. if (ret == -EIOCBQUEUED)
  505. ret = wait_on_sync_kiocb(&kiocb);
  506. *ppos = kiocb.ki_pos;
  507. return ret;
  508. }
  509. /* Do it by hand, with file-ops */
  510. static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
  511. unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
  512. {
  513. struct iovec *vector = iov;
  514. ssize_t ret = 0;
  515. while (nr_segs > 0) {
  516. void __user *base;
  517. size_t len;
  518. ssize_t nr;
  519. base = vector->iov_base;
  520. len = vector->iov_len;
  521. vector++;
  522. nr_segs--;
  523. nr = fn(filp, base, len, ppos);
  524. if (nr < 0) {
  525. if (!ret)
  526. ret = nr;
  527. break;
  528. }
  529. ret += nr;
  530. if (nr != len)
  531. break;
  532. }
  533. return ret;
  534. }
  535. /* A write operation does a read from user space and vice versa */
  536. #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
  537. ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
  538. unsigned long nr_segs, unsigned long fast_segs,
  539. struct iovec *fast_pointer,
  540. struct iovec **ret_pointer)
  541. {
  542. unsigned long seg;
  543. ssize_t ret;
  544. struct iovec *iov = fast_pointer;
  545. /*
  546. * SuS says "The readv() function *may* fail if the iovcnt argument
  547. * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
  548. * traditionally returned zero for zero segments, so...
  549. */
  550. if (nr_segs == 0) {
  551. ret = 0;
  552. goto out;
  553. }
  554. /*
  555. * First get the "struct iovec" from user memory and
  556. * verify all the pointers
  557. */
  558. if (nr_segs > UIO_MAXIOV) {
  559. ret = -EINVAL;
  560. goto out;
  561. }
  562. if (nr_segs > fast_segs) {
  563. iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
  564. if (iov == NULL) {
  565. ret = -ENOMEM;
  566. goto out;
  567. }
  568. }
  569. if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
  570. ret = -EFAULT;
  571. goto out;
  572. }
  573. /*
  574. * According to the Single Unix Specification we should return EINVAL
  575. * if an element length is < 0 when cast to ssize_t or if the
  576. * total length would overflow the ssize_t return value of the
  577. * system call.
  578. *
  579. * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
  580. * overflow case.
  581. */
  582. ret = 0;
  583. for (seg = 0; seg < nr_segs; seg++) {
  584. void __user *buf = iov[seg].iov_base;
  585. ssize_t len = (ssize_t)iov[seg].iov_len;
  586. /* see if we we're about to use an invalid len or if
  587. * it's about to overflow ssize_t */
  588. if (len < 0) {
  589. ret = -EINVAL;
  590. goto out;
  591. }
  592. if (type >= 0
  593. && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
  594. ret = -EFAULT;
  595. goto out;
  596. }
  597. if (len > MAX_RW_COUNT - ret) {
  598. len = MAX_RW_COUNT - ret;
  599. iov[seg].iov_len = len;
  600. }
  601. ret += len;
  602. }
  603. out:
  604. *ret_pointer = iov;
  605. return ret;
  606. }
  607. static ssize_t do_readv_writev(int type, struct file *file,
  608. const struct iovec __user * uvector,
  609. unsigned long nr_segs, loff_t *pos)
  610. {
  611. size_t tot_len;
  612. struct iovec iovstack[UIO_FASTIOV];
  613. struct iovec *iov = iovstack;
  614. ssize_t ret;
  615. io_fn_t fn;
  616. iov_fn_t fnv;
  617. if (!file->f_op) {
  618. ret = -EINVAL;
  619. goto out;
  620. }
  621. ret = rw_copy_check_uvector(type, uvector, nr_segs,
  622. ARRAY_SIZE(iovstack), iovstack, &iov);
  623. if (ret <= 0)
  624. goto out;
  625. tot_len = ret;
  626. ret = rw_verify_area(type, file, pos, tot_len);
  627. if (ret < 0)
  628. goto out;
  629. fnv = NULL;
  630. if (type == READ) {
  631. fn = file->f_op->read;
  632. fnv = file->f_op->aio_read;
  633. } else {
  634. fn = (io_fn_t)file->f_op->write;
  635. fnv = file->f_op->aio_write;
  636. file_start_write(file);
  637. }
  638. if (fnv)
  639. ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
  640. pos, fnv);
  641. else
  642. ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
  643. if (type != READ)
  644. file_end_write(file);
  645. out:
  646. if (iov != iovstack)
  647. kfree(iov);
  648. if ((ret + (type == READ)) > 0) {
  649. if (type == READ)
  650. fsnotify_access(file);
  651. else
  652. fsnotify_modify(file);
  653. }
  654. return ret;
  655. }
  656. ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
  657. unsigned long vlen, loff_t *pos)
  658. {
  659. if (!(file->f_mode & FMODE_READ))
  660. return -EBADF;
  661. if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
  662. return -EINVAL;
  663. return do_readv_writev(READ, file, vec, vlen, pos);
  664. }
  665. EXPORT_SYMBOL(vfs_readv);
  666. ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
  667. unsigned long vlen, loff_t *pos)
  668. {
  669. if (!(file->f_mode & FMODE_WRITE))
  670. return -EBADF;
  671. if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
  672. return -EINVAL;
  673. return do_readv_writev(WRITE, file, vec, vlen, pos);
  674. }
  675. EXPORT_SYMBOL(vfs_writev);
  676. SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
  677. unsigned long, vlen)
  678. {
  679. struct fd f = fdget(fd);
  680. ssize_t ret = -EBADF;
  681. if (f.file) {
  682. loff_t pos = file_pos_read(f.file);
  683. ret = vfs_readv(f.file, vec, vlen, &pos);
  684. file_pos_write(f.file, pos);
  685. fdput(f);
  686. }
  687. if (ret > 0)
  688. add_rchar(current, ret);
  689. inc_syscr(current);
  690. return ret;
  691. }
  692. SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
  693. unsigned long, vlen)
  694. {
  695. struct fd f = fdget(fd);
  696. ssize_t ret = -EBADF;
  697. if (f.file) {
  698. loff_t pos = file_pos_read(f.file);
  699. ret = vfs_writev(f.file, vec, vlen, &pos);
  700. file_pos_write(f.file, pos);
  701. fdput(f);
  702. }
  703. if (ret > 0)
  704. add_wchar(current, ret);
  705. inc_syscw(current);
  706. return ret;
  707. }
  708. static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
  709. {
  710. #define HALF_LONG_BITS (BITS_PER_LONG / 2)
  711. return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
  712. }
  713. SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
  714. unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
  715. {
  716. loff_t pos = pos_from_hilo(pos_h, pos_l);
  717. struct fd f;
  718. ssize_t ret = -EBADF;
  719. if (pos < 0)
  720. return -EINVAL;
  721. f = fdget(fd);
  722. if (f.file) {
  723. ret = -ESPIPE;
  724. if (f.file->f_mode & FMODE_PREAD)
  725. ret = vfs_readv(f.file, vec, vlen, &pos);
  726. fdput(f);
  727. }
  728. if (ret > 0)
  729. add_rchar(current, ret);
  730. inc_syscr(current);
  731. return ret;
  732. }
  733. SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
  734. unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
  735. {
  736. loff_t pos = pos_from_hilo(pos_h, pos_l);
  737. struct fd f;
  738. ssize_t ret = -EBADF;
  739. if (pos < 0)
  740. return -EINVAL;
  741. f = fdget(fd);
  742. if (f.file) {
  743. ret = -ESPIPE;
  744. if (f.file->f_mode & FMODE_PWRITE)
  745. ret = vfs_writev(f.file, vec, vlen, &pos);
  746. fdput(f);
  747. }
  748. if (ret > 0)
  749. add_wchar(current, ret);
  750. inc_syscw(current);
  751. return ret;
  752. }
  753. #ifdef CONFIG_COMPAT
  754. static ssize_t compat_do_readv_writev(int type, struct file *file,
  755. const struct compat_iovec __user *uvector,
  756. unsigned long nr_segs, loff_t *pos)
  757. {
  758. compat_ssize_t tot_len;
  759. struct iovec iovstack[UIO_FASTIOV];
  760. struct iovec *iov = iovstack;
  761. ssize_t ret;
  762. io_fn_t fn;
  763. iov_fn_t fnv;
  764. ret = -EINVAL;
  765. if (!file->f_op)
  766. goto out;
  767. ret = -EFAULT;
  768. if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
  769. goto out;
  770. ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
  771. UIO_FASTIOV, iovstack, &iov);
  772. if (ret <= 0)
  773. goto out;
  774. tot_len = ret;
  775. ret = rw_verify_area(type, file, pos, tot_len);
  776. if (ret < 0)
  777. goto out;
  778. fnv = NULL;
  779. if (type == READ) {
  780. fn = file->f_op->read;
  781. fnv = file->f_op->aio_read;
  782. } else {
  783. fn = (io_fn_t)file->f_op->write;
  784. fnv = file->f_op->aio_write;
  785. file_start_write(file);
  786. }
  787. if (fnv)
  788. ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
  789. pos, fnv);
  790. else
  791. ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
  792. if (type != READ)
  793. file_end_write(file);
  794. out:
  795. if (iov != iovstack)
  796. kfree(iov);
  797. if ((ret + (type == READ)) > 0) {
  798. if (type == READ)
  799. fsnotify_access(file);
  800. else
  801. fsnotify_modify(file);
  802. }
  803. return ret;
  804. }
  805. static size_t compat_readv(struct file *file,
  806. const struct compat_iovec __user *vec,
  807. unsigned long vlen, loff_t *pos)
  808. {
  809. ssize_t ret = -EBADF;
  810. if (!(file->f_mode & FMODE_READ))
  811. goto out;
  812. ret = -EINVAL;
  813. if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
  814. goto out;
  815. ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
  816. out:
  817. if (ret > 0)
  818. add_rchar(current, ret);
  819. inc_syscr(current);
  820. return ret;
  821. }
  822. COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd,
  823. const struct compat_iovec __user *,vec,
  824. unsigned long, vlen)
  825. {
  826. struct fd f = fdget(fd);
  827. ssize_t ret;
  828. loff_t pos;
  829. if (!f.file)
  830. return -EBADF;
  831. pos = f.file->f_pos;
  832. ret = compat_readv(f.file, vec, vlen, &pos);
  833. f.file->f_pos = pos;
  834. fdput(f);
  835. return ret;
  836. }
  837. COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
  838. const struct compat_iovec __user *,vec,
  839. unsigned long, vlen, loff_t, pos)
  840. {
  841. struct fd f;
  842. ssize_t ret;
  843. if (pos < 0)
  844. return -EINVAL;
  845. f = fdget(fd);
  846. if (!f.file)
  847. return -EBADF;
  848. ret = -ESPIPE;
  849. if (f.file->f_mode & FMODE_PREAD)
  850. ret = compat_readv(f.file, vec, vlen, &pos);
  851. fdput(f);
  852. return ret;
  853. }
  854. COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd,
  855. const struct compat_iovec __user *,vec,
  856. unsigned long, vlen, u32, pos_low, u32, pos_high)
  857. {
  858. loff_t pos = ((loff_t)pos_high << 32) | pos_low;
  859. return compat_sys_preadv64(fd, vec, vlen, pos);
  860. }
  861. static size_t compat_writev(struct file *file,
  862. const struct compat_iovec __user *vec,
  863. unsigned long vlen, loff_t *pos)
  864. {
  865. ssize_t ret = -EBADF;
  866. if (!(file->f_mode & FMODE_WRITE))
  867. goto out;
  868. ret = -EINVAL;
  869. if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
  870. goto out;
  871. ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
  872. out:
  873. if (ret > 0)
  874. add_wchar(current, ret);
  875. inc_syscw(current);
  876. return ret;
  877. }
  878. COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd,
  879. const struct compat_iovec __user *, vec,
  880. unsigned long, vlen)
  881. {
  882. struct fd f = fdget(fd);
  883. ssize_t ret;
  884. loff_t pos;
  885. if (!f.file)
  886. return -EBADF;
  887. pos = f.file->f_pos;
  888. ret = compat_writev(f.file, vec, vlen, &pos);
  889. f.file->f_pos = pos;
  890. fdput(f);
  891. return ret;
  892. }
  893. COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
  894. const struct compat_iovec __user *,vec,
  895. unsigned long, vlen, loff_t, pos)
  896. {
  897. struct fd f;
  898. ssize_t ret;
  899. if (pos < 0)
  900. return -EINVAL;
  901. f = fdget(fd);
  902. if (!f.file)
  903. return -EBADF;
  904. ret = -ESPIPE;
  905. if (f.file->f_mode & FMODE_PWRITE)
  906. ret = compat_writev(f.file, vec, vlen, &pos);
  907. fdput(f);
  908. return ret;
  909. }
  910. COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd,
  911. const struct compat_iovec __user *,vec,
  912. unsigned long, vlen, u32, pos_low, u32, pos_high)
  913. {
  914. loff_t pos = ((loff_t)pos_high << 32) | pos_low;
  915. return compat_sys_pwritev64(fd, vec, vlen, pos);
  916. }
  917. #endif
  918. static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
  919. size_t count, loff_t max)
  920. {
  921. struct fd in, out;
  922. struct inode *in_inode, *out_inode;
  923. loff_t pos;
  924. ssize_t retval;
  925. int fl;
  926. /*
  927. * Get input file, and verify that it is ok..
  928. */
  929. retval = -EBADF;
  930. in = fdget(in_fd);
  931. if (!in.file)
  932. goto out;
  933. if (!(in.file->f_mode & FMODE_READ))
  934. goto fput_in;
  935. retval = -ESPIPE;
  936. if (!ppos)
  937. ppos = &in.file->f_pos;
  938. else
  939. if (!(in.file->f_mode & FMODE_PREAD))
  940. goto fput_in;
  941. retval = rw_verify_area(READ, in.file, ppos, count);
  942. if (retval < 0)
  943. goto fput_in;
  944. count = retval;
  945. /*
  946. * Get output file, and verify that it is ok..
  947. */
  948. retval = -EBADF;
  949. out = fdget(out_fd);
  950. if (!out.file)
  951. goto fput_in;
  952. if (!(out.file->f_mode & FMODE_WRITE))
  953. goto fput_out;
  954. retval = -EINVAL;
  955. in_inode = file_inode(in.file);
  956. out_inode = file_inode(out.file);
  957. retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count);
  958. if (retval < 0)
  959. goto fput_out;
  960. count = retval;
  961. if (!max)
  962. max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
  963. pos = *ppos;
  964. if (unlikely(pos + count > max)) {
  965. retval = -EOVERFLOW;
  966. if (pos >= max)
  967. goto fput_out;
  968. count = max - pos;
  969. }
  970. fl = 0;
  971. #if 0
  972. /*
  973. * We need to debate whether we can enable this or not. The
  974. * man page documents EAGAIN return for the output at least,
  975. * and the application is arguably buggy if it doesn't expect
  976. * EAGAIN on a non-blocking file descriptor.
  977. */
  978. if (in.file->f_flags & O_NONBLOCK)
  979. fl = SPLICE_F_NONBLOCK;
  980. #endif
  981. retval = do_splice_direct(in.file, ppos, out.file, count, fl);
  982. if (retval > 0) {
  983. add_rchar(current, retval);
  984. add_wchar(current, retval);
  985. fsnotify_access(in.file);
  986. fsnotify_modify(out.file);
  987. }
  988. inc_syscr(current);
  989. inc_syscw(current);
  990. if (*ppos > max)
  991. retval = -EOVERFLOW;
  992. fput_out:
  993. fdput(out);
  994. fput_in:
  995. fdput(in);
  996. out:
  997. return retval;
  998. }
  999. SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
  1000. {
  1001. loff_t pos;
  1002. off_t off;
  1003. ssize_t ret;
  1004. if (offset) {
  1005. if (unlikely(get_user(off, offset)))
  1006. return -EFAULT;
  1007. pos = off;
  1008. ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
  1009. if (unlikely(put_user(pos, offset)))
  1010. return -EFAULT;
  1011. return ret;
  1012. }
  1013. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1014. }
  1015. SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
  1016. {
  1017. loff_t pos;
  1018. ssize_t ret;
  1019. if (offset) {
  1020. if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
  1021. return -EFAULT;
  1022. ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
  1023. if (unlikely(put_user(pos, offset)))
  1024. return -EFAULT;
  1025. return ret;
  1026. }
  1027. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1028. }
  1029. #ifdef CONFIG_COMPAT
  1030. COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
  1031. compat_off_t __user *, offset, compat_size_t, count)
  1032. {
  1033. loff_t pos;
  1034. off_t off;
  1035. ssize_t ret;
  1036. if (offset) {
  1037. if (unlikely(get_user(off, offset)))
  1038. return -EFAULT;
  1039. pos = off;
  1040. ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
  1041. if (unlikely(put_user(pos, offset)))
  1042. return -EFAULT;
  1043. return ret;
  1044. }
  1045. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1046. }
  1047. COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
  1048. compat_loff_t __user *, offset, compat_size_t, count)
  1049. {
  1050. loff_t pos;
  1051. ssize_t ret;
  1052. if (offset) {
  1053. if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
  1054. return -EFAULT;
  1055. ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
  1056. if (unlikely(put_user(pos, offset)))
  1057. return -EFAULT;
  1058. return ret;
  1059. }
  1060. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1061. }
  1062. #endif