read_write.c 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224
  1. /*
  2. * linux/fs/read_write.c
  3. *
  4. * Copyright (C) 1991, 1992 Linus Torvalds
  5. */
  6. #include <linux/slab.h>
  7. #include <linux/stat.h>
  8. #include <linux/fcntl.h>
  9. #include <linux/file.h>
  10. #include <linux/uio.h>
  11. #include <linux/fsnotify.h>
  12. #include <linux/security.h>
  13. #include <linux/export.h>
  14. #include <linux/syscalls.h>
  15. #include <linux/pagemap.h>
  16. #include <linux/splice.h>
  17. #include <linux/compat.h>
  18. #include "internal.h"
  19. #include <asm/uaccess.h>
  20. #include <asm/unistd.h>
  21. typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
  22. typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
  23. unsigned long, loff_t);
  24. const struct file_operations generic_ro_fops = {
  25. .llseek = generic_file_llseek,
  26. .read = do_sync_read,
  27. .aio_read = generic_file_aio_read,
  28. .mmap = generic_file_readonly_mmap,
  29. .splice_read = generic_file_splice_read,
  30. };
  31. EXPORT_SYMBOL(generic_ro_fops);
  32. static inline int unsigned_offsets(struct file *file)
  33. {
  34. return file->f_mode & FMODE_UNSIGNED_OFFSET;
  35. }
  36. static loff_t lseek_execute(struct file *file, struct inode *inode,
  37. loff_t offset, loff_t maxsize)
  38. {
  39. if (offset < 0 && !unsigned_offsets(file))
  40. return -EINVAL;
  41. if (offset > maxsize)
  42. return -EINVAL;
  43. if (offset != file->f_pos) {
  44. file->f_pos = offset;
  45. file->f_version = 0;
  46. }
  47. return offset;
  48. }
  49. /**
  50. * generic_file_llseek_size - generic llseek implementation for regular files
  51. * @file: file structure to seek on
  52. * @offset: file offset to seek to
  53. * @whence: type of seek
  54. * @size: max size of this file in file system
  55. * @eof: offset used for SEEK_END position
  56. *
  57. * This is a variant of generic_file_llseek that allows passing in a custom
  58. * maximum file size and a custom EOF position, for e.g. hashed directories
  59. *
  60. * Synchronization:
  61. * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  62. * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  63. * read/writes behave like SEEK_SET against seeks.
  64. */
  65. loff_t
  66. generic_file_llseek_size(struct file *file, loff_t offset, int whence,
  67. loff_t maxsize, loff_t eof)
  68. {
  69. struct inode *inode = file->f_mapping->host;
  70. switch (whence) {
  71. case SEEK_END:
  72. offset += eof;
  73. break;
  74. case SEEK_CUR:
  75. /*
  76. * Here we special-case the lseek(fd, 0, SEEK_CUR)
  77. * position-querying operation. Avoid rewriting the "same"
  78. * f_pos value back to the file because a concurrent read(),
  79. * write() or lseek() might have altered it
  80. */
  81. if (offset == 0)
  82. return file->f_pos;
  83. /*
  84. * f_lock protects against read/modify/write race with other
  85. * SEEK_CURs. Note that parallel writes and reads behave
  86. * like SEEK_SET.
  87. */
  88. spin_lock(&file->f_lock);
  89. offset = lseek_execute(file, inode, file->f_pos + offset,
  90. maxsize);
  91. spin_unlock(&file->f_lock);
  92. return offset;
  93. case SEEK_DATA:
  94. /*
  95. * In the generic case the entire file is data, so as long as
  96. * offset isn't at the end of the file then the offset is data.
  97. */
  98. if (offset >= eof)
  99. return -ENXIO;
  100. break;
  101. case SEEK_HOLE:
  102. /*
  103. * There is a virtual hole at the end of the file, so as long as
  104. * offset isn't i_size or larger, return i_size.
  105. */
  106. if (offset >= eof)
  107. return -ENXIO;
  108. offset = eof;
  109. break;
  110. }
  111. return lseek_execute(file, inode, offset, maxsize);
  112. }
  113. EXPORT_SYMBOL(generic_file_llseek_size);
  114. /**
  115. * generic_file_llseek - generic llseek implementation for regular files
  116. * @file: file structure to seek on
  117. * @offset: file offset to seek to
  118. * @whence: type of seek
  119. *
  120. * This is a generic implemenation of ->llseek useable for all normal local
  121. * filesystems. It just updates the file offset to the value specified by
  122. * @offset and @whence.
  123. */
  124. loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
  125. {
  126. struct inode *inode = file->f_mapping->host;
  127. return generic_file_llseek_size(file, offset, whence,
  128. inode->i_sb->s_maxbytes,
  129. i_size_read(inode));
  130. }
  131. EXPORT_SYMBOL(generic_file_llseek);
  132. /**
  133. * noop_llseek - No Operation Performed llseek implementation
  134. * @file: file structure to seek on
  135. * @offset: file offset to seek to
  136. * @whence: type of seek
  137. *
  138. * This is an implementation of ->llseek useable for the rare special case when
  139. * userspace expects the seek to succeed but the (device) file is actually not
  140. * able to perform the seek. In this case you use noop_llseek() instead of
  141. * falling back to the default implementation of ->llseek.
  142. */
  143. loff_t noop_llseek(struct file *file, loff_t offset, int whence)
  144. {
  145. return file->f_pos;
  146. }
  147. EXPORT_SYMBOL(noop_llseek);
  148. loff_t no_llseek(struct file *file, loff_t offset, int whence)
  149. {
  150. return -ESPIPE;
  151. }
  152. EXPORT_SYMBOL(no_llseek);
  153. loff_t default_llseek(struct file *file, loff_t offset, int whence)
  154. {
  155. struct inode *inode = file_inode(file);
  156. loff_t retval;
  157. mutex_lock(&inode->i_mutex);
  158. switch (whence) {
  159. case SEEK_END:
  160. offset += i_size_read(inode);
  161. break;
  162. case SEEK_CUR:
  163. if (offset == 0) {
  164. retval = file->f_pos;
  165. goto out;
  166. }
  167. offset += file->f_pos;
  168. break;
  169. case SEEK_DATA:
  170. /*
  171. * In the generic case the entire file is data, so as
  172. * long as offset isn't at the end of the file then the
  173. * offset is data.
  174. */
  175. if (offset >= inode->i_size) {
  176. retval = -ENXIO;
  177. goto out;
  178. }
  179. break;
  180. case SEEK_HOLE:
  181. /*
  182. * There is a virtual hole at the end of the file, so
  183. * as long as offset isn't i_size or larger, return
  184. * i_size.
  185. */
  186. if (offset >= inode->i_size) {
  187. retval = -ENXIO;
  188. goto out;
  189. }
  190. offset = inode->i_size;
  191. break;
  192. }
  193. retval = -EINVAL;
  194. if (offset >= 0 || unsigned_offsets(file)) {
  195. if (offset != file->f_pos) {
  196. file->f_pos = offset;
  197. file->f_version = 0;
  198. }
  199. retval = offset;
  200. }
  201. out:
  202. mutex_unlock(&inode->i_mutex);
  203. return retval;
  204. }
  205. EXPORT_SYMBOL(default_llseek);
  206. loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
  207. {
  208. loff_t (*fn)(struct file *, loff_t, int);
  209. fn = no_llseek;
  210. if (file->f_mode & FMODE_LSEEK) {
  211. if (file->f_op && file->f_op->llseek)
  212. fn = file->f_op->llseek;
  213. }
  214. return fn(file, offset, whence);
  215. }
  216. EXPORT_SYMBOL(vfs_llseek);
  217. SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
  218. {
  219. off_t retval;
  220. struct fd f = fdget(fd);
  221. if (!f.file)
  222. return -EBADF;
  223. retval = -EINVAL;
  224. if (whence <= SEEK_MAX) {
  225. loff_t res = vfs_llseek(f.file, offset, whence);
  226. retval = res;
  227. if (res != (loff_t)retval)
  228. retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
  229. }
  230. fdput(f);
  231. return retval;
  232. }
  233. #ifdef CONFIG_COMPAT
  234. COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
  235. {
  236. return sys_lseek(fd, offset, whence);
  237. }
  238. #endif
  239. #ifdef __ARCH_WANT_SYS_LLSEEK
  240. SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
  241. unsigned long, offset_low, loff_t __user *, result,
  242. unsigned int, whence)
  243. {
  244. int retval;
  245. struct fd f = fdget(fd);
  246. loff_t offset;
  247. if (!f.file)
  248. return -EBADF;
  249. retval = -EINVAL;
  250. if (whence > SEEK_MAX)
  251. goto out_putf;
  252. offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
  253. whence);
  254. retval = (int)offset;
  255. if (offset >= 0) {
  256. retval = -EFAULT;
  257. if (!copy_to_user(result, &offset, sizeof(offset)))
  258. retval = 0;
  259. }
  260. out_putf:
  261. fdput(f);
  262. return retval;
  263. }
  264. #endif
  265. /*
  266. * rw_verify_area doesn't like huge counts. We limit
  267. * them to something that fits in "int" so that others
  268. * won't have to do range checks all the time.
  269. */
  270. int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
  271. {
  272. struct inode *inode;
  273. loff_t pos;
  274. int retval = -EINVAL;
  275. inode = file_inode(file);
  276. if (unlikely((ssize_t) count < 0))
  277. return retval;
  278. pos = *ppos;
  279. if (unlikely(pos < 0)) {
  280. if (!unsigned_offsets(file))
  281. return retval;
  282. if (count >= -pos) /* both values are in 0..LLONG_MAX */
  283. return -EOVERFLOW;
  284. } else if (unlikely((loff_t) (pos + count) < 0)) {
  285. if (!unsigned_offsets(file))
  286. return retval;
  287. }
  288. if (unlikely(inode->i_flock && mandatory_lock(inode))) {
  289. retval = locks_mandatory_area(
  290. read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
  291. inode, file, pos, count);
  292. if (retval < 0)
  293. return retval;
  294. }
  295. retval = security_file_permission(file,
  296. read_write == READ ? MAY_READ : MAY_WRITE);
  297. if (retval)
  298. return retval;
  299. return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
  300. }
  301. ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
  302. {
  303. struct iovec iov = { .iov_base = buf, .iov_len = len };
  304. struct kiocb kiocb;
  305. ssize_t ret;
  306. init_sync_kiocb(&kiocb, filp);
  307. kiocb.ki_pos = *ppos;
  308. kiocb.ki_left = len;
  309. kiocb.ki_nbytes = len;
  310. ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
  311. if (-EIOCBQUEUED == ret)
  312. ret = wait_on_sync_kiocb(&kiocb);
  313. *ppos = kiocb.ki_pos;
  314. return ret;
  315. }
  316. EXPORT_SYMBOL(do_sync_read);
  317. ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
  318. {
  319. ssize_t ret;
  320. if (!(file->f_mode & FMODE_READ))
  321. return -EBADF;
  322. if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
  323. return -EINVAL;
  324. if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
  325. return -EFAULT;
  326. ret = rw_verify_area(READ, file, pos, count);
  327. if (ret >= 0) {
  328. count = ret;
  329. if (file->f_op->read)
  330. ret = file->f_op->read(file, buf, count, pos);
  331. else
  332. ret = do_sync_read(file, buf, count, pos);
  333. if (ret > 0) {
  334. fsnotify_access(file);
  335. add_rchar(current, ret);
  336. }
  337. inc_syscr(current);
  338. }
  339. return ret;
  340. }
  341. EXPORT_SYMBOL(vfs_read);
  342. ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
  343. {
  344. struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
  345. struct kiocb kiocb;
  346. ssize_t ret;
  347. init_sync_kiocb(&kiocb, filp);
  348. kiocb.ki_pos = *ppos;
  349. kiocb.ki_left = len;
  350. kiocb.ki_nbytes = len;
  351. ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
  352. if (-EIOCBQUEUED == ret)
  353. ret = wait_on_sync_kiocb(&kiocb);
  354. *ppos = kiocb.ki_pos;
  355. return ret;
  356. }
  357. EXPORT_SYMBOL(do_sync_write);
  358. ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
  359. {
  360. mm_segment_t old_fs;
  361. const char __user *p;
  362. ssize_t ret;
  363. if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
  364. return -EINVAL;
  365. old_fs = get_fs();
  366. set_fs(get_ds());
  367. p = (__force const char __user *)buf;
  368. if (count > MAX_RW_COUNT)
  369. count = MAX_RW_COUNT;
  370. if (file->f_op->write)
  371. ret = file->f_op->write(file, p, count, pos);
  372. else
  373. ret = do_sync_write(file, p, count, pos);
  374. set_fs(old_fs);
  375. if (ret > 0) {
  376. fsnotify_modify(file);
  377. add_wchar(current, ret);
  378. }
  379. inc_syscw(current);
  380. return ret;
  381. }
  382. ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
  383. {
  384. ssize_t ret;
  385. if (!(file->f_mode & FMODE_WRITE))
  386. return -EBADF;
  387. if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
  388. return -EINVAL;
  389. if (unlikely(!access_ok(VERIFY_READ, buf, count)))
  390. return -EFAULT;
  391. ret = rw_verify_area(WRITE, file, pos, count);
  392. if (ret >= 0) {
  393. count = ret;
  394. file_start_write(file);
  395. if (file->f_op->write)
  396. ret = file->f_op->write(file, buf, count, pos);
  397. else
  398. ret = do_sync_write(file, buf, count, pos);
  399. if (ret > 0) {
  400. fsnotify_modify(file);
  401. add_wchar(current, ret);
  402. }
  403. inc_syscw(current);
  404. file_end_write(file);
  405. }
  406. return ret;
  407. }
  408. EXPORT_SYMBOL(vfs_write);
  409. static inline loff_t file_pos_read(struct file *file)
  410. {
  411. return file->f_pos;
  412. }
  413. static inline void file_pos_write(struct file *file, loff_t pos)
  414. {
  415. file->f_pos = pos;
  416. }
  417. SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
  418. {
  419. struct fd f = fdget(fd);
  420. ssize_t ret = -EBADF;
  421. if (f.file) {
  422. loff_t pos = file_pos_read(f.file);
  423. ret = vfs_read(f.file, buf, count, &pos);
  424. file_pos_write(f.file, pos);
  425. fdput(f);
  426. }
  427. return ret;
  428. }
  429. SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
  430. size_t, count)
  431. {
  432. struct fd f = fdget(fd);
  433. ssize_t ret = -EBADF;
  434. if (f.file) {
  435. loff_t pos = file_pos_read(f.file);
  436. ret = vfs_write(f.file, buf, count, &pos);
  437. file_pos_write(f.file, pos);
  438. fdput(f);
  439. }
  440. return ret;
  441. }
  442. SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
  443. size_t, count, loff_t, pos)
  444. {
  445. struct fd f;
  446. ssize_t ret = -EBADF;
  447. if (pos < 0)
  448. return -EINVAL;
  449. f = fdget(fd);
  450. if (f.file) {
  451. ret = -ESPIPE;
  452. if (f.file->f_mode & FMODE_PREAD)
  453. ret = vfs_read(f.file, buf, count, &pos);
  454. fdput(f);
  455. }
  456. return ret;
  457. }
  458. SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
  459. size_t, count, loff_t, pos)
  460. {
  461. struct fd f;
  462. ssize_t ret = -EBADF;
  463. if (pos < 0)
  464. return -EINVAL;
  465. f = fdget(fd);
  466. if (f.file) {
  467. ret = -ESPIPE;
  468. if (f.file->f_mode & FMODE_PWRITE)
  469. ret = vfs_write(f.file, buf, count, &pos);
  470. fdput(f);
  471. }
  472. return ret;
  473. }
  474. /*
  475. * Reduce an iovec's length in-place. Return the resulting number of segments
  476. */
  477. unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
  478. {
  479. unsigned long seg = 0;
  480. size_t len = 0;
  481. while (seg < nr_segs) {
  482. seg++;
  483. if (len + iov->iov_len >= to) {
  484. iov->iov_len = to - len;
  485. break;
  486. }
  487. len += iov->iov_len;
  488. iov++;
  489. }
  490. return seg;
  491. }
  492. EXPORT_SYMBOL(iov_shorten);
  493. static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
  494. unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
  495. {
  496. struct kiocb kiocb;
  497. ssize_t ret;
  498. init_sync_kiocb(&kiocb, filp);
  499. kiocb.ki_pos = *ppos;
  500. kiocb.ki_left = len;
  501. kiocb.ki_nbytes = len;
  502. ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
  503. if (ret == -EIOCBQUEUED)
  504. ret = wait_on_sync_kiocb(&kiocb);
  505. *ppos = kiocb.ki_pos;
  506. return ret;
  507. }
  508. /* Do it by hand, with file-ops */
  509. static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
  510. unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
  511. {
  512. struct iovec *vector = iov;
  513. ssize_t ret = 0;
  514. while (nr_segs > 0) {
  515. void __user *base;
  516. size_t len;
  517. ssize_t nr;
  518. base = vector->iov_base;
  519. len = vector->iov_len;
  520. vector++;
  521. nr_segs--;
  522. nr = fn(filp, base, len, ppos);
  523. if (nr < 0) {
  524. if (!ret)
  525. ret = nr;
  526. break;
  527. }
  528. ret += nr;
  529. if (nr != len)
  530. break;
  531. }
  532. return ret;
  533. }
  534. /* A write operation does a read from user space and vice versa */
  535. #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
  536. ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
  537. unsigned long nr_segs, unsigned long fast_segs,
  538. struct iovec *fast_pointer,
  539. struct iovec **ret_pointer)
  540. {
  541. unsigned long seg;
  542. ssize_t ret;
  543. struct iovec *iov = fast_pointer;
  544. /*
  545. * SuS says "The readv() function *may* fail if the iovcnt argument
  546. * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
  547. * traditionally returned zero for zero segments, so...
  548. */
  549. if (nr_segs == 0) {
  550. ret = 0;
  551. goto out;
  552. }
  553. /*
  554. * First get the "struct iovec" from user memory and
  555. * verify all the pointers
  556. */
  557. if (nr_segs > UIO_MAXIOV) {
  558. ret = -EINVAL;
  559. goto out;
  560. }
  561. if (nr_segs > fast_segs) {
  562. iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
  563. if (iov == NULL) {
  564. ret = -ENOMEM;
  565. goto out;
  566. }
  567. }
  568. if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
  569. ret = -EFAULT;
  570. goto out;
  571. }
  572. /*
  573. * According to the Single Unix Specification we should return EINVAL
  574. * if an element length is < 0 when cast to ssize_t or if the
  575. * total length would overflow the ssize_t return value of the
  576. * system call.
  577. *
  578. * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
  579. * overflow case.
  580. */
  581. ret = 0;
  582. for (seg = 0; seg < nr_segs; seg++) {
  583. void __user *buf = iov[seg].iov_base;
  584. ssize_t len = (ssize_t)iov[seg].iov_len;
  585. /* see if we we're about to use an invalid len or if
  586. * it's about to overflow ssize_t */
  587. if (len < 0) {
  588. ret = -EINVAL;
  589. goto out;
  590. }
  591. if (type >= 0
  592. && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
  593. ret = -EFAULT;
  594. goto out;
  595. }
  596. if (len > MAX_RW_COUNT - ret) {
  597. len = MAX_RW_COUNT - ret;
  598. iov[seg].iov_len = len;
  599. }
  600. ret += len;
  601. }
  602. out:
  603. *ret_pointer = iov;
  604. return ret;
  605. }
  606. static ssize_t do_readv_writev(int type, struct file *file,
  607. const struct iovec __user * uvector,
  608. unsigned long nr_segs, loff_t *pos)
  609. {
  610. size_t tot_len;
  611. struct iovec iovstack[UIO_FASTIOV];
  612. struct iovec *iov = iovstack;
  613. ssize_t ret;
  614. io_fn_t fn;
  615. iov_fn_t fnv;
  616. if (!file->f_op) {
  617. ret = -EINVAL;
  618. goto out;
  619. }
  620. ret = rw_copy_check_uvector(type, uvector, nr_segs,
  621. ARRAY_SIZE(iovstack), iovstack, &iov);
  622. if (ret <= 0)
  623. goto out;
  624. tot_len = ret;
  625. ret = rw_verify_area(type, file, pos, tot_len);
  626. if (ret < 0)
  627. goto out;
  628. fnv = NULL;
  629. if (type == READ) {
  630. fn = file->f_op->read;
  631. fnv = file->f_op->aio_read;
  632. } else {
  633. fn = (io_fn_t)file->f_op->write;
  634. fnv = file->f_op->aio_write;
  635. file_start_write(file);
  636. }
  637. if (fnv)
  638. ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
  639. pos, fnv);
  640. else
  641. ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
  642. if (type != READ)
  643. file_end_write(file);
  644. out:
  645. if (iov != iovstack)
  646. kfree(iov);
  647. if ((ret + (type == READ)) > 0) {
  648. if (type == READ)
  649. fsnotify_access(file);
  650. else
  651. fsnotify_modify(file);
  652. }
  653. return ret;
  654. }
  655. ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
  656. unsigned long vlen, loff_t *pos)
  657. {
  658. if (!(file->f_mode & FMODE_READ))
  659. return -EBADF;
  660. if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
  661. return -EINVAL;
  662. return do_readv_writev(READ, file, vec, vlen, pos);
  663. }
  664. EXPORT_SYMBOL(vfs_readv);
  665. ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
  666. unsigned long vlen, loff_t *pos)
  667. {
  668. if (!(file->f_mode & FMODE_WRITE))
  669. return -EBADF;
  670. if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
  671. return -EINVAL;
  672. return do_readv_writev(WRITE, file, vec, vlen, pos);
  673. }
  674. EXPORT_SYMBOL(vfs_writev);
  675. SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
  676. unsigned long, vlen)
  677. {
  678. struct fd f = fdget(fd);
  679. ssize_t ret = -EBADF;
  680. if (f.file) {
  681. loff_t pos = file_pos_read(f.file);
  682. ret = vfs_readv(f.file, vec, vlen, &pos);
  683. file_pos_write(f.file, pos);
  684. fdput(f);
  685. }
  686. if (ret > 0)
  687. add_rchar(current, ret);
  688. inc_syscr(current);
  689. return ret;
  690. }
  691. SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
  692. unsigned long, vlen)
  693. {
  694. struct fd f = fdget(fd);
  695. ssize_t ret = -EBADF;
  696. if (f.file) {
  697. loff_t pos = file_pos_read(f.file);
  698. ret = vfs_writev(f.file, vec, vlen, &pos);
  699. file_pos_write(f.file, pos);
  700. fdput(f);
  701. }
  702. if (ret > 0)
  703. add_wchar(current, ret);
  704. inc_syscw(current);
  705. return ret;
  706. }
  707. static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
  708. {
  709. #define HALF_LONG_BITS (BITS_PER_LONG / 2)
  710. return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
  711. }
  712. SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
  713. unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
  714. {
  715. loff_t pos = pos_from_hilo(pos_h, pos_l);
  716. struct fd f;
  717. ssize_t ret = -EBADF;
  718. if (pos < 0)
  719. return -EINVAL;
  720. f = fdget(fd);
  721. if (f.file) {
  722. ret = -ESPIPE;
  723. if (f.file->f_mode & FMODE_PREAD)
  724. ret = vfs_readv(f.file, vec, vlen, &pos);
  725. fdput(f);
  726. }
  727. if (ret > 0)
  728. add_rchar(current, ret);
  729. inc_syscr(current);
  730. return ret;
  731. }
  732. SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
  733. unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
  734. {
  735. loff_t pos = pos_from_hilo(pos_h, pos_l);
  736. struct fd f;
  737. ssize_t ret = -EBADF;
  738. if (pos < 0)
  739. return -EINVAL;
  740. f = fdget(fd);
  741. if (f.file) {
  742. ret = -ESPIPE;
  743. if (f.file->f_mode & FMODE_PWRITE)
  744. ret = vfs_writev(f.file, vec, vlen, &pos);
  745. fdput(f);
  746. }
  747. if (ret > 0)
  748. add_wchar(current, ret);
  749. inc_syscw(current);
  750. return ret;
  751. }
  752. #ifdef CONFIG_COMPAT
  753. static ssize_t compat_do_readv_writev(int type, struct file *file,
  754. const struct compat_iovec __user *uvector,
  755. unsigned long nr_segs, loff_t *pos)
  756. {
  757. compat_ssize_t tot_len;
  758. struct iovec iovstack[UIO_FASTIOV];
  759. struct iovec *iov = iovstack;
  760. ssize_t ret;
  761. io_fn_t fn;
  762. iov_fn_t fnv;
  763. ret = -EINVAL;
  764. if (!file->f_op)
  765. goto out;
  766. ret = -EFAULT;
  767. if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
  768. goto out;
  769. ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
  770. UIO_FASTIOV, iovstack, &iov);
  771. if (ret <= 0)
  772. goto out;
  773. tot_len = ret;
  774. ret = rw_verify_area(type, file, pos, tot_len);
  775. if (ret < 0)
  776. goto out;
  777. fnv = NULL;
  778. if (type == READ) {
  779. fn = file->f_op->read;
  780. fnv = file->f_op->aio_read;
  781. } else {
  782. fn = (io_fn_t)file->f_op->write;
  783. fnv = file->f_op->aio_write;
  784. file_start_write(file);
  785. }
  786. if (fnv)
  787. ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
  788. pos, fnv);
  789. else
  790. ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
  791. if (type != READ)
  792. file_end_write(file);
  793. out:
  794. if (iov != iovstack)
  795. kfree(iov);
  796. if ((ret + (type == READ)) > 0) {
  797. if (type == READ)
  798. fsnotify_access(file);
  799. else
  800. fsnotify_modify(file);
  801. }
  802. return ret;
  803. }
  804. static size_t compat_readv(struct file *file,
  805. const struct compat_iovec __user *vec,
  806. unsigned long vlen, loff_t *pos)
  807. {
  808. ssize_t ret = -EBADF;
  809. if (!(file->f_mode & FMODE_READ))
  810. goto out;
  811. ret = -EINVAL;
  812. if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
  813. goto out;
  814. ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
  815. out:
  816. if (ret > 0)
  817. add_rchar(current, ret);
  818. inc_syscr(current);
  819. return ret;
  820. }
  821. COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd,
  822. const struct compat_iovec __user *,vec,
  823. unsigned long, vlen)
  824. {
  825. struct fd f = fdget(fd);
  826. ssize_t ret;
  827. loff_t pos;
  828. if (!f.file)
  829. return -EBADF;
  830. pos = f.file->f_pos;
  831. ret = compat_readv(f.file, vec, vlen, &pos);
  832. f.file->f_pos = pos;
  833. fdput(f);
  834. return ret;
  835. }
  836. COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
  837. const struct compat_iovec __user *,vec,
  838. unsigned long, vlen, loff_t, pos)
  839. {
  840. struct fd f;
  841. ssize_t ret;
  842. if (pos < 0)
  843. return -EINVAL;
  844. f = fdget(fd);
  845. if (!f.file)
  846. return -EBADF;
  847. ret = -ESPIPE;
  848. if (f.file->f_mode & FMODE_PREAD)
  849. ret = compat_readv(f.file, vec, vlen, &pos);
  850. fdput(f);
  851. return ret;
  852. }
  853. COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd,
  854. const struct compat_iovec __user *,vec,
  855. unsigned long, vlen, u32, pos_low, u32, pos_high)
  856. {
  857. loff_t pos = ((loff_t)pos_high << 32) | pos_low;
  858. return compat_sys_preadv64(fd, vec, vlen, pos);
  859. }
  860. static size_t compat_writev(struct file *file,
  861. const struct compat_iovec __user *vec,
  862. unsigned long vlen, loff_t *pos)
  863. {
  864. ssize_t ret = -EBADF;
  865. if (!(file->f_mode & FMODE_WRITE))
  866. goto out;
  867. ret = -EINVAL;
  868. if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
  869. goto out;
  870. ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
  871. out:
  872. if (ret > 0)
  873. add_wchar(current, ret);
  874. inc_syscw(current);
  875. return ret;
  876. }
  877. COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd,
  878. const struct compat_iovec __user *, vec,
  879. unsigned long, vlen)
  880. {
  881. struct fd f = fdget(fd);
  882. ssize_t ret;
  883. loff_t pos;
  884. if (!f.file)
  885. return -EBADF;
  886. pos = f.file->f_pos;
  887. ret = compat_writev(f.file, vec, vlen, &pos);
  888. f.file->f_pos = pos;
  889. fdput(f);
  890. return ret;
  891. }
  892. COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
  893. const struct compat_iovec __user *,vec,
  894. unsigned long, vlen, loff_t, pos)
  895. {
  896. struct fd f;
  897. ssize_t ret;
  898. if (pos < 0)
  899. return -EINVAL;
  900. f = fdget(fd);
  901. if (!f.file)
  902. return -EBADF;
  903. ret = -ESPIPE;
  904. if (f.file->f_mode & FMODE_PWRITE)
  905. ret = compat_writev(f.file, vec, vlen, &pos);
  906. fdput(f);
  907. return ret;
  908. }
  909. COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd,
  910. const struct compat_iovec __user *,vec,
  911. unsigned long, vlen, u32, pos_low, u32, pos_high)
  912. {
  913. loff_t pos = ((loff_t)pos_high << 32) | pos_low;
  914. return compat_sys_pwritev64(fd, vec, vlen, pos);
  915. }
  916. #endif
  917. static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
  918. size_t count, loff_t max)
  919. {
  920. struct fd in, out;
  921. struct inode *in_inode, *out_inode;
  922. loff_t pos;
  923. ssize_t retval;
  924. int fl;
  925. /*
  926. * Get input file, and verify that it is ok..
  927. */
  928. retval = -EBADF;
  929. in = fdget(in_fd);
  930. if (!in.file)
  931. goto out;
  932. if (!(in.file->f_mode & FMODE_READ))
  933. goto fput_in;
  934. retval = -ESPIPE;
  935. if (!ppos)
  936. ppos = &in.file->f_pos;
  937. else
  938. if (!(in.file->f_mode & FMODE_PREAD))
  939. goto fput_in;
  940. retval = rw_verify_area(READ, in.file, ppos, count);
  941. if (retval < 0)
  942. goto fput_in;
  943. count = retval;
  944. /*
  945. * Get output file, and verify that it is ok..
  946. */
  947. retval = -EBADF;
  948. out = fdget(out_fd);
  949. if (!out.file)
  950. goto fput_in;
  951. if (!(out.file->f_mode & FMODE_WRITE))
  952. goto fput_out;
  953. retval = -EINVAL;
  954. in_inode = file_inode(in.file);
  955. out_inode = file_inode(out.file);
  956. retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count);
  957. if (retval < 0)
  958. goto fput_out;
  959. count = retval;
  960. if (!max)
  961. max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
  962. pos = *ppos;
  963. if (unlikely(pos + count > max)) {
  964. retval = -EOVERFLOW;
  965. if (pos >= max)
  966. goto fput_out;
  967. count = max - pos;
  968. }
  969. fl = 0;
  970. #if 0
  971. /*
  972. * We need to debate whether we can enable this or not. The
  973. * man page documents EAGAIN return for the output at least,
  974. * and the application is arguably buggy if it doesn't expect
  975. * EAGAIN on a non-blocking file descriptor.
  976. */
  977. if (in.file->f_flags & O_NONBLOCK)
  978. fl = SPLICE_F_NONBLOCK;
  979. #endif
  980. retval = do_splice_direct(in.file, ppos, out.file, count, fl);
  981. if (retval > 0) {
  982. add_rchar(current, retval);
  983. add_wchar(current, retval);
  984. fsnotify_access(in.file);
  985. fsnotify_modify(out.file);
  986. }
  987. inc_syscr(current);
  988. inc_syscw(current);
  989. if (*ppos > max)
  990. retval = -EOVERFLOW;
  991. fput_out:
  992. fdput(out);
  993. fput_in:
  994. fdput(in);
  995. out:
  996. return retval;
  997. }
  998. SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
  999. {
  1000. loff_t pos;
  1001. off_t off;
  1002. ssize_t ret;
  1003. if (offset) {
  1004. if (unlikely(get_user(off, offset)))
  1005. return -EFAULT;
  1006. pos = off;
  1007. ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
  1008. if (unlikely(put_user(pos, offset)))
  1009. return -EFAULT;
  1010. return ret;
  1011. }
  1012. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1013. }
  1014. SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
  1015. {
  1016. loff_t pos;
  1017. ssize_t ret;
  1018. if (offset) {
  1019. if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
  1020. return -EFAULT;
  1021. ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
  1022. if (unlikely(put_user(pos, offset)))
  1023. return -EFAULT;
  1024. return ret;
  1025. }
  1026. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1027. }
  1028. #ifdef CONFIG_COMPAT
  1029. COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
  1030. compat_off_t __user *, offset, compat_size_t, count)
  1031. {
  1032. loff_t pos;
  1033. off_t off;
  1034. ssize_t ret;
  1035. if (offset) {
  1036. if (unlikely(get_user(off, offset)))
  1037. return -EFAULT;
  1038. pos = off;
  1039. ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
  1040. if (unlikely(put_user(pos, offset)))
  1041. return -EFAULT;
  1042. return ret;
  1043. }
  1044. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1045. }
  1046. COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
  1047. compat_loff_t __user *, offset, compat_size_t, count)
  1048. {
  1049. loff_t pos;
  1050. ssize_t ret;
  1051. if (offset) {
  1052. if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
  1053. return -EFAULT;
  1054. ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
  1055. if (unlikely(put_user(pos, offset)))
  1056. return -EFAULT;
  1057. return ret;
  1058. }
  1059. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1060. }
  1061. #endif