read_write.c 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233
  1. /*
  2. * linux/fs/read_write.c
  3. *
  4. * Copyright (C) 1991, 1992 Linus Torvalds
  5. */
  6. #include <linux/slab.h>
  7. #include <linux/stat.h>
  8. #include <linux/fcntl.h>
  9. #include <linux/file.h>
  10. #include <linux/uio.h>
  11. #include <linux/aio.h>
  12. #include <linux/fsnotify.h>
  13. #include <linux/security.h>
  14. #include <linux/export.h>
  15. #include <linux/syscalls.h>
  16. #include <linux/pagemap.h>
  17. #include <linux/splice.h>
  18. #include <linux/compat.h>
  19. #include "internal.h"
  20. #include <asm/uaccess.h>
  21. #include <asm/unistd.h>
  22. typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
  23. typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
  24. unsigned long, loff_t);
  25. const struct file_operations generic_ro_fops = {
  26. .llseek = generic_file_llseek,
  27. .read = do_sync_read,
  28. .aio_read = generic_file_aio_read,
  29. .mmap = generic_file_readonly_mmap,
  30. .splice_read = generic_file_splice_read,
  31. };
  32. EXPORT_SYMBOL(generic_ro_fops);
  33. static inline int unsigned_offsets(struct file *file)
  34. {
  35. return file->f_mode & FMODE_UNSIGNED_OFFSET;
  36. }
  37. static loff_t lseek_execute(struct file *file, struct inode *inode,
  38. loff_t offset, loff_t maxsize)
  39. {
  40. if (offset < 0 && !unsigned_offsets(file))
  41. return -EINVAL;
  42. if (offset > maxsize)
  43. return -EINVAL;
  44. if (offset != file->f_pos) {
  45. file->f_pos = offset;
  46. file->f_version = 0;
  47. }
  48. return offset;
  49. }
  50. /**
  51. * generic_file_llseek_size - generic llseek implementation for regular files
  52. * @file: file structure to seek on
  53. * @offset: file offset to seek to
  54. * @whence: type of seek
  55. * @size: max size of this file in file system
  56. * @eof: offset used for SEEK_END position
  57. *
  58. * This is a variant of generic_file_llseek that allows passing in a custom
  59. * maximum file size and a custom EOF position, for e.g. hashed directories
  60. *
  61. * Synchronization:
  62. * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  63. * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  64. * read/writes behave like SEEK_SET against seeks.
  65. */
  66. loff_t
  67. generic_file_llseek_size(struct file *file, loff_t offset, int whence,
  68. loff_t maxsize, loff_t eof)
  69. {
  70. struct inode *inode = file->f_mapping->host;
  71. switch (whence) {
  72. case SEEK_END:
  73. offset += eof;
  74. break;
  75. case SEEK_CUR:
  76. /*
  77. * Here we special-case the lseek(fd, 0, SEEK_CUR)
  78. * position-querying operation. Avoid rewriting the "same"
  79. * f_pos value back to the file because a concurrent read(),
  80. * write() or lseek() might have altered it
  81. */
  82. if (offset == 0)
  83. return file->f_pos;
  84. /*
  85. * f_lock protects against read/modify/write race with other
  86. * SEEK_CURs. Note that parallel writes and reads behave
  87. * like SEEK_SET.
  88. */
  89. spin_lock(&file->f_lock);
  90. offset = lseek_execute(file, inode, file->f_pos + offset,
  91. maxsize);
  92. spin_unlock(&file->f_lock);
  93. return offset;
  94. case SEEK_DATA:
  95. /*
  96. * In the generic case the entire file is data, so as long as
  97. * offset isn't at the end of the file then the offset is data.
  98. */
  99. if (offset >= eof)
  100. return -ENXIO;
  101. break;
  102. case SEEK_HOLE:
  103. /*
  104. * There is a virtual hole at the end of the file, so as long as
  105. * offset isn't i_size or larger, return i_size.
  106. */
  107. if (offset >= eof)
  108. return -ENXIO;
  109. offset = eof;
  110. break;
  111. }
  112. return lseek_execute(file, inode, offset, maxsize);
  113. }
  114. EXPORT_SYMBOL(generic_file_llseek_size);
  115. /**
  116. * generic_file_llseek - generic llseek implementation for regular files
  117. * @file: file structure to seek on
  118. * @offset: file offset to seek to
  119. * @whence: type of seek
  120. *
  121. * This is a generic implemenation of ->llseek useable for all normal local
  122. * filesystems. It just updates the file offset to the value specified by
  123. * @offset and @whence.
  124. */
  125. loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
  126. {
  127. struct inode *inode = file->f_mapping->host;
  128. return generic_file_llseek_size(file, offset, whence,
  129. inode->i_sb->s_maxbytes,
  130. i_size_read(inode));
  131. }
  132. EXPORT_SYMBOL(generic_file_llseek);
  133. /**
  134. * noop_llseek - No Operation Performed llseek implementation
  135. * @file: file structure to seek on
  136. * @offset: file offset to seek to
  137. * @whence: type of seek
  138. *
  139. * This is an implementation of ->llseek useable for the rare special case when
  140. * userspace expects the seek to succeed but the (device) file is actually not
  141. * able to perform the seek. In this case you use noop_llseek() instead of
  142. * falling back to the default implementation of ->llseek.
  143. */
  144. loff_t noop_llseek(struct file *file, loff_t offset, int whence)
  145. {
  146. return file->f_pos;
  147. }
  148. EXPORT_SYMBOL(noop_llseek);
  149. loff_t no_llseek(struct file *file, loff_t offset, int whence)
  150. {
  151. return -ESPIPE;
  152. }
  153. EXPORT_SYMBOL(no_llseek);
  154. loff_t default_llseek(struct file *file, loff_t offset, int whence)
  155. {
  156. struct inode *inode = file_inode(file);
  157. loff_t retval;
  158. mutex_lock(&inode->i_mutex);
  159. switch (whence) {
  160. case SEEK_END:
  161. offset += i_size_read(inode);
  162. break;
  163. case SEEK_CUR:
  164. if (offset == 0) {
  165. retval = file->f_pos;
  166. goto out;
  167. }
  168. offset += file->f_pos;
  169. break;
  170. case SEEK_DATA:
  171. /*
  172. * In the generic case the entire file is data, so as
  173. * long as offset isn't at the end of the file then the
  174. * offset is data.
  175. */
  176. if (offset >= inode->i_size) {
  177. retval = -ENXIO;
  178. goto out;
  179. }
  180. break;
  181. case SEEK_HOLE:
  182. /*
  183. * There is a virtual hole at the end of the file, so
  184. * as long as offset isn't i_size or larger, return
  185. * i_size.
  186. */
  187. if (offset >= inode->i_size) {
  188. retval = -ENXIO;
  189. goto out;
  190. }
  191. offset = inode->i_size;
  192. break;
  193. }
  194. retval = -EINVAL;
  195. if (offset >= 0 || unsigned_offsets(file)) {
  196. if (offset != file->f_pos) {
  197. file->f_pos = offset;
  198. file->f_version = 0;
  199. }
  200. retval = offset;
  201. }
  202. out:
  203. mutex_unlock(&inode->i_mutex);
  204. return retval;
  205. }
  206. EXPORT_SYMBOL(default_llseek);
  207. loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
  208. {
  209. loff_t (*fn)(struct file *, loff_t, int);
  210. fn = no_llseek;
  211. if (file->f_mode & FMODE_LSEEK) {
  212. if (file->f_op && file->f_op->llseek)
  213. fn = file->f_op->llseek;
  214. }
  215. return fn(file, offset, whence);
  216. }
  217. EXPORT_SYMBOL(vfs_llseek);
  218. SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
  219. {
  220. off_t retval;
  221. struct fd f = fdget(fd);
  222. if (!f.file)
  223. return -EBADF;
  224. retval = -EINVAL;
  225. if (whence <= SEEK_MAX) {
  226. loff_t res = vfs_llseek(f.file, offset, whence);
  227. retval = res;
  228. if (res != (loff_t)retval)
  229. retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
  230. }
  231. fdput(f);
  232. return retval;
  233. }
  #ifdef CONFIG_COMPAT
  /* Compat entry point for lseek: hands its arguments straight to sys_lseek(). */
  COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
  {
      long ret = sys_lseek(fd, offset, whence);

      return ret;
  }
  #endif
  #ifdef __ARCH_WANT_SYS_LLSEEK
  /*
   * llseek system call: the 64-bit offset arrives as two halves and the
   * resulting position is copied to @result.
   *
   * Returns 0 on success, -EBADF for a bad descriptor, -EINVAL when @whence
   * exceeds SEEK_MAX, the (negative) seek error, or -EFAULT when the result
   * cannot be copied to user space.
   */
  SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
                  unsigned long, offset_low, loff_t __user *, result,
                  unsigned int, whence)
  {
      struct fd f = fdget(fd);
      loff_t offset;
      int retval;

      if (!f.file)
          return -EBADF;

      if (whence > SEEK_MAX) {
          retval = -EINVAL;
          goto out_putf;
      }

      offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
                          whence);
      if (offset < 0)
          retval = (int)offset;
      else if (copy_to_user(result, &offset, sizeof(offset)))
          retval = -EFAULT;
      else
          retval = 0;
  out_putf:
      fdput(f);
      return retval;
  }
  #endif
  266. /*
  267. * rw_verify_area doesn't like huge counts. We limit
  268. * them to something that fits in "int" so that others
  269. * won't have to do range checks all the time.
  270. */
  271. int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
  272. {
  273. struct inode *inode;
  274. loff_t pos;
  275. int retval = -EINVAL;
  276. inode = file_inode(file);
  277. if (unlikely((ssize_t) count < 0))
  278. return retval;
  279. pos = *ppos;
  280. if (unlikely(pos < 0)) {
  281. if (!unsigned_offsets(file))
  282. return retval;
  283. if (count >= -pos) /* both values are in 0..LLONG_MAX */
  284. return -EOVERFLOW;
  285. } else if (unlikely((loff_t) (pos + count) < 0)) {
  286. if (!unsigned_offsets(file))
  287. return retval;
  288. }
  289. if (unlikely(inode->i_flock && mandatory_lock(inode))) {
  290. retval = locks_mandatory_area(
  291. read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
  292. inode, file, pos, count);
  293. if (retval < 0)
  294. return retval;
  295. }
  296. retval = security_file_permission(file,
  297. read_write == READ ? MAY_READ : MAY_WRITE);
  298. if (retval)
  299. return retval;
  300. return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
  301. }
  302. ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
  303. {
  304. struct iovec iov = { .iov_base = buf, .iov_len = len };
  305. struct kiocb kiocb;
  306. ssize_t ret;
  307. init_sync_kiocb(&kiocb, filp);
  308. kiocb.ki_pos = *ppos;
  309. kiocb.ki_left = len;
  310. kiocb.ki_nbytes = len;
  311. ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
  312. if (-EIOCBQUEUED == ret)
  313. ret = wait_on_sync_kiocb(&kiocb);
  314. *ppos = kiocb.ki_pos;
  315. return ret;
  316. }
  317. EXPORT_SYMBOL(do_sync_read);
  318. ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
  319. {
  320. ssize_t ret;
  321. if (!(file->f_mode & FMODE_READ))
  322. return -EBADF;
  323. if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
  324. return -EINVAL;
  325. if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
  326. return -EFAULT;
  327. ret = rw_verify_area(READ, file, pos, count);
  328. if (ret >= 0) {
  329. count = ret;
  330. if (file->f_op->read)
  331. ret = file->f_op->read(file, buf, count, pos);
  332. else
  333. ret = do_sync_read(file, buf, count, pos);
  334. if (ret > 0) {
  335. fsnotify_access(file);
  336. add_rchar(current, ret);
  337. }
  338. inc_syscr(current);
  339. }
  340. return ret;
  341. }
  342. EXPORT_SYMBOL(vfs_read);
  343. ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
  344. {
  345. struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
  346. struct kiocb kiocb;
  347. ssize_t ret;
  348. init_sync_kiocb(&kiocb, filp);
  349. kiocb.ki_pos = *ppos;
  350. kiocb.ki_left = len;
  351. kiocb.ki_nbytes = len;
  352. ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
  353. if (-EIOCBQUEUED == ret)
  354. ret = wait_on_sync_kiocb(&kiocb);
  355. *ppos = kiocb.ki_pos;
  356. return ret;
  357. }
  358. EXPORT_SYMBOL(do_sync_write);
  359. ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
  360. {
  361. mm_segment_t old_fs;
  362. const char __user *p;
  363. ssize_t ret;
  364. if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
  365. return -EINVAL;
  366. old_fs = get_fs();
  367. set_fs(get_ds());
  368. p = (__force const char __user *)buf;
  369. if (count > MAX_RW_COUNT)
  370. count = MAX_RW_COUNT;
  371. if (file->f_op->write)
  372. ret = file->f_op->write(file, p, count, pos);
  373. else
  374. ret = do_sync_write(file, p, count, pos);
  375. set_fs(old_fs);
  376. if (ret > 0) {
  377. fsnotify_modify(file);
  378. add_wchar(current, ret);
  379. }
  380. inc_syscw(current);
  381. return ret;
  382. }
  383. ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
  384. {
  385. ssize_t ret;
  386. if (!(file->f_mode & FMODE_WRITE))
  387. return -EBADF;
  388. if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
  389. return -EINVAL;
  390. if (unlikely(!access_ok(VERIFY_READ, buf, count)))
  391. return -EFAULT;
  392. ret = rw_verify_area(WRITE, file, pos, count);
  393. if (ret >= 0) {
  394. count = ret;
  395. file_start_write(file);
  396. if (file->f_op->write)
  397. ret = file->f_op->write(file, buf, count, pos);
  398. else
  399. ret = do_sync_write(file, buf, count, pos);
  400. if (ret > 0) {
  401. fsnotify_modify(file);
  402. add_wchar(current, ret);
  403. }
  404. inc_syscw(current);
  405. file_end_write(file);
  406. }
  407. return ret;
  408. }
  409. EXPORT_SYMBOL(vfs_write);
  410. static inline loff_t file_pos_read(struct file *file)
  411. {
  412. return file->f_pos;
  413. }
  414. static inline void file_pos_write(struct file *file, loff_t pos)
  415. {
  416. file->f_pos = pos;
  417. }
  418. SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
  419. {
  420. struct fd f = fdget(fd);
  421. ssize_t ret = -EBADF;
  422. if (f.file) {
  423. loff_t pos = file_pos_read(f.file);
  424. ret = vfs_read(f.file, buf, count, &pos);
  425. file_pos_write(f.file, pos);
  426. fdput(f);
  427. }
  428. return ret;
  429. }
  430. SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
  431. size_t, count)
  432. {
  433. struct fd f = fdget(fd);
  434. ssize_t ret = -EBADF;
  435. if (f.file) {
  436. loff_t pos = file_pos_read(f.file);
  437. ret = vfs_write(f.file, buf, count, &pos);
  438. file_pos_write(f.file, pos);
  439. fdput(f);
  440. }
  441. return ret;
  442. }
  443. SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
  444. size_t, count, loff_t, pos)
  445. {
  446. struct fd f;
  447. ssize_t ret = -EBADF;
  448. if (pos < 0)
  449. return -EINVAL;
  450. f = fdget(fd);
  451. if (f.file) {
  452. ret = -ESPIPE;
  453. if (f.file->f_mode & FMODE_PREAD)
  454. ret = vfs_read(f.file, buf, count, &pos);
  455. fdput(f);
  456. }
  457. return ret;
  458. }
  459. SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
  460. size_t, count, loff_t, pos)
  461. {
  462. struct fd f;
  463. ssize_t ret = -EBADF;
  464. if (pos < 0)
  465. return -EINVAL;
  466. f = fdget(fd);
  467. if (f.file) {
  468. ret = -ESPIPE;
  469. if (f.file->f_mode & FMODE_PWRITE)
  470. ret = vfs_write(f.file, buf, count, &pos);
  471. fdput(f);
  472. }
  473. return ret;
  474. }
  /*
   * Reduce an iovec's length in-place. Return the resulting number of segments.
   *
   * Walks the segments accumulating their lengths; the first segment that
   * brings the running total to @to or beyond is trimmed so the total equals
   * @to, and the count of segments up to and including it is returned. When
   * the total never reaches @to, nothing is modified and @nr_segs is returned.
   */
  unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
  {
      size_t consumed = 0;
      unsigned long i;

      for (i = 0; i < nr_segs; i++) {
          struct iovec *cur = &iov[i];

          if (consumed + cur->iov_len >= to) {
              cur->iov_len = to - consumed;
              return i + 1;
          }
          consumed += cur->iov_len;
      }
      return nr_segs;
  }
  493. EXPORT_SYMBOL(iov_shorten);
  494. static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
  495. unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
  496. {
  497. struct kiocb kiocb;
  498. ssize_t ret;
  499. init_sync_kiocb(&kiocb, filp);
  500. kiocb.ki_pos = *ppos;
  501. kiocb.ki_left = len;
  502. kiocb.ki_nbytes = len;
  503. ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
  504. if (ret == -EIOCBQUEUED)
  505. ret = wait_on_sync_kiocb(&kiocb);
  506. *ppos = kiocb.ki_pos;
  507. return ret;
  508. }
  509. /* Do it by hand, with file-ops */
  510. static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
  511. unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
  512. {
  513. struct iovec *vector = iov;
  514. ssize_t ret = 0;
  515. while (nr_segs > 0) {
  516. void __user *base;
  517. size_t len;
  518. ssize_t nr;
  519. base = vector->iov_base;
  520. len = vector->iov_len;
  521. vector++;
  522. nr_segs--;
  523. nr = fn(filp, base, len, ppos);
  524. if (nr < 0) {
  525. if (!ret)
  526. ret = nr;
  527. break;
  528. }
  529. ret += nr;
  530. if (nr != len)
  531. break;
  532. }
  533. return ret;
  534. }
  535. /* A write operation does a read from user space and vice versa */
  536. #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
  537. ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
  538. unsigned long nr_segs, unsigned long fast_segs,
  539. struct iovec *fast_pointer,
  540. struct iovec **ret_pointer)
  541. {
  542. unsigned long seg;
  543. ssize_t ret;
  544. struct iovec *iov = fast_pointer;
  545. /*
  546. * SuS says "The readv() function *may* fail if the iovcnt argument
  547. * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
  548. * traditionally returned zero for zero segments, so...
  549. */
  550. if (nr_segs == 0) {
  551. ret = 0;
  552. goto out;
  553. }
  554. /*
  555. * First get the "struct iovec" from user memory and
  556. * verify all the pointers
  557. */
  558. if (nr_segs > UIO_MAXIOV) {
  559. ret = -EINVAL;
  560. goto out;
  561. }
  562. if (nr_segs > fast_segs) {
  563. iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
  564. if (iov == NULL) {
  565. ret = -ENOMEM;
  566. goto out;
  567. }
  568. }
  569. if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
  570. ret = -EFAULT;
  571. goto out;
  572. }
  573. /*
  574. * According to the Single Unix Specification we should return EINVAL
  575. * if an element length is < 0 when cast to ssize_t or if the
  576. * total length would overflow the ssize_t return value of the
  577. * system call.
  578. *
  579. * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
  580. * overflow case.
  581. */
  582. ret = 0;
  583. for (seg = 0; seg < nr_segs; seg++) {
  584. void __user *buf = iov[seg].iov_base;
  585. ssize_t len = (ssize_t)iov[seg].iov_len;
  586. /* see if we we're about to use an invalid len or if
  587. * it's about to overflow ssize_t */
  588. if (len < 0) {
  589. ret = -EINVAL;
  590. goto out;
  591. }
  592. if (type >= 0
  593. && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
  594. ret = -EFAULT;
  595. goto out;
  596. }
  597. if (len > MAX_RW_COUNT - ret) {
  598. len = MAX_RW_COUNT - ret;
  599. iov[seg].iov_len = len;
  600. }
  601. ret += len;
  602. }
  603. out:
  604. *ret_pointer = iov;
  605. return ret;
  606. }
  607. static ssize_t do_readv_writev(int type, struct file *file,
  608. const struct iovec __user * uvector,
  609. unsigned long nr_segs, loff_t *pos)
  610. {
  611. size_t tot_len;
  612. struct iovec iovstack[UIO_FASTIOV];
  613. struct iovec *iov = iovstack;
  614. ssize_t ret;
  615. io_fn_t fn;
  616. iov_fn_t fnv;
  617. if (!file->f_op) {
  618. ret = -EINVAL;
  619. goto out;
  620. }
  621. ret = rw_copy_check_uvector(type, uvector, nr_segs,
  622. ARRAY_SIZE(iovstack), iovstack, &iov);
  623. if (ret <= 0)
  624. goto out;
  625. tot_len = ret;
  626. ret = rw_verify_area(type, file, pos, tot_len);
  627. if (ret < 0)
  628. goto out;
  629. fnv = NULL;
  630. if (type == READ) {
  631. fn = file->f_op->read;
  632. fnv = file->f_op->aio_read;
  633. } else {
  634. fn = (io_fn_t)file->f_op->write;
  635. fnv = file->f_op->aio_write;
  636. file_start_write(file);
  637. }
  638. if (fnv)
  639. ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
  640. pos, fnv);
  641. else
  642. ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
  643. if (type != READ)
  644. file_end_write(file);
  645. out:
  646. if (iov != iovstack)
  647. kfree(iov);
  648. if ((ret + (type == READ)) > 0) {
  649. if (type == READ)
  650. fsnotify_access(file);
  651. else
  652. fsnotify_modify(file);
  653. }
  654. return ret;
  655. }
  656. ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
  657. unsigned long vlen, loff_t *pos)
  658. {
  659. if (!(file->f_mode & FMODE_READ))
  660. return -EBADF;
  661. if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
  662. return -EINVAL;
  663. return do_readv_writev(READ, file, vec, vlen, pos);
  664. }
  665. EXPORT_SYMBOL(vfs_readv);
  666. ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
  667. unsigned long vlen, loff_t *pos)
  668. {
  669. if (!(file->f_mode & FMODE_WRITE))
  670. return -EBADF;
  671. if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
  672. return -EINVAL;
  673. return do_readv_writev(WRITE, file, vec, vlen, pos);
  674. }
  675. EXPORT_SYMBOL(vfs_writev);
  676. SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
  677. unsigned long, vlen)
  678. {
  679. struct fd f = fdget(fd);
  680. ssize_t ret = -EBADF;
  681. if (f.file) {
  682. loff_t pos = file_pos_read(f.file);
  683. ret = vfs_readv(f.file, vec, vlen, &pos);
  684. file_pos_write(f.file, pos);
  685. fdput(f);
  686. }
  687. if (ret > 0)
  688. add_rchar(current, ret);
  689. inc_syscr(current);
  690. return ret;
  691. }
  692. SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
  693. unsigned long, vlen)
  694. {
  695. struct fd f = fdget(fd);
  696. ssize_t ret = -EBADF;
  697. if (f.file) {
  698. loff_t pos = file_pos_read(f.file);
  699. ret = vfs_writev(f.file, vec, vlen, &pos);
  700. file_pos_write(f.file, pos);
  701. fdput(f);
  702. }
  703. if (ret > 0)
  704. add_wchar(current, ret);
  705. inc_syscw(current);
  706. return ret;
  707. }
  708. static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
  709. {
  710. #define HALF_LONG_BITS (BITS_PER_LONG / 2)
  711. return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
  712. }
  713. SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
  714. unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
  715. {
  716. loff_t pos = pos_from_hilo(pos_h, pos_l);
  717. struct fd f;
  718. ssize_t ret = -EBADF;
  719. if (pos < 0)
  720. return -EINVAL;
  721. f = fdget(fd);
  722. if (f.file) {
  723. ret = -ESPIPE;
  724. if (f.file->f_mode & FMODE_PREAD)
  725. ret = vfs_readv(f.file, vec, vlen, &pos);
  726. fdput(f);
  727. }
  728. if (ret > 0)
  729. add_rchar(current, ret);
  730. inc_syscr(current);
  731. return ret;
  732. }
  733. SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
  734. unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
  735. {
  736. loff_t pos = pos_from_hilo(pos_h, pos_l);
  737. struct fd f;
  738. ssize_t ret = -EBADF;
  739. if (pos < 0)
  740. return -EINVAL;
  741. f = fdget(fd);
  742. if (f.file) {
  743. ret = -ESPIPE;
  744. if (f.file->f_mode & FMODE_PWRITE)
  745. ret = vfs_writev(f.file, vec, vlen, &pos);
  746. fdput(f);
  747. }
  748. if (ret > 0)
  749. add_wchar(current, ret);
  750. inc_syscw(current);
  751. return ret;
  752. }
  753. #ifdef CONFIG_COMPAT
  754. static ssize_t compat_do_readv_writev(int type, struct file *file,
  755. const struct compat_iovec __user *uvector,
  756. unsigned long nr_segs, loff_t *pos)
  757. {
  758. compat_ssize_t tot_len;
  759. struct iovec iovstack[UIO_FASTIOV];
  760. struct iovec *iov = iovstack;
  761. ssize_t ret;
  762. io_fn_t fn;
  763. iov_fn_t fnv;
  764. ret = -EINVAL;
  765. if (!file->f_op)
  766. goto out;
  767. ret = -EFAULT;
  768. if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
  769. goto out;
  770. ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
  771. UIO_FASTIOV, iovstack, &iov);
  772. if (ret <= 0)
  773. goto out;
  774. tot_len = ret;
  775. ret = rw_verify_area(type, file, pos, tot_len);
  776. if (ret < 0)
  777. goto out;
  778. fnv = NULL;
  779. if (type == READ) {
  780. fn = file->f_op->read;
  781. fnv = file->f_op->aio_read;
  782. } else {
  783. fn = (io_fn_t)file->f_op->write;
  784. fnv = file->f_op->aio_write;
  785. file_start_write(file);
  786. }
  787. if (fnv)
  788. ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
  789. pos, fnv);
  790. else
  791. ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
  792. if (type != READ)
  793. file_end_write(file);
  794. out:
  795. if (iov != iovstack)
  796. kfree(iov);
  797. if ((ret + (type == READ)) > 0) {
  798. if (type == READ)
  799. fsnotify_access(file);
  800. else
  801. fsnotify_modify(file);
  802. }
  803. return ret;
  804. }
  805. static size_t compat_readv(struct file *file,
  806. const struct compat_iovec __user *vec,
  807. unsigned long vlen, loff_t *pos)
  808. {
  809. ssize_t ret = -EBADF;
  810. if (!(file->f_mode & FMODE_READ))
  811. goto out;
  812. ret = -EINVAL;
  813. if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
  814. goto out;
  815. ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
  816. out:
  817. if (ret > 0)
  818. add_rchar(current, ret);
  819. inc_syscr(current);
  820. return ret;
  821. }
  822. COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd,
  823. const struct compat_iovec __user *,vec,
  824. unsigned long, vlen)
  825. {
  826. struct fd f = fdget(fd);
  827. ssize_t ret;
  828. loff_t pos;
  829. if (!f.file)
  830. return -EBADF;
  831. pos = f.file->f_pos;
  832. ret = compat_readv(f.file, vec, vlen, &pos);
  833. f.file->f_pos = pos;
  834. fdput(f);
  835. return ret;
  836. }
  837. COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
  838. const struct compat_iovec __user *,vec,
  839. unsigned long, vlen, loff_t, pos)
  840. {
  841. struct fd f;
  842. ssize_t ret;
  843. if (pos < 0)
  844. return -EINVAL;
  845. f = fdget(fd);
  846. if (!f.file)
  847. return -EBADF;
  848. ret = -ESPIPE;
  849. if (f.file->f_mode & FMODE_PREAD)
  850. ret = compat_readv(f.file, vec, vlen, &pos);
  851. fdput(f);
  852. return ret;
  853. }
  854. COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd,
  855. const struct compat_iovec __user *,vec,
  856. unsigned long, vlen, u32, pos_low, u32, pos_high)
  857. {
  858. loff_t pos = ((loff_t)pos_high << 32) | pos_low;
  859. return compat_sys_preadv64(fd, vec, vlen, pos);
  860. }
  861. static size_t compat_writev(struct file *file,
  862. const struct compat_iovec __user *vec,
  863. unsigned long vlen, loff_t *pos)
  864. {
  865. ssize_t ret = -EBADF;
  866. if (!(file->f_mode & FMODE_WRITE))
  867. goto out;
  868. ret = -EINVAL;
  869. if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
  870. goto out;
  871. ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
  872. out:
  873. if (ret > 0)
  874. add_wchar(current, ret);
  875. inc_syscw(current);
  876. return ret;
  877. }
  878. COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd,
  879. const struct compat_iovec __user *, vec,
  880. unsigned long, vlen)
  881. {
  882. struct fd f = fdget(fd);
  883. ssize_t ret;
  884. loff_t pos;
  885. if (!f.file)
  886. return -EBADF;
  887. pos = f.file->f_pos;
  888. ret = compat_writev(f.file, vec, vlen, &pos);
  889. f.file->f_pos = pos;
  890. fdput(f);
  891. return ret;
  892. }
  893. COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
  894. const struct compat_iovec __user *,vec,
  895. unsigned long, vlen, loff_t, pos)
  896. {
  897. struct fd f;
  898. ssize_t ret;
  899. if (pos < 0)
  900. return -EINVAL;
  901. f = fdget(fd);
  902. if (!f.file)
  903. return -EBADF;
  904. ret = -ESPIPE;
  905. if (f.file->f_mode & FMODE_PWRITE)
  906. ret = compat_writev(f.file, vec, vlen, &pos);
  907. fdput(f);
  908. return ret;
  909. }
  910. COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd,
  911. const struct compat_iovec __user *,vec,
  912. unsigned long, vlen, u32, pos_low, u32, pos_high)
  913. {
  914. loff_t pos = ((loff_t)pos_high << 32) | pos_low;
  915. return compat_sys_pwritev64(fd, vec, vlen, pos);
  916. }
  917. #endif
  918. static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
  919. size_t count, loff_t max)
  920. {
  921. struct fd in, out;
  922. struct inode *in_inode, *out_inode;
  923. loff_t pos;
  924. loff_t out_pos;
  925. ssize_t retval;
  926. int fl;
  927. /*
  928. * Get input file, and verify that it is ok..
  929. */
  930. retval = -EBADF;
  931. in = fdget(in_fd);
  932. if (!in.file)
  933. goto out;
  934. if (!(in.file->f_mode & FMODE_READ))
  935. goto fput_in;
  936. retval = -ESPIPE;
  937. if (!ppos) {
  938. pos = in.file->f_pos;
  939. } else {
  940. pos = *ppos;
  941. if (!(in.file->f_mode & FMODE_PREAD))
  942. goto fput_in;
  943. }
  944. retval = rw_verify_area(READ, in.file, &pos, count);
  945. if (retval < 0)
  946. goto fput_in;
  947. count = retval;
  948. /*
  949. * Get output file, and verify that it is ok..
  950. */
  951. retval = -EBADF;
  952. out = fdget(out_fd);
  953. if (!out.file)
  954. goto fput_in;
  955. if (!(out.file->f_mode & FMODE_WRITE))
  956. goto fput_out;
  957. retval = -EINVAL;
  958. in_inode = file_inode(in.file);
  959. out_inode = file_inode(out.file);
  960. out_pos = out.file->f_pos;
  961. retval = rw_verify_area(WRITE, out.file, &out_pos, count);
  962. if (retval < 0)
  963. goto fput_out;
  964. count = retval;
  965. if (!max)
  966. max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
  967. if (unlikely(pos + count > max)) {
  968. retval = -EOVERFLOW;
  969. if (pos >= max)
  970. goto fput_out;
  971. count = max - pos;
  972. }
  973. fl = 0;
  974. #if 0
  975. /*
  976. * We need to debate whether we can enable this or not. The
  977. * man page documents EAGAIN return for the output at least,
  978. * and the application is arguably buggy if it doesn't expect
  979. * EAGAIN on a non-blocking file descriptor.
  980. */
  981. if (in.file->f_flags & O_NONBLOCK)
  982. fl = SPLICE_F_NONBLOCK;
  983. #endif
  984. retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
  985. if (retval > 0) {
  986. add_rchar(current, retval);
  987. add_wchar(current, retval);
  988. fsnotify_access(in.file);
  989. fsnotify_modify(out.file);
  990. out.file->f_pos = out_pos;
  991. if (ppos)
  992. *ppos = pos;
  993. else
  994. in.file->f_pos = pos;
  995. }
  996. inc_syscr(current);
  997. inc_syscw(current);
  998. if (pos > max)
  999. retval = -EOVERFLOW;
  1000. fput_out:
  1001. fdput(out);
  1002. fput_in:
  1003. fdput(in);
  1004. out:
  1005. return retval;
  1006. }
  1007. SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
  1008. {
  1009. loff_t pos;
  1010. off_t off;
  1011. ssize_t ret;
  1012. if (offset) {
  1013. if (unlikely(get_user(off, offset)))
  1014. return -EFAULT;
  1015. pos = off;
  1016. ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
  1017. if (unlikely(put_user(pos, offset)))
  1018. return -EFAULT;
  1019. return ret;
  1020. }
  1021. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1022. }
  1023. SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
  1024. {
  1025. loff_t pos;
  1026. ssize_t ret;
  1027. if (offset) {
  1028. if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
  1029. return -EFAULT;
  1030. ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
  1031. if (unlikely(put_user(pos, offset)))
  1032. return -EFAULT;
  1033. return ret;
  1034. }
  1035. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1036. }
  1037. #ifdef CONFIG_COMPAT
  1038. COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
  1039. compat_off_t __user *, offset, compat_size_t, count)
  1040. {
  1041. loff_t pos;
  1042. off_t off;
  1043. ssize_t ret;
  1044. if (offset) {
  1045. if (unlikely(get_user(off, offset)))
  1046. return -EFAULT;
  1047. pos = off;
  1048. ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
  1049. if (unlikely(put_user(pos, offset)))
  1050. return -EFAULT;
  1051. return ret;
  1052. }
  1053. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1054. }
  1055. COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
  1056. compat_loff_t __user *, offset, compat_size_t, count)
  1057. {
  1058. loff_t pos;
  1059. ssize_t ret;
  1060. if (offset) {
  1061. if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
  1062. return -EFAULT;
  1063. ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
  1064. if (unlikely(put_user(pos, offset)))
  1065. return -EFAULT;
  1066. return ret;
  1067. }
  1068. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1069. }
  1070. #endif