read_write.c 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241
  1. /*
  2. * linux/fs/read_write.c
  3. *
  4. * Copyright (C) 1991, 1992 Linus Torvalds
  5. */
  6. #include <linux/slab.h>
  7. #include <linux/stat.h>
  8. #include <linux/fcntl.h>
  9. #include <linux/file.h>
  10. #include <linux/uio.h>
  11. #include <linux/aio.h>
  12. #include <linux/fsnotify.h>
  13. #include <linux/security.h>
  14. #include <linux/export.h>
  15. #include <linux/syscalls.h>
  16. #include <linux/pagemap.h>
  17. #include <linux/splice.h>
  18. #include <linux/compat.h>
  19. #include "internal.h"
  20. #include <asm/uaccess.h>
  21. #include <asm/unistd.h>
  22. typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
  23. typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
  24. unsigned long, loff_t);
  25. const struct file_operations generic_ro_fops = {
  26. .llseek = generic_file_llseek,
  27. .read = do_sync_read,
  28. .aio_read = generic_file_aio_read,
  29. .mmap = generic_file_readonly_mmap,
  30. .splice_read = generic_file_splice_read,
  31. };
  32. EXPORT_SYMBOL(generic_ro_fops);
  33. static inline int unsigned_offsets(struct file *file)
  34. {
  35. return file->f_mode & FMODE_UNSIGNED_OFFSET;
  36. }
  37. static loff_t lseek_execute(struct file *file, struct inode *inode,
  38. loff_t offset, loff_t maxsize)
  39. {
  40. if (offset < 0 && !unsigned_offsets(file))
  41. return -EINVAL;
  42. if (offset > maxsize)
  43. return -EINVAL;
  44. if (offset != file->f_pos) {
  45. file->f_pos = offset;
  46. file->f_version = 0;
  47. }
  48. return offset;
  49. }
  50. /**
  51. * generic_file_llseek_size - generic llseek implementation for regular files
  52. * @file: file structure to seek on
  53. * @offset: file offset to seek to
  54. * @whence: type of seek
  55. * @size: max size of this file in file system
  56. * @eof: offset used for SEEK_END position
  57. *
  58. * This is a variant of generic_file_llseek that allows passing in a custom
  59. * maximum file size and a custom EOF position, for e.g. hashed directories
  60. *
  61. * Synchronization:
  62. * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  63. * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  64. * read/writes behave like SEEK_SET against seeks.
  65. */
  66. loff_t
  67. generic_file_llseek_size(struct file *file, loff_t offset, int whence,
  68. loff_t maxsize, loff_t eof)
  69. {
  70. struct inode *inode = file->f_mapping->host;
  71. switch (whence) {
  72. case SEEK_END:
  73. offset += eof;
  74. break;
  75. case SEEK_CUR:
  76. /*
  77. * Here we special-case the lseek(fd, 0, SEEK_CUR)
  78. * position-querying operation. Avoid rewriting the "same"
  79. * f_pos value back to the file because a concurrent read(),
  80. * write() or lseek() might have altered it
  81. */
  82. if (offset == 0)
  83. return file->f_pos;
  84. /*
  85. * f_lock protects against read/modify/write race with other
  86. * SEEK_CURs. Note that parallel writes and reads behave
  87. * like SEEK_SET.
  88. */
  89. spin_lock(&file->f_lock);
  90. offset = lseek_execute(file, inode, file->f_pos + offset,
  91. maxsize);
  92. spin_unlock(&file->f_lock);
  93. return offset;
  94. case SEEK_DATA:
  95. /*
  96. * In the generic case the entire file is data, so as long as
  97. * offset isn't at the end of the file then the offset is data.
  98. */
  99. if (offset >= eof)
  100. return -ENXIO;
  101. break;
  102. case SEEK_HOLE:
  103. /*
  104. * There is a virtual hole at the end of the file, so as long as
  105. * offset isn't i_size or larger, return i_size.
  106. */
  107. if (offset >= eof)
  108. return -ENXIO;
  109. offset = eof;
  110. break;
  111. }
  112. return lseek_execute(file, inode, offset, maxsize);
  113. }
  114. EXPORT_SYMBOL(generic_file_llseek_size);
  115. /**
  116. * generic_file_llseek - generic llseek implementation for regular files
  117. * @file: file structure to seek on
  118. * @offset: file offset to seek to
  119. * @whence: type of seek
  120. *
  121. * This is a generic implemenation of ->llseek useable for all normal local
  122. * filesystems. It just updates the file offset to the value specified by
  123. * @offset and @whence.
  124. */
  125. loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
  126. {
  127. struct inode *inode = file->f_mapping->host;
  128. return generic_file_llseek_size(file, offset, whence,
  129. inode->i_sb->s_maxbytes,
  130. i_size_read(inode));
  131. }
  132. EXPORT_SYMBOL(generic_file_llseek);
  133. /**
  134. * noop_llseek - No Operation Performed llseek implementation
  135. * @file: file structure to seek on
  136. * @offset: file offset to seek to
  137. * @whence: type of seek
  138. *
  139. * This is an implementation of ->llseek useable for the rare special case when
  140. * userspace expects the seek to succeed but the (device) file is actually not
  141. * able to perform the seek. In this case you use noop_llseek() instead of
  142. * falling back to the default implementation of ->llseek.
  143. */
  144. loff_t noop_llseek(struct file *file, loff_t offset, int whence)
  145. {
  146. return file->f_pos;
  147. }
  148. EXPORT_SYMBOL(noop_llseek);
  149. loff_t no_llseek(struct file *file, loff_t offset, int whence)
  150. {
  151. return -ESPIPE;
  152. }
  153. EXPORT_SYMBOL(no_llseek);
  154. loff_t default_llseek(struct file *file, loff_t offset, int whence)
  155. {
  156. struct inode *inode = file_inode(file);
  157. loff_t retval;
  158. mutex_lock(&inode->i_mutex);
  159. switch (whence) {
  160. case SEEK_END:
  161. offset += i_size_read(inode);
  162. break;
  163. case SEEK_CUR:
  164. if (offset == 0) {
  165. retval = file->f_pos;
  166. goto out;
  167. }
  168. offset += file->f_pos;
  169. break;
  170. case SEEK_DATA:
  171. /*
  172. * In the generic case the entire file is data, so as
  173. * long as offset isn't at the end of the file then the
  174. * offset is data.
  175. */
  176. if (offset >= inode->i_size) {
  177. retval = -ENXIO;
  178. goto out;
  179. }
  180. break;
  181. case SEEK_HOLE:
  182. /*
  183. * There is a virtual hole at the end of the file, so
  184. * as long as offset isn't i_size or larger, return
  185. * i_size.
  186. */
  187. if (offset >= inode->i_size) {
  188. retval = -ENXIO;
  189. goto out;
  190. }
  191. offset = inode->i_size;
  192. break;
  193. }
  194. retval = -EINVAL;
  195. if (offset >= 0 || unsigned_offsets(file)) {
  196. if (offset != file->f_pos) {
  197. file->f_pos = offset;
  198. file->f_version = 0;
  199. }
  200. retval = offset;
  201. }
  202. out:
  203. mutex_unlock(&inode->i_mutex);
  204. return retval;
  205. }
  206. EXPORT_SYMBOL(default_llseek);
  207. loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
  208. {
  209. loff_t (*fn)(struct file *, loff_t, int);
  210. fn = no_llseek;
  211. if (file->f_mode & FMODE_LSEEK) {
  212. if (file->f_op && file->f_op->llseek)
  213. fn = file->f_op->llseek;
  214. }
  215. return fn(file, offset, whence);
  216. }
  217. EXPORT_SYMBOL(vfs_llseek);
  218. SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
  219. {
  220. off_t retval;
  221. struct fd f = fdget(fd);
  222. if (!f.file)
  223. return -EBADF;
  224. retval = -EINVAL;
  225. if (whence <= SEEK_MAX) {
  226. loff_t res = vfs_llseek(f.file, offset, whence);
  227. retval = res;
  228. if (res != (loff_t)retval)
  229. retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
  230. }
  231. fdput(f);
  232. return retval;
  233. }
  234. #ifdef CONFIG_COMPAT
  235. COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
  236. {
  237. return sys_lseek(fd, offset, whence);
  238. }
  239. #endif
  240. #ifdef __ARCH_WANT_SYS_LLSEEK
  241. SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
  242. unsigned long, offset_low, loff_t __user *, result,
  243. unsigned int, whence)
  244. {
  245. int retval;
  246. struct fd f = fdget(fd);
  247. loff_t offset;
  248. if (!f.file)
  249. return -EBADF;
  250. retval = -EINVAL;
  251. if (whence > SEEK_MAX)
  252. goto out_putf;
  253. offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
  254. whence);
  255. retval = (int)offset;
  256. if (offset >= 0) {
  257. retval = -EFAULT;
  258. if (!copy_to_user(result, &offset, sizeof(offset)))
  259. retval = 0;
  260. }
  261. out_putf:
  262. fdput(f);
  263. return retval;
  264. }
  265. #endif
  266. /*
  267. * rw_verify_area doesn't like huge counts. We limit
  268. * them to something that fits in "int" so that others
  269. * won't have to do range checks all the time.
  270. */
  271. int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
  272. {
  273. struct inode *inode;
  274. loff_t pos;
  275. int retval = -EINVAL;
  276. inode = file_inode(file);
  277. if (unlikely((ssize_t) count < 0))
  278. return retval;
  279. pos = *ppos;
  280. if (unlikely(pos < 0)) {
  281. if (!unsigned_offsets(file))
  282. return retval;
  283. if (count >= -pos) /* both values are in 0..LLONG_MAX */
  284. return -EOVERFLOW;
  285. } else if (unlikely((loff_t) (pos + count) < 0)) {
  286. if (!unsigned_offsets(file))
  287. return retval;
  288. }
  289. if (unlikely(inode->i_flock && mandatory_lock(inode))) {
  290. retval = locks_mandatory_area(
  291. read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
  292. inode, file, pos, count);
  293. if (retval < 0)
  294. return retval;
  295. }
  296. retval = security_file_permission(file,
  297. read_write == READ ? MAY_READ : MAY_WRITE);
  298. if (retval)
  299. return retval;
  300. return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
  301. }
  302. ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
  303. {
  304. struct iovec iov = { .iov_base = buf, .iov_len = len };
  305. struct kiocb kiocb;
  306. ssize_t ret;
  307. init_sync_kiocb(&kiocb, filp);
  308. kiocb.ki_pos = *ppos;
  309. kiocb.ki_left = len;
  310. kiocb.ki_nbytes = len;
  311. ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
  312. if (-EIOCBQUEUED == ret)
  313. ret = wait_on_sync_kiocb(&kiocb);
  314. *ppos = kiocb.ki_pos;
  315. return ret;
  316. }
  317. EXPORT_SYMBOL(do_sync_read);
  318. ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
  319. {
  320. ssize_t ret;
  321. if (!(file->f_mode & FMODE_READ))
  322. return -EBADF;
  323. if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
  324. return -EINVAL;
  325. if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
  326. return -EFAULT;
  327. ret = rw_verify_area(READ, file, pos, count);
  328. if (ret >= 0) {
  329. count = ret;
  330. if (file->f_op->read)
  331. ret = file->f_op->read(file, buf, count, pos);
  332. else
  333. ret = do_sync_read(file, buf, count, pos);
  334. if (ret > 0) {
  335. fsnotify_access(file);
  336. add_rchar(current, ret);
  337. }
  338. inc_syscr(current);
  339. }
  340. return ret;
  341. }
  342. EXPORT_SYMBOL(vfs_read);
  343. ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
  344. {
  345. struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
  346. struct kiocb kiocb;
  347. ssize_t ret;
  348. init_sync_kiocb(&kiocb, filp);
  349. kiocb.ki_pos = *ppos;
  350. kiocb.ki_left = len;
  351. kiocb.ki_nbytes = len;
  352. ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
  353. if (-EIOCBQUEUED == ret)
  354. ret = wait_on_sync_kiocb(&kiocb);
  355. *ppos = kiocb.ki_pos;
  356. return ret;
  357. }
  358. EXPORT_SYMBOL(do_sync_write);
  359. ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
  360. {
  361. mm_segment_t old_fs;
  362. const char __user *p;
  363. ssize_t ret;
  364. if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
  365. return -EINVAL;
  366. old_fs = get_fs();
  367. set_fs(get_ds());
  368. p = (__force const char __user *)buf;
  369. if (count > MAX_RW_COUNT)
  370. count = MAX_RW_COUNT;
  371. if (file->f_op->write)
  372. ret = file->f_op->write(file, p, count, pos);
  373. else
  374. ret = do_sync_write(file, p, count, pos);
  375. set_fs(old_fs);
  376. if (ret > 0) {
  377. fsnotify_modify(file);
  378. add_wchar(current, ret);
  379. }
  380. inc_syscw(current);
  381. return ret;
  382. }
  383. ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
  384. {
  385. ssize_t ret;
  386. if (!(file->f_mode & FMODE_WRITE))
  387. return -EBADF;
  388. if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
  389. return -EINVAL;
  390. if (unlikely(!access_ok(VERIFY_READ, buf, count)))
  391. return -EFAULT;
  392. ret = rw_verify_area(WRITE, file, pos, count);
  393. if (ret >= 0) {
  394. count = ret;
  395. file_start_write(file);
  396. if (file->f_op->write)
  397. ret = file->f_op->write(file, buf, count, pos);
  398. else
  399. ret = do_sync_write(file, buf, count, pos);
  400. if (ret > 0) {
  401. fsnotify_modify(file);
  402. add_wchar(current, ret);
  403. }
  404. inc_syscw(current);
  405. file_end_write(file);
  406. }
  407. return ret;
  408. }
  409. EXPORT_SYMBOL(vfs_write);
  410. static inline loff_t file_pos_read(struct file *file)
  411. {
  412. return file->f_pos;
  413. }
  414. static inline void file_pos_write(struct file *file, loff_t pos)
  415. {
  416. file->f_pos = pos;
  417. }
  418. SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
  419. {
  420. struct fd f = fdget(fd);
  421. ssize_t ret = -EBADF;
  422. if (f.file) {
  423. loff_t pos = file_pos_read(f.file);
  424. ret = vfs_read(f.file, buf, count, &pos);
  425. if (ret >= 0)
  426. file_pos_write(f.file, pos);
  427. fdput(f);
  428. }
  429. return ret;
  430. }
  431. SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
  432. size_t, count)
  433. {
  434. struct fd f = fdget(fd);
  435. ssize_t ret = -EBADF;
  436. if (f.file) {
  437. loff_t pos = file_pos_read(f.file);
  438. ret = vfs_write(f.file, buf, count, &pos);
  439. if (ret >= 0)
  440. file_pos_write(f.file, pos);
  441. fdput(f);
  442. }
  443. return ret;
  444. }
  445. SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
  446. size_t, count, loff_t, pos)
  447. {
  448. struct fd f;
  449. ssize_t ret = -EBADF;
  450. if (pos < 0)
  451. return -EINVAL;
  452. f = fdget(fd);
  453. if (f.file) {
  454. ret = -ESPIPE;
  455. if (f.file->f_mode & FMODE_PREAD)
  456. ret = vfs_read(f.file, buf, count, &pos);
  457. fdput(f);
  458. }
  459. return ret;
  460. }
  461. SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
  462. size_t, count, loff_t, pos)
  463. {
  464. struct fd f;
  465. ssize_t ret = -EBADF;
  466. if (pos < 0)
  467. return -EINVAL;
  468. f = fdget(fd);
  469. if (f.file) {
  470. ret = -ESPIPE;
  471. if (f.file->f_mode & FMODE_PWRITE)
  472. ret = vfs_write(f.file, buf, count, &pos);
  473. fdput(f);
  474. }
  475. return ret;
  476. }
  477. /*
  478. * Reduce an iovec's length in-place. Return the resulting number of segments
  479. */
  480. unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
  481. {
  482. unsigned long seg = 0;
  483. size_t len = 0;
  484. while (seg < nr_segs) {
  485. seg++;
  486. if (len + iov->iov_len >= to) {
  487. iov->iov_len = to - len;
  488. break;
  489. }
  490. len += iov->iov_len;
  491. iov++;
  492. }
  493. return seg;
  494. }
  495. EXPORT_SYMBOL(iov_shorten);
  496. static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
  497. unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
  498. {
  499. struct kiocb kiocb;
  500. ssize_t ret;
  501. init_sync_kiocb(&kiocb, filp);
  502. kiocb.ki_pos = *ppos;
  503. kiocb.ki_left = len;
  504. kiocb.ki_nbytes = len;
  505. ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
  506. if (ret == -EIOCBQUEUED)
  507. ret = wait_on_sync_kiocb(&kiocb);
  508. *ppos = kiocb.ki_pos;
  509. return ret;
  510. }
  511. /* Do it by hand, with file-ops */
  512. static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
  513. unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
  514. {
  515. struct iovec *vector = iov;
  516. ssize_t ret = 0;
  517. while (nr_segs > 0) {
  518. void __user *base;
  519. size_t len;
  520. ssize_t nr;
  521. base = vector->iov_base;
  522. len = vector->iov_len;
  523. vector++;
  524. nr_segs--;
  525. nr = fn(filp, base, len, ppos);
  526. if (nr < 0) {
  527. if (!ret)
  528. ret = nr;
  529. break;
  530. }
  531. ret += nr;
  532. if (nr != len)
  533. break;
  534. }
  535. return ret;
  536. }
  537. /* A write operation does a read from user space and vice versa */
  538. #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
  539. ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
  540. unsigned long nr_segs, unsigned long fast_segs,
  541. struct iovec *fast_pointer,
  542. struct iovec **ret_pointer)
  543. {
  544. unsigned long seg;
  545. ssize_t ret;
  546. struct iovec *iov = fast_pointer;
  547. /*
  548. * SuS says "The readv() function *may* fail if the iovcnt argument
  549. * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
  550. * traditionally returned zero for zero segments, so...
  551. */
  552. if (nr_segs == 0) {
  553. ret = 0;
  554. goto out;
  555. }
  556. /*
  557. * First get the "struct iovec" from user memory and
  558. * verify all the pointers
  559. */
  560. if (nr_segs > UIO_MAXIOV) {
  561. ret = -EINVAL;
  562. goto out;
  563. }
  564. if (nr_segs > fast_segs) {
  565. iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
  566. if (iov == NULL) {
  567. ret = -ENOMEM;
  568. goto out;
  569. }
  570. }
  571. if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
  572. ret = -EFAULT;
  573. goto out;
  574. }
  575. /*
  576. * According to the Single Unix Specification we should return EINVAL
  577. * if an element length is < 0 when cast to ssize_t or if the
  578. * total length would overflow the ssize_t return value of the
  579. * system call.
  580. *
  581. * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
  582. * overflow case.
  583. */
  584. ret = 0;
  585. for (seg = 0; seg < nr_segs; seg++) {
  586. void __user *buf = iov[seg].iov_base;
  587. ssize_t len = (ssize_t)iov[seg].iov_len;
  588. /* see if we we're about to use an invalid len or if
  589. * it's about to overflow ssize_t */
  590. if (len < 0) {
  591. ret = -EINVAL;
  592. goto out;
  593. }
  594. if (type >= 0
  595. && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
  596. ret = -EFAULT;
  597. goto out;
  598. }
  599. if (len > MAX_RW_COUNT - ret) {
  600. len = MAX_RW_COUNT - ret;
  601. iov[seg].iov_len = len;
  602. }
  603. ret += len;
  604. }
  605. out:
  606. *ret_pointer = iov;
  607. return ret;
  608. }
  609. static ssize_t do_readv_writev(int type, struct file *file,
  610. const struct iovec __user * uvector,
  611. unsigned long nr_segs, loff_t *pos)
  612. {
  613. size_t tot_len;
  614. struct iovec iovstack[UIO_FASTIOV];
  615. struct iovec *iov = iovstack;
  616. ssize_t ret;
  617. io_fn_t fn;
  618. iov_fn_t fnv;
  619. if (!file->f_op) {
  620. ret = -EINVAL;
  621. goto out;
  622. }
  623. ret = rw_copy_check_uvector(type, uvector, nr_segs,
  624. ARRAY_SIZE(iovstack), iovstack, &iov);
  625. if (ret <= 0)
  626. goto out;
  627. tot_len = ret;
  628. ret = rw_verify_area(type, file, pos, tot_len);
  629. if (ret < 0)
  630. goto out;
  631. fnv = NULL;
  632. if (type == READ) {
  633. fn = file->f_op->read;
  634. fnv = file->f_op->aio_read;
  635. } else {
  636. fn = (io_fn_t)file->f_op->write;
  637. fnv = file->f_op->aio_write;
  638. file_start_write(file);
  639. }
  640. if (fnv)
  641. ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
  642. pos, fnv);
  643. else
  644. ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
  645. if (type != READ)
  646. file_end_write(file);
  647. out:
  648. if (iov != iovstack)
  649. kfree(iov);
  650. if ((ret + (type == READ)) > 0) {
  651. if (type == READ)
  652. fsnotify_access(file);
  653. else
  654. fsnotify_modify(file);
  655. }
  656. return ret;
  657. }
  658. ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
  659. unsigned long vlen, loff_t *pos)
  660. {
  661. if (!(file->f_mode & FMODE_READ))
  662. return -EBADF;
  663. if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
  664. return -EINVAL;
  665. return do_readv_writev(READ, file, vec, vlen, pos);
  666. }
  667. EXPORT_SYMBOL(vfs_readv);
  668. ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
  669. unsigned long vlen, loff_t *pos)
  670. {
  671. if (!(file->f_mode & FMODE_WRITE))
  672. return -EBADF;
  673. if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
  674. return -EINVAL;
  675. return do_readv_writev(WRITE, file, vec, vlen, pos);
  676. }
  677. EXPORT_SYMBOL(vfs_writev);
  678. SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
  679. unsigned long, vlen)
  680. {
  681. struct fd f = fdget(fd);
  682. ssize_t ret = -EBADF;
  683. if (f.file) {
  684. loff_t pos = file_pos_read(f.file);
  685. ret = vfs_readv(f.file, vec, vlen, &pos);
  686. if (ret >= 0)
  687. file_pos_write(f.file, pos);
  688. fdput(f);
  689. }
  690. if (ret > 0)
  691. add_rchar(current, ret);
  692. inc_syscr(current);
  693. return ret;
  694. }
  695. SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
  696. unsigned long, vlen)
  697. {
  698. struct fd f = fdget(fd);
  699. ssize_t ret = -EBADF;
  700. if (f.file) {
  701. loff_t pos = file_pos_read(f.file);
  702. ret = vfs_writev(f.file, vec, vlen, &pos);
  703. if (ret >= 0)
  704. file_pos_write(f.file, pos);
  705. fdput(f);
  706. }
  707. if (ret > 0)
  708. add_wchar(current, ret);
  709. inc_syscw(current);
  710. return ret;
  711. }
  712. static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
  713. {
  714. #define HALF_LONG_BITS (BITS_PER_LONG / 2)
  715. return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
  716. }
  717. SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
  718. unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
  719. {
  720. loff_t pos = pos_from_hilo(pos_h, pos_l);
  721. struct fd f;
  722. ssize_t ret = -EBADF;
  723. if (pos < 0)
  724. return -EINVAL;
  725. f = fdget(fd);
  726. if (f.file) {
  727. ret = -ESPIPE;
  728. if (f.file->f_mode & FMODE_PREAD)
  729. ret = vfs_readv(f.file, vec, vlen, &pos);
  730. fdput(f);
  731. }
  732. if (ret > 0)
  733. add_rchar(current, ret);
  734. inc_syscr(current);
  735. return ret;
  736. }
  737. SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
  738. unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
  739. {
  740. loff_t pos = pos_from_hilo(pos_h, pos_l);
  741. struct fd f;
  742. ssize_t ret = -EBADF;
  743. if (pos < 0)
  744. return -EINVAL;
  745. f = fdget(fd);
  746. if (f.file) {
  747. ret = -ESPIPE;
  748. if (f.file->f_mode & FMODE_PWRITE)
  749. ret = vfs_writev(f.file, vec, vlen, &pos);
  750. fdput(f);
  751. }
  752. if (ret > 0)
  753. add_wchar(current, ret);
  754. inc_syscw(current);
  755. return ret;
  756. }
  757. #ifdef CONFIG_COMPAT
  758. static ssize_t compat_do_readv_writev(int type, struct file *file,
  759. const struct compat_iovec __user *uvector,
  760. unsigned long nr_segs, loff_t *pos)
  761. {
  762. compat_ssize_t tot_len;
  763. struct iovec iovstack[UIO_FASTIOV];
  764. struct iovec *iov = iovstack;
  765. ssize_t ret;
  766. io_fn_t fn;
  767. iov_fn_t fnv;
  768. ret = -EINVAL;
  769. if (!file->f_op)
  770. goto out;
  771. ret = -EFAULT;
  772. if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
  773. goto out;
  774. ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
  775. UIO_FASTIOV, iovstack, &iov);
  776. if (ret <= 0)
  777. goto out;
  778. tot_len = ret;
  779. ret = rw_verify_area(type, file, pos, tot_len);
  780. if (ret < 0)
  781. goto out;
  782. fnv = NULL;
  783. if (type == READ) {
  784. fn = file->f_op->read;
  785. fnv = file->f_op->aio_read;
  786. } else {
  787. fn = (io_fn_t)file->f_op->write;
  788. fnv = file->f_op->aio_write;
  789. file_start_write(file);
  790. }
  791. if (fnv)
  792. ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
  793. pos, fnv);
  794. else
  795. ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
  796. if (type != READ)
  797. file_end_write(file);
  798. out:
  799. if (iov != iovstack)
  800. kfree(iov);
  801. if ((ret + (type == READ)) > 0) {
  802. if (type == READ)
  803. fsnotify_access(file);
  804. else
  805. fsnotify_modify(file);
  806. }
  807. return ret;
  808. }
  809. static size_t compat_readv(struct file *file,
  810. const struct compat_iovec __user *vec,
  811. unsigned long vlen, loff_t *pos)
  812. {
  813. ssize_t ret = -EBADF;
  814. if (!(file->f_mode & FMODE_READ))
  815. goto out;
  816. ret = -EINVAL;
  817. if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
  818. goto out;
  819. ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
  820. out:
  821. if (ret > 0)
  822. add_rchar(current, ret);
  823. inc_syscr(current);
  824. return ret;
  825. }
  826. COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd,
  827. const struct compat_iovec __user *,vec,
  828. unsigned long, vlen)
  829. {
  830. struct fd f = fdget(fd);
  831. ssize_t ret;
  832. loff_t pos;
  833. if (!f.file)
  834. return -EBADF;
  835. pos = f.file->f_pos;
  836. ret = compat_readv(f.file, vec, vlen, &pos);
  837. if (ret >= 0)
  838. f.file->f_pos = pos;
  839. fdput(f);
  840. return ret;
  841. }
  842. COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
  843. const struct compat_iovec __user *,vec,
  844. unsigned long, vlen, loff_t, pos)
  845. {
  846. struct fd f;
  847. ssize_t ret;
  848. if (pos < 0)
  849. return -EINVAL;
  850. f = fdget(fd);
  851. if (!f.file)
  852. return -EBADF;
  853. ret = -ESPIPE;
  854. if (f.file->f_mode & FMODE_PREAD)
  855. ret = compat_readv(f.file, vec, vlen, &pos);
  856. fdput(f);
  857. return ret;
  858. }
  859. COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd,
  860. const struct compat_iovec __user *,vec,
  861. unsigned long, vlen, u32, pos_low, u32, pos_high)
  862. {
  863. loff_t pos = ((loff_t)pos_high << 32) | pos_low;
  864. return compat_sys_preadv64(fd, vec, vlen, pos);
  865. }
  866. static size_t compat_writev(struct file *file,
  867. const struct compat_iovec __user *vec,
  868. unsigned long vlen, loff_t *pos)
  869. {
  870. ssize_t ret = -EBADF;
  871. if (!(file->f_mode & FMODE_WRITE))
  872. goto out;
  873. ret = -EINVAL;
  874. if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
  875. goto out;
  876. ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
  877. out:
  878. if (ret > 0)
  879. add_wchar(current, ret);
  880. inc_syscw(current);
  881. return ret;
  882. }
  883. COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd,
  884. const struct compat_iovec __user *, vec,
  885. unsigned long, vlen)
  886. {
  887. struct fd f = fdget(fd);
  888. ssize_t ret;
  889. loff_t pos;
  890. if (!f.file)
  891. return -EBADF;
  892. pos = f.file->f_pos;
  893. ret = compat_writev(f.file, vec, vlen, &pos);
  894. if (ret >= 0)
  895. f.file->f_pos = pos;
  896. fdput(f);
  897. return ret;
  898. }
  899. COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
  900. const struct compat_iovec __user *,vec,
  901. unsigned long, vlen, loff_t, pos)
  902. {
  903. struct fd f;
  904. ssize_t ret;
  905. if (pos < 0)
  906. return -EINVAL;
  907. f = fdget(fd);
  908. if (!f.file)
  909. return -EBADF;
  910. ret = -ESPIPE;
  911. if (f.file->f_mode & FMODE_PWRITE)
  912. ret = compat_writev(f.file, vec, vlen, &pos);
  913. fdput(f);
  914. return ret;
  915. }
  916. COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd,
  917. const struct compat_iovec __user *,vec,
  918. unsigned long, vlen, u32, pos_low, u32, pos_high)
  919. {
  920. loff_t pos = ((loff_t)pos_high << 32) | pos_low;
  921. return compat_sys_pwritev64(fd, vec, vlen, pos);
  922. }
  923. #endif
  924. static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
  925. size_t count, loff_t max)
  926. {
  927. struct fd in, out;
  928. struct inode *in_inode, *out_inode;
  929. loff_t pos;
  930. loff_t out_pos;
  931. ssize_t retval;
  932. int fl;
  933. /*
  934. * Get input file, and verify that it is ok..
  935. */
  936. retval = -EBADF;
  937. in = fdget(in_fd);
  938. if (!in.file)
  939. goto out;
  940. if (!(in.file->f_mode & FMODE_READ))
  941. goto fput_in;
  942. retval = -ESPIPE;
  943. if (!ppos) {
  944. pos = in.file->f_pos;
  945. } else {
  946. pos = *ppos;
  947. if (!(in.file->f_mode & FMODE_PREAD))
  948. goto fput_in;
  949. }
  950. retval = rw_verify_area(READ, in.file, &pos, count);
  951. if (retval < 0)
  952. goto fput_in;
  953. count = retval;
  954. /*
  955. * Get output file, and verify that it is ok..
  956. */
  957. retval = -EBADF;
  958. out = fdget(out_fd);
  959. if (!out.file)
  960. goto fput_in;
  961. if (!(out.file->f_mode & FMODE_WRITE))
  962. goto fput_out;
  963. retval = -EINVAL;
  964. in_inode = file_inode(in.file);
  965. out_inode = file_inode(out.file);
  966. out_pos = out.file->f_pos;
  967. retval = rw_verify_area(WRITE, out.file, &out_pos, count);
  968. if (retval < 0)
  969. goto fput_out;
  970. count = retval;
  971. if (!max)
  972. max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
  973. if (unlikely(pos + count > max)) {
  974. retval = -EOVERFLOW;
  975. if (pos >= max)
  976. goto fput_out;
  977. count = max - pos;
  978. }
  979. fl = 0;
  980. #if 0
  981. /*
  982. * We need to debate whether we can enable this or not. The
  983. * man page documents EAGAIN return for the output at least,
  984. * and the application is arguably buggy if it doesn't expect
  985. * EAGAIN on a non-blocking file descriptor.
  986. */
  987. if (in.file->f_flags & O_NONBLOCK)
  988. fl = SPLICE_F_NONBLOCK;
  989. #endif
  990. file_start_write(out.file);
  991. retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
  992. file_end_write(out.file);
  993. if (retval > 0) {
  994. add_rchar(current, retval);
  995. add_wchar(current, retval);
  996. fsnotify_access(in.file);
  997. fsnotify_modify(out.file);
  998. out.file->f_pos = out_pos;
  999. if (ppos)
  1000. *ppos = pos;
  1001. else
  1002. in.file->f_pos = pos;
  1003. }
  1004. inc_syscr(current);
  1005. inc_syscw(current);
  1006. if (pos > max)
  1007. retval = -EOVERFLOW;
  1008. fput_out:
  1009. fdput(out);
  1010. fput_in:
  1011. fdput(in);
  1012. out:
  1013. return retval;
  1014. }
  1015. SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
  1016. {
  1017. loff_t pos;
  1018. off_t off;
  1019. ssize_t ret;
  1020. if (offset) {
  1021. if (unlikely(get_user(off, offset)))
  1022. return -EFAULT;
  1023. pos = off;
  1024. ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
  1025. if (unlikely(put_user(pos, offset)))
  1026. return -EFAULT;
  1027. return ret;
  1028. }
  1029. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1030. }
  1031. SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
  1032. {
  1033. loff_t pos;
  1034. ssize_t ret;
  1035. if (offset) {
  1036. if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
  1037. return -EFAULT;
  1038. ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
  1039. if (unlikely(put_user(pos, offset)))
  1040. return -EFAULT;
  1041. return ret;
  1042. }
  1043. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1044. }
  1045. #ifdef CONFIG_COMPAT
  1046. COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
  1047. compat_off_t __user *, offset, compat_size_t, count)
  1048. {
  1049. loff_t pos;
  1050. off_t off;
  1051. ssize_t ret;
  1052. if (offset) {
  1053. if (unlikely(get_user(off, offset)))
  1054. return -EFAULT;
  1055. pos = off;
  1056. ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
  1057. if (unlikely(put_user(pos, offset)))
  1058. return -EFAULT;
  1059. return ret;
  1060. }
  1061. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1062. }
  1063. COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
  1064. compat_loff_t __user *, offset, compat_size_t, count)
  1065. {
  1066. loff_t pos;
  1067. ssize_t ret;
  1068. if (offset) {
  1069. if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
  1070. return -EFAULT;
  1071. ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
  1072. if (unlikely(put_user(pos, offset)))
  1073. return -EFAULT;
  1074. return ret;
  1075. }
  1076. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1077. }
  1078. #endif