/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>

/*
 * The max size to which a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size
 */
unsigned int pipe_max_size = 1048576;

/*
 * Minimum pipe size, as required by POSIX
 */
unsigned int pipe_min_size = PAGE_SIZE;

/*
 * We use a start+len construction, which provides full use of the
 * allocated memory.
 * -- Florian Coosmann (FGC)
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
	if (pipe->inode)
		mutex_lock_nested(&pipe->inode->i_mutex, subclass);
}

void pipe_lock(struct pipe_inode_info *pipe)
{
	/*
	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
	 */
	pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);
}
EXPORT_SYMBOL(pipe_unlock);

void pipe_double_lock(struct pipe_inode_info *pipe1,
		      struct pipe_inode_info *pipe2)
{
	BUG_ON(pipe1 == pipe2);

	if (pipe1 < pipe2) {
		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
	} else {
		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
	}
}

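/*
 * Note: taking the two locks in ascending pipe_inode_info address order
 * gives every caller the same global lock order, which is what prevents
 * an ABBA deadlock when e.g. tee() needs both pipes locked at once.
 */
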
/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe)
{
	DEFINE_WAIT(wait);

	/*
	 * Pipes are system-local resources, so sleeping on them
	 * is considered a noninteractive wait:
	 */
	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
	pipe_unlock(pipe);
	schedule();
	finish_wait(&pipe->wait, &wait);
	pipe_lock(pipe);
}

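/*
 * prepare_to_wait() queues the task and marks it TASK_INTERRUPTIBLE
 * *before* the pipe lock is dropped, so a wakeup that arrives between
 * pipe_unlock() and schedule() is not lost: schedule() simply returns
 * right away.
 */
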
static int
pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
			int atomic)
{
	unsigned long copy;

	while (len > 0) {
		while (!iov->iov_len)
			iov++;
		copy = min_t(unsigned long, len, iov->iov_len);

		if (atomic) {
			if (__copy_from_user_inatomic(to, iov->iov_base, copy))
				return -EFAULT;
		} else {
			if (copy_from_user(to, iov->iov_base, copy))
				return -EFAULT;
		}
		to += copy;
		len -= copy;
		iov->iov_base += copy;
		iov->iov_len -= copy;
	}
	return 0;
}

static int
pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len,
		      int atomic)
{
	unsigned long copy;

	while (len > 0) {
		while (!iov->iov_len)
			iov++;
		copy = min_t(unsigned long, len, iov->iov_len);

		if (atomic) {
			if (__copy_to_user_inatomic(iov->iov_base, from, copy))
				return -EFAULT;
		} else {
			if (copy_to_user(iov->iov_base, from, copy))
				return -EFAULT;
		}
		from += copy;
		len -= copy;
		iov->iov_base += copy;
		iov->iov_len -= copy;
	}
	return 0;
}

/*
 * Attempt to pre-fault in the user memory, so we can use atomic copies.
 * Returns the number of bytes not faulted in.
 */
static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
{
	while (!iov->iov_len)
		iov++;

	while (len > 0) {
		unsigned long this_len;

		this_len = min_t(unsigned long, len, iov->iov_len);
		if (fault_in_pages_writeable(iov->iov_base, this_len))
			break;

		len -= this_len;
		iov++;
	}

	return len;
}

/*
 * Pre-fault in the user memory, so we can use atomic copies.
 */
static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len)
{
	while (!iov->iov_len)
		iov++;

	while (len > 0) {
		unsigned long this_len;

		this_len = min_t(unsigned long, len, iov->iov_len);
		fault_in_pages_readable(iov->iov_base, this_len);
		len -= this_len;
		iov++;
	}
}

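/*
 * Together, the fault-in helpers above and the *_inatomic copies in
 * pipe_read()/pipe_write() implement a two-step strategy: touch the user
 * pages first so that a copy made under kmap_atomic() (which must not
 * sleep) is likely to succeed, and fall back to a plain kmap() plus a
 * faulting copy (the "redo" labels below) when it does not.
 */
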
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * If nobody else uses this page, and we don't already have a
	 * temporary page, let's keep track of it as a one-deep
	 * allocation cache. (Otherwise just release our reference to it)
	 */
	if (page_count(page) == 1 && !pipe->tmp_page)
		pipe->tmp_page = page;
	else
		page_cache_release(page);
}

/**
 * generic_pipe_buf_map - virtually map a pipe buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer that should be mapped
 * @atomic:	whether to use an atomic map
 *
 * Description:
 *	This function returns a kernel virtual address mapping for the
 *	pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided
 *	and the caller has to be careful not to fault before calling
 *	the unmap function.
 *
 *	Note that this function occupies KM_USER0 if @atomic != 0.
 */
void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf, int atomic)
{
	if (atomic) {
		buf->flags |= PIPE_BUF_FLAG_ATOMIC;
		return kmap_atomic(buf->page, KM_USER0);
	}

	return kmap(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_map);

/**
 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer that should be unmapped
 * @map_data:	the data that the mapping function returned
 *
 * Description:
 *	This function undoes the mapping that ->map() provided.
 */
void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, void *map_data)
{
	if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
		buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
		kunmap_atomic(map_data, KM_USER0);
	} else
		kunmap(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_unmap);

/**
 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to attempt to steal
 *
 * Description:
 *	This function attempts to steal the &struct page attached to
 *	@buf. If successful, this function returns 0 and returns with
 *	the page locked. The caller may then reuse the page for whatever
 *	he wishes; the typical use is insertion into a different file
 *	page cache.
 */
int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * A reference count of one is golden: it means that the owner of
	 * this page is the only one holding a reference to it. Lock the
	 * page and return OK.
	 */
	if (page_count(page) == 1) {
		lock_page(page);
		return 0;
	}

	return 1;
}
EXPORT_SYMBOL(generic_pipe_buf_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to get a reference to
 *
 * Description:
 *	This function grabs an extra reference to @buf. It's used in
 *	the tee() system call, when we duplicate the buffers in one
 *	pipe into another.
 */
void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	page_cache_get(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 * @info:	the pipe that the buffer belongs to
 * @buf:	the buffer to confirm
 *
 * Description:
 *	This function does nothing, because the generic pipe code uses
 *	pages that are always good when inserted into the pipe.
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
			     struct pipe_buffer *buf)
{
	return 0;
}
EXPORT_SYMBOL(generic_pipe_buf_confirm);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.can_merge = 1,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = generic_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

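/*
 * The buffer ring below is indexed with "& (pipe->buffers - 1)", which
 * assumes pipe->buffers is a power of two; PIPE_DEF_BUFFERS is one, and
 * round_pipe_size() preserves the invariant across every resize.
 */
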
static ssize_t
pipe_read(struct kiocb *iocb, const struct iovec *_iov,
	  unsigned long nr_segs, loff_t pos)
{
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	int do_wakeup;
	ssize_t ret;
	struct iovec *iov = (struct iovec *)_iov;
	size_t total_len;

	total_len = iov_length(iov, nr_segs);
	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;
	for (;;) {
		int bufs = pipe->nrbufs;
		if (bufs) {
			int curbuf = pipe->curbuf;
			struct pipe_buffer *buf = pipe->bufs + curbuf;
			const struct pipe_buf_operations *ops = buf->ops;
			void *addr;
			size_t chars = buf->len;
			int error, atomic;

			if (chars > total_len)
				chars = total_len;

			error = ops->confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			atomic = !iov_fault_in_pages_write(iov, chars);
redo:
			addr = ops->map(pipe, buf, atomic);
			error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
			ops->unmap(pipe, buf, addr);
			if (unlikely(error)) {
				/*
				 * Just retry with the slow path if we failed.
				 */
				if (atomic) {
					atomic = 0;
					goto redo;
				}
				if (!ret)
					ret = error;
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;
			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				curbuf = (curbuf + 1) & (pipe->buffers - 1);
				pipe->curbuf = curbuf;
				pipe->nrbufs = --bufs;
				do_wakeup = 1;
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
		}
		if (bufs)	/* More to do? */
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			/* syscall merging: Usually we must not sleep
			 * if O_NONBLOCK is set, or if we got some data.
			 * But if a writer sleeps in kernel space, then
			 * we can wait for that data without violating POSIX.
			 */
			if (ret)
				break;
			if (filp->f_flags & O_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
		}
		pipe_wait(pipe);
	}
	mutex_unlock(&inode->i_mutex);

	/* Signal writers asynchronously that there is more room. */
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	if (ret > 0)
		file_accessed(filp);
	return ret;
}

static ssize_t
pipe_write(struct kiocb *iocb, const struct iovec *_iov,
	   unsigned long nr_segs, loff_t ppos)
{
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	ssize_t ret;
	int do_wakeup;
	struct iovec *iov = (struct iovec *)_iov;
	size_t total_len;
	ssize_t chars;

	total_len = iov_length(iov, nr_segs);
	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/* We try to merge small writes */
	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
	if (pipe->nrbufs && chars != 0) {
		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
							(pipe->buffers - 1);
		struct pipe_buffer *buf = pipe->bufs + lastbuf;
		const struct pipe_buf_operations *ops = buf->ops;
		int offset = buf->offset + buf->len;

		if (ops->can_merge && offset + chars <= PAGE_SIZE) {
			int error, atomic = 1;
			void *addr;

			error = ops->confirm(pipe, buf);
			if (error)
				goto out;

			iov_fault_in_pages_read(iov, chars);
redo1:
			addr = ops->map(pipe, buf, atomic);
			error = pipe_iov_copy_from_user(offset + addr, iov,
							chars, atomic);
			ops->unmap(pipe, buf, addr);
			ret = error;
			do_wakeup = 1;
			if (error) {
				if (atomic) {
					atomic = 0;
					goto redo1;
				}
				goto out;
			}
			buf->len += chars;
			total_len -= chars;
			ret = chars;
			if (!total_len)
				goto out;
		}
	}

	for (;;) {
		int bufs;

		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}
		bufs = pipe->nrbufs;
		if (bufs < pipe->buffers) {
			int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;
			struct page *page = pipe->tmp_page;
			char *src;
			int error, atomic = 1;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}
			/* Always wake up, even if the copy fails. Otherwise
			 * we lock up (O_NONBLOCK-)readers that sleep due to
			 * syscall merging.
			 * FIXME! Is this really true?
			 */
			do_wakeup = 1;
			chars = PAGE_SIZE;
			if (chars > total_len)
				chars = total_len;

			iov_fault_in_pages_read(iov, chars);
redo2:
			if (atomic)
				src = kmap_atomic(page, KM_USER0);
			else
				src = kmap(page);

			error = pipe_iov_copy_from_user(src, iov, chars,
							atomic);
			if (atomic)
				kunmap_atomic(src, KM_USER0);
			else
				kunmap(page);

			if (unlikely(error)) {
				if (atomic) {
					atomic = 0;
					goto redo2;
				}
				if (!ret)
					ret = error;
				break;
			}
			ret += chars;

			/* Insert it into the buffer array */
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = chars;
			pipe->nrbufs = ++bufs;
			pipe->tmp_page = NULL;

			total_len -= chars;
			if (!total_len)
				break;
		}
		if (bufs < pipe->buffers)
			continue;
		if (filp->f_flags & O_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}
out:
	mutex_unlock(&inode->i_mutex);
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}
	if (ret > 0)
		file_update_time(filp);
	return ret;
}

static ssize_t
bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
	return -EBADF;
}

static ssize_t
bad_pipe_w(struct file *filp, const char __user *buf, size_t count,
	   loff_t *ppos)
{
	return -EBADF;
}

static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	int count, buf, nrbufs;

	switch (cmd) {
		case FIONREAD:
			mutex_lock(&inode->i_mutex);
			pipe = inode->i_pipe;
			count = 0;
			buf = pipe->curbuf;
			nrbufs = pipe->nrbufs;
			while (--nrbufs >= 0) {
				count += pipe->bufs[buf].len;
				buf = (buf+1) & (pipe->buffers - 1);
			}
			mutex_unlock(&inode->i_mutex);

			return put_user(count, (int __user *)arg);
		default:
			return -EINVAL;
	}
}

/* No kernel lock held - fine */
static unsigned int
pipe_poll(struct file *filp, poll_table *wait)
{
	unsigned int mask;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe = inode->i_pipe;
	int nrbufs;

	poll_wait(filp, &pipe->wait, wait);

	/* Reading only -- no need for acquiring the semaphore. */
	nrbufs = pipe->nrbufs;
	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= POLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
		/*
		 * Most Unices do not set POLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= POLLERR;
	}

	return mask;
}

static int
pipe_release(struct inode *inode, int decr, int decw)
{
	struct pipe_inode_info *pipe;

	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;
	pipe->readers -= decr;
	pipe->writers -= decw;

	if (!pipe->readers && !pipe->writers) {
		free_pipe_info(inode);
	} else {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	mutex_unlock(&inode->i_mutex);

	return 0;
}

static int
pipe_read_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
	mutex_unlock(&inode->i_mutex);

	return retval;
}

static int
pipe_write_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
	mutex_unlock(&inode->i_mutex);

	return retval;
}

static int
pipe_rdwr_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe = inode->i_pipe;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if (retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0) /* this can happen only if on == T */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	mutex_unlock(&inode->i_mutex);
	return retval;
}

static int
pipe_read_release(struct inode *inode, struct file *filp)
{
	return pipe_release(inode, 1, 0);
}

static int
pipe_write_release(struct inode *inode, struct file *filp)
{
	return pipe_release(inode, 0, 1);
}

static int
pipe_rdwr_release(struct inode *inode, struct file *filp)
{
	int decr, decw;

	decr = (filp->f_mode & FMODE_READ) != 0;
	decw = (filp->f_mode & FMODE_WRITE) != 0;
	return pipe_release(inode, decr, decw);
}

static int
pipe_read_open(struct inode *inode, struct file *filp)
{
	int ret = -ENOENT;

	mutex_lock(&inode->i_mutex);

	if (inode->i_pipe) {
		ret = 0;
		inode->i_pipe->readers++;
	}

	mutex_unlock(&inode->i_mutex);

	return ret;
}

static int
pipe_write_open(struct inode *inode, struct file *filp)
{
	int ret = -ENOENT;

	mutex_lock(&inode->i_mutex);

	if (inode->i_pipe) {
		ret = 0;
		inode->i_pipe->writers++;
	}

	mutex_unlock(&inode->i_mutex);

	return ret;
}

static int
pipe_rdwr_open(struct inode *inode, struct file *filp)
{
	int ret = -ENOENT;

	mutex_lock(&inode->i_mutex);

	if (inode->i_pipe) {
		ret = 0;
		if (filp->f_mode & FMODE_READ)
			inode->i_pipe->readers++;
		if (filp->f_mode & FMODE_WRITE)
			inode->i_pipe->writers++;
	}

	mutex_unlock(&inode->i_mutex);

	return ret;
}

/*
 * The file_operations structs are not static because they
 * are also used in linux/fs/fifo.c to do operations on FIFOs.
 *
 * Pipes reuse fifos' file_operations structs.
 */
const struct file_operations read_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= do_sync_read,
	.aio_read	= pipe_read,
	.write		= bad_pipe_w,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_read_open,
	.release	= pipe_read_release,
	.fasync		= pipe_read_fasync,
};

const struct file_operations write_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= bad_pipe_r,
	.write		= do_sync_write,
	.aio_write	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_write_open,
	.release	= pipe_write_release,
	.fasync		= pipe_write_fasync,
};

const struct file_operations rdwr_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= do_sync_read,
	.aio_read	= pipe_read,
	.write		= do_sync_write,
	.aio_write	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_rdwr_open,
	.release	= pipe_rdwr_release,
	.fasync		= pipe_rdwr_fasync,
};

struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
{
	struct pipe_inode_info *pipe;

	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
	if (pipe) {
		pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
		if (pipe->bufs) {
			init_waitqueue_head(&pipe->wait);
			pipe->r_counter = pipe->w_counter = 1;
			pipe->inode = inode;
			pipe->buffers = PIPE_DEF_BUFFERS;
			return pipe;
		}
		kfree(pipe);
	}

	return NULL;
}

void __free_pipe_info(struct pipe_inode_info *pipe)
{
	int i;

	for (i = 0; i < pipe->buffers; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;
		if (buf->ops)
			buf->ops->release(pipe, buf);
	}
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	kfree(pipe->bufs);
	kfree(pipe);
}

void free_pipe_info(struct inode *inode)
{
	__free_pipe_info(inode->i_pipe);
	inode->i_pipe = NULL;
}

static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
				dentry->d_inode->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};

static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info(inode);
	if (!pipe)
		goto fail_iput;
	inode->i_pipe = pipe;

	pipe->readers = pipe->writers = 1;
	inode->i_fop = &rdwr_pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}

struct file *create_write_pipe(int flags)
{
	int err;
	struct inode *inode;
	struct file *f;
	struct path path;
	struct qstr name = { .name = "" };

	err = -ENFILE;
	inode = get_pipe_inode();
	if (!inode)
		goto err;

	err = -ENOMEM;
	path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
	if (!path.dentry)
		goto err_inode;
	path.mnt = mntget(pipe_mnt);

	d_instantiate(path.dentry, inode);

	err = -ENFILE;
	f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops);
	if (!f)
		goto err_dentry;
	f->f_mapping = inode->i_mapping;

	f->f_flags = O_WRONLY | (flags & O_NONBLOCK);
	f->f_version = 0;

	return f;

err_dentry:
	free_pipe_info(inode);
	path_put(&path);
	return ERR_PTR(err);

err_inode:
	free_pipe_info(inode);
	iput(inode);
err:
	return ERR_PTR(err);
}

void free_write_pipe(struct file *f)
{
	free_pipe_info(f->f_dentry->d_inode);
	path_put(&f->f_path);
	put_filp(f);
}

struct file *create_read_pipe(struct file *wrf, int flags)
{
	/* Grab pipe from the writer */
	struct file *f = alloc_file(&wrf->f_path, FMODE_READ,
				    &read_pipefifo_fops);
	if (!f)
		return ERR_PTR(-ENFILE);

	path_get(&wrf->f_path);
	f->f_flags = O_RDONLY | (flags & O_NONBLOCK);

	return f;
}

int do_pipe_flags(int *fd, int flags)
{
	struct file *fw, *fr;
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK))
		return -EINVAL;

	fw = create_write_pipe(flags);
	if (IS_ERR(fw))
		return PTR_ERR(fw);
	fr = create_read_pipe(fw, flags);
	error = PTR_ERR(fr);
	if (IS_ERR(fr))
		goto err_write_pipe;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	fd_install(fdr, fr);
	fd_install(fdw, fw);
	fd[0] = fdr;
	fd[1] = fdw;

	return 0;

err_fdr:
	put_unused_fd(fdr);
err_read_pipe:
	path_put(&fr->f_path);
	put_filp(fr);
err_write_pipe:
	free_write_pipe(fw);
	return error;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	int fd[2];
	int error;

	error = do_pipe_flags(fd, flags);
	if (!error) {
		if (copy_to_user(fildes, fd, sizeof(fd))) {
			sys_close(fd[0]);
			sys_close(fd[1]);
			error = -EFAULT;
		}
	}
	return error;
}

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return sys_pipe2(fildes, 0);
}

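/*
 * Userspace view (illustrative sketch): fd[0] is the read end and fd[1]
 * the write end, and pipe2() lets the caller apply O_CLOEXEC and/or
 * O_NONBLOCK atomically at creation time:
 *
 *	int fd[2];
 *
 *	if (pipe2(fd, O_CLOEXEC | O_NONBLOCK) < 0)
 *		perror("pipe2");
 */
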
/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or -ERROR on failure.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
{
	struct pipe_buffer *bufs;

	/*
	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
	 * expect a lot of shrink+grow operations, just free and allocate
	 * again like we would do for growing. If the pipe currently
	 * contains more buffers than arg, then return busy.
	 */
	if (nr_pages < pipe->nrbufs)
		return -EBUSY;

	bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL);
	if (unlikely(!bufs))
		return -ENOMEM;

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indexes.
	 */
	if (pipe->nrbufs) {
		unsigned int tail;
		unsigned int head;

		tail = pipe->curbuf + pipe->nrbufs;
		if (tail < pipe->buffers)
			tail = 0;
		else
			tail &= (pipe->buffers - 1);

		head = pipe->nrbufs - tail;
		if (head)
			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
		if (tail)
			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
	}

	pipe->curbuf = 0;
	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->buffers = nr_pages;
	return nr_pages * PAGE_SIZE;
}

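/*
 * Worked example: with pipe->buffers == 16, curbuf == 14 and nrbufs == 4
 * the occupied slots are 14, 15, 0, 1. Then tail ends up as
 * (14 + 4) & 15 == 2 and head as 4 - 2 == 2, so slots 14..15 are copied
 * first and slots 0..1 after them, leaving the new array linear from
 * index 0.
 */
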
/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages.
 */
static inline unsigned int round_pipe_size(unsigned int size)
{
	unsigned long nr_pages;

	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
}

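/*
 * E.g. (assuming 4 KiB pages) a request of 5000 bytes needs two pages;
 * two is already a power of two, so round_pipe_size() returns 8192.
 */
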
/*
 * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
 * will return an error.
 */
int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
		 size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
	if (ret < 0 || !write)
		return ret;

	pipe_max_size = round_pipe_size(pipe_max_size);
	return ret;
}

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file)
{
	struct inode *i = file->f_path.dentry->d_inode;

	return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL;
}

long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = get_pipe_info(file);
	if (!pipe)
		return -EBADF;

	mutex_lock(&pipe->inode->i_mutex);

	switch (cmd) {
	case F_SETPIPE_SZ: {
		unsigned int size, nr_pages;

		size = round_pipe_size(arg);
		nr_pages = size >> PAGE_SHIFT;

		ret = -EINVAL;
		if (!nr_pages)
			goto out;

		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
			ret = -EPERM;
			goto out;
		}
		ret = pipe_set_size(pipe, nr_pages);
		break;
		}
	case F_GETPIPE_SZ:
		ret = pipe->buffers * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
		break;
	}

out:
	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

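/*
 * Userspace view (illustrative sketch): a process without
 * CAP_SYS_RESOURCE can grow a pipe up to pipe_max_size bytes, and the
 * call returns the size actually reserved after rounding:
 *
 *	long sz = fcntl(pfd[1], F_SETPIPE_SZ, 1048576);
 *
 * where pfd[] holds the pipe fds and sz comes back as
 * nr_pages * PAGE_SIZE on success.
 */
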
static const struct super_operations pipefs_ops = {
	.destroy_inode = free_inode_nonrcu,
};

/*
 * pipefs should _never_ be mounted by userland - too much of a security
 * hassle, no real gain from having the whole whorehouse mounted. So we
 * don't need any operations on the root directory. However, we need a
 * non-trivial d_name - pipe: will go nicely and kill the special-casing
 * in procfs.
 */
static struct dentry *pipefs_mount(struct file_system_type *fs_type,
			 int flags, const char *dev_name, void *data)
{
	return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
			&pipefs_dentry_operations, PIPEFS_MAGIC);
}

static struct file_system_type pipe_fs_type = {
	.name		= "pipefs",
	.mount		= pipefs_mount,
	.kill_sb	= kill_anon_super,
};

static int __init init_pipe_fs(void)
{
	int err = register_filesystem(&pipe_fs_type);

	if (!err) {
		pipe_mnt = kern_mount(&pipe_fs_type);
		if (IS_ERR(pipe_mnt)) {
			err = PTR_ERR(pipe_mnt);
			unregister_filesystem(&pipe_fs_type);
		}
	}
	return err;
}

static void __exit exit_pipe_fs(void)
{
	kern_unmount(pipe_mnt);
	unregister_filesystem(&pipe_fs_type);
}

fs_initcall(init_pipe_fs);
module_exit(exit_pipe_fs);