eventpoll.c

/*
 *  fs/eventpoll.c (Efficient event polling implementation)
 *  Copyright (C) 2001,...,2006  Davide Libenzi
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  Davide Libenzi <davidel@xmailserver.org>
 *
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/smp_lock.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/rwsem.h>
#include <linux/rbtree.h>
#include <linux/wait.h>
#include <linux/eventpoll.h>
#include <linux/mount.h>
#include <linux/bitops.h>
#include <linux/mutex.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/mman.h>
#include <asm/atomic.h>
#include <asm/semaphore.h>

/*
 * LOCKING:
 * There are three levels of locking required by epoll:
 *
 * 1) epmutex (mutex)
 * 2) ep->sem (rw_semaphore)
 * 3) ep->lock (rw_lock)
 *
 * The acquire order is the one listed above, from 1 to 3.
 * We need a spinlock (ep->lock) because we manipulate objects
 * from inside the poll callback, which might be triggered from
 * a wake_up() that in turn might be called from IRQ context.
 * So we can't sleep inside the poll callback and hence we need
 * a spinlock. During the event transfer loop (from kernel to
 * user space) we could end up sleeping due to a copy_to_user(), so
 * we need a lock that will allow us to sleep. This lock is a
 * read-write semaphore (ep->sem). It is acquired for reading during
 * the event transfer loop and for writing during epoll_ctl(EPOLL_CTL_DEL)
 * and during eventpoll_release_file(). Then we also need a global
 * mutex to serialize eventpoll_release_file() and ep_free().
 * This mutex is acquired by ep_free() during the epoll file
 * cleanup path and it is also acquired by eventpoll_release_file()
 * if a file has been pushed inside an epoll set and is then
 * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
 * It would be possible to drop "ep->sem" and use the global
 * mutex "epmutex" (together with "ep->lock") to have it working,
 * but having "ep->sem" makes the interface more scalable.
 * Events that require holding "epmutex" are very rare, while for
 * normal operations the epoll private "ep->sem" guarantees
 * greater scalability.
 */
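
/*
 * Illustrative sketch (added for clarity, not taken verbatim from any
 * single path in this file): when more than one of the locks above must
 * be held, they are taken in the documented order, e.g. as
 * eventpoll_release_file() -> ep_remove() ends up doing:
 *
 *      mutex_lock(&epmutex);                   // level 1: global serialization
 *      down_write(&ep->sem);                   // level 2: per-epoll rw_semaphore
 *      write_lock_irqsave(&ep->lock, flags);   // level 3: per-epoll spinlock
 *      ...
 *      write_unlock_irqrestore(&ep->lock, flags);
 *      up_write(&ep->sem);
 *      mutex_unlock(&epmutex);
 *
 * Most paths take only a subset (ep_poll_callback() takes just
 * "ep->lock"), but they never take the locks in the reverse order.
 */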

#define EVENTPOLLFS_MAGIC 0x03111965 /* My birthday should work for this :) */

#define DEBUG_EPOLL 0

#if DEBUG_EPOLL > 0
#define DPRINTK(x) printk x
#define DNPRINTK(n, x) do { if ((n) <= DEBUG_EPOLL) printk x; } while (0)
#else /* #if DEBUG_EPOLL > 0 */
#define DPRINTK(x) (void) 0
#define DNPRINTK(n, x) (void) 0
#endif /* #if DEBUG_EPOLL > 0 */

#define DEBUG_EPI 0

#if DEBUG_EPI != 0
#define EPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */)
#else /* #if DEBUG_EPI != 0 */
#define EPI_SLAB_DEBUG 0
#endif /* #if DEBUG_EPI != 0 */

/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)

/* Maximum number of poll wake up nests we are allowing */
#define EP_MAX_POLLWAKE_NESTS 4

/* Maximum msec timeout value storable in a long int */
#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)

#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))

struct epoll_filefd {
        struct file *file;
        int fd;
};

/*
 * Node that is linked into the "wake_task_list" member of the "struct poll_safewake".
 * It is used to keep track of all tasks that are currently inside the wake_up() code
 * to 1) short-circuit the one coming from the same task and same wait queue head
 * (loop), 2) allow a maximum number of epoll descriptor inclusion nestings,
 * 3) let go the ones coming from other tasks.
 */
struct wake_task_node {
        struct list_head llink;
        struct task_struct *task;
        wait_queue_head_t *wq;
};

/*
 * This is used to implement the safe poll wake up, avoiding reentering
 * the poll callback from inside wake_up().
 */
struct poll_safewake {
        struct list_head wake_task_list;
        spinlock_t lock;
};

/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
        /* Protect access to this structure */
        rwlock_t lock;

        /*
         * This semaphore is used to ensure that files are not removed
         * while epoll is using them. This is read-held during the event
         * collection loop and it is write-held during the file cleanup
         * path, the epoll file exit code and the ctl operations.
         */
        struct rw_semaphore sem;

        /* Wait queue used by sys_epoll_wait() */
        wait_queue_head_t wq;

        /* Wait queue used by file->poll() */
        wait_queue_head_t poll_wait;

        /* List of ready file descriptors */
        struct list_head rdllist;

        /* RB tree root used to store monitored fd structs */
        struct rb_root rbr;
};

/* Wait structure used by the poll hooks */
struct eppoll_entry {
        /* List header used to link this structure to the "struct epitem" */
        struct list_head llink;

        /* The "base" pointer is set to the container "struct epitem" */
        void *base;

        /*
         * Wait queue item that will be linked to the target file wait
         * queue head.
         */
        wait_queue_t wait;

        /* The wait queue head that linked the "wait" wait queue item */
        wait_queue_head_t *whead;
};

/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 */
struct epitem {
        /* RB tree node used to link this structure to the eventpoll RB tree */
        struct rb_node rbn;

        /* List header used to link this structure to the eventpoll ready list */
        struct list_head rdllink;

        /* The file descriptor information this item refers to */
        struct epoll_filefd ffd;

        /* Number of active wait queues attached to poll operations */
        int nwait;

        /* List containing poll wait queues */
        struct list_head pwqlist;

        /* The "container" of this item */
        struct eventpoll *ep;

        /* The structure that describes the interested events and the source fd */
        struct epoll_event event;

        /*
         * Used to keep track of the usage count of the structure. This avoids
         * the structure disappearing from underneath our processing.
         */
        atomic_t usecnt;

        /* List header used to link this item to the "struct file" items list */
        struct list_head fllink;
};

/* Wrapper struct used by poll queueing */
struct ep_pqueue {
        poll_table pt;
        struct epitem *epi;
};
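
/*
 * A brief, illustrative summary of how the structures above relate
 * (added for clarity; see the definitions themselves for details):
 *
 *   - struct eventpoll holds every monitored fd as a "struct epitem"
 *     in the "rbr" RB tree, keyed by (file *, fd), and queues ready
 *     items on "rdllist" via epitem->rdllink.
 *   - each epitem keeps its poll hooks in "pwqlist": one
 *     "struct eppoll_entry" per wait queue head handed to us by the
 *     target file's poll(), with eppoll_entry->base pointing back to
 *     the epitem.
 *   - epitem->fllink links the item into the target file's
 *     "f_ep_links" list, so eventpoll_release_file() can find it when
 *     the file is closed.
 */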

static void ep_poll_safewake_init(struct poll_safewake *psw);
static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq);
static int ep_getfd(int *efd, struct inode **einode, struct file **efile,
                    struct eventpoll *ep);
static int ep_alloc(struct eventpoll **pep);
static void ep_free(struct eventpoll *ep);
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
static void ep_use_epitem(struct epitem *epi);
static void ep_release_epitem(struct epitem *epi);
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                                 poll_table *pt);
static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi);
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
                     struct file *tfile, int fd);
static int ep_modify(struct eventpoll *ep, struct epitem *epi,
                     struct epoll_event *event);
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi);
static int ep_unlink(struct eventpoll *ep, struct epitem *epi);
static int ep_remove(struct eventpoll *ep, struct epitem *epi);
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key);
static int ep_eventpoll_close(struct inode *inode, struct file *file);
static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait);
static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
                          struct epoll_event __user *events, int maxevents);
static int ep_events_transfer(struct eventpoll *ep,
                              struct epoll_event __user *events,
                              int maxevents);
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
                   int maxevents, long timeout);
static int eventpollfs_delete_dentry(struct dentry *dentry);
static struct inode *ep_eventpoll_inode(void);
static int eventpollfs_get_sb(struct file_system_type *fs_type,
                              int flags, const char *dev_name,
                              void *data, struct vfsmount *mnt);

/*
 * This mutex is used to serialize ep_free() and eventpoll_release_file().
 */
static struct mutex epmutex;

/* Safe wake up implementation */
static struct poll_safewake psw;

/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __read_mostly;

/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;

/* Virtual fs used to allocate inodes for eventpoll files */
static struct vfsmount *eventpoll_mnt __read_mostly;

/* File callbacks that implement the eventpoll file behaviour */
static const struct file_operations eventpoll_fops = {
        .release        = ep_eventpoll_close,
        .poll           = ep_eventpoll_poll
};

/*
 * This is used to register the virtual file system from where
 * eventpoll inodes are allocated.
 */
static struct file_system_type eventpoll_fs_type = {
        .name           = "eventpollfs",
        .get_sb         = eventpollfs_get_sb,
        .kill_sb        = kill_anon_super,
};

/* Very basic directory entry operations for the eventpoll virtual file system */
static struct dentry_operations eventpollfs_dentry_operations = {
        .d_delete       = eventpollfs_delete_dentry,
};

/* Fast test to see if the file is an eventpoll file */
static inline int is_file_epoll(struct file *f)
{
        return f->f_op == &eventpoll_fops;
}

/* Setup the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
                              struct file *file, int fd)
{
        ffd->file = file;
        ffd->fd = fd;
}

/* Compare RB tree keys */
static inline int ep_cmp_ffd(struct epoll_filefd *p1,
                             struct epoll_filefd *p2)
{
        return (p1->file > p2->file ? +1:
                (p1->file < p2->file ? -1 : p1->fd - p2->fd));
}

/* Special initialization for the RB tree node to detect linkage */
static inline void ep_rb_initnode(struct rb_node *n)
{
        rb_set_parent(n, n);
}

/* Removes a node from the RB tree and marks it for a fast is-linked check */
static inline void ep_rb_erase(struct rb_node *n, struct rb_root *r)
{
        rb_erase(n, r);
        rb_set_parent(n, n);
}

/* Fast check to verify that the item is linked to the main RB tree */
static inline int ep_rb_linked(struct rb_node *n)
{
        return rb_parent(n) != n;
}

/* Tells us if the item is currently linked */
static inline int ep_is_linked(struct list_head *p)
{
        return !list_empty(p);
}

/* Get the "struct epitem" from a wait queue pointer */
static inline struct epitem *ep_item_from_wait(wait_queue_t *p)
{
        return container_of(p, struct eppoll_entry, wait)->base;
}

/* Get the "struct epitem" from an epoll queue wrapper */
static inline struct epitem *ep_item_from_epqueue(poll_table *p)
{
        return container_of(p, struct ep_pqueue, pt)->epi;
}

/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
static inline int ep_op_has_event(int op)
{
        return op != EPOLL_CTL_DEL;
}

/* Initialize the poll safe wake up structure */
static void ep_poll_safewake_init(struct poll_safewake *psw)
{
        INIT_LIST_HEAD(&psw->wake_task_list);
        spin_lock_init(&psw->lock);
}

/*
 * Perform a safe wake up of the poll wait list. The problem is that
 * with the new callback'd wake up system, it is possible that the
 * poll callback is reentered from inside the call to wake_up() done
 * on the poll wait queue head. The rule is that we cannot reenter the
 * wake up code from the same task more than EP_MAX_POLLWAKE_NESTS times,
 * and we cannot reenter the same wait queue head at all. This allows
 * a hierarchy of epoll file descriptors of no more than
 * EP_MAX_POLLWAKE_NESTS levels deep. We need the irq version of the spin lock
 * because this one gets called by the poll callback, which in turn is called
 * from inside a wake_up(), which might be called from irq context.
 */
static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
{
        int wake_nests = 0;
        unsigned long flags;
        struct task_struct *this_task = current;
        struct list_head *lsthead = &psw->wake_task_list, *lnk;
        struct wake_task_node *tncur;
        struct wake_task_node tnode;

        spin_lock_irqsave(&psw->lock, flags);

        /* Try to see if the current task is already inside this wakeup call */
        list_for_each(lnk, lsthead) {
                tncur = list_entry(lnk, struct wake_task_node, llink);

                if (tncur->wq == wq ||
                    (tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) {
                        /*
                         * Oops ... loop detected or maximum nest level reached.
                         * We abort this wake by breaking the cycle itself.
                         */
                        spin_unlock_irqrestore(&psw->lock, flags);
                        return;
                }
        }

        /* Add the current task to the list */
        tnode.task = this_task;
        tnode.wq = wq;
        list_add(&tnode.llink, lsthead);

        spin_unlock_irqrestore(&psw->lock, flags);

        /* Do really wake up now */
        wake_up(wq);

        /* Remove the current task from the list */
        spin_lock_irqsave(&psw->lock, flags);
        list_del(&tnode.llink);
        spin_unlock_irqrestore(&psw->lock, flags);
}

/*
 * This is called from eventpoll_release() to unlink files from the eventpoll
 * interface. We need this facility to correctly clean up files that are
 * closed without being removed from the eventpoll interface.
 */
void eventpoll_release_file(struct file *file)
{
        struct list_head *lsthead = &file->f_ep_links;
        struct eventpoll *ep;
        struct epitem *epi;

        /*
         * We don't want to get "file->f_ep_lock" because it is not
         * necessary. It is not necessary because we're in the "struct file"
         * cleanup path, and this means that no one is using this file anymore.
         * The only hit might come from ep_free(), but holding the mutex
         * will correctly serialize the operation. We do need to acquire
         * "ep->sem" after "epmutex" because ep_remove() requires it when called
         * from anywhere but ep_free().
         */
        mutex_lock(&epmutex);

        while (!list_empty(lsthead)) {
                epi = list_entry(lsthead->next, struct epitem, fllink);

                ep = epi->ep;
                list_del_init(&epi->fllink);
                down_write(&ep->sem);
                ep_remove(ep, epi);
                up_write(&ep->sem);
        }

        mutex_unlock(&epmutex);
}

/*
 * Open an eventpoll file descriptor, suggesting storage for "size"
 * file descriptors. The size parameter is just a hint about how to size
 * the data structures. It won't prevent the user from storing more than
 * "size" file descriptors inside the epoll interface. It is the kernel
 * part of the userspace epoll_create(2).
 */
asmlinkage long sys_epoll_create(int size)
{
        int error, fd = -1;
        struct eventpoll *ep;
        struct inode *inode;
        struct file *file;

        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
                     current, size));

        /*
         * Sanity check on the size parameter, and create the internal data
         * structure ("struct eventpoll").
         */
        error = -EINVAL;
        if (size <= 0 || (error = ep_alloc(&ep)) != 0)
                goto eexit_1;

        /*
         * Creates all the items needed to setup an eventpoll file. That is,
         * a file structure, an inode and a free file descriptor.
         */
        error = ep_getfd(&fd, &inode, &file, ep);
        if (error)
                goto eexit_2;

        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
                     current, size, fd));

        return fd;

eexit_2:
        ep_free(ep);
        kfree(ep);
eexit_1:
        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
                     current, size, error));
        return error;
}

/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set. It represents
 * the kernel part of the user space epoll_ctl(2).
 */
asmlinkage long
sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
{
        int error;
        struct file *file, *tfile;
        struct eventpoll *ep;
        struct epitem *epi;
        struct epoll_event epds;

        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
                     current, epfd, op, fd, event));

        error = -EFAULT;
        if (ep_op_has_event(op) &&
            copy_from_user(&epds, event, sizeof(struct epoll_event)))
                goto eexit_1;

        /* Get the "struct file *" for the eventpoll file */
        error = -EBADF;
        file = fget(epfd);
        if (!file)
                goto eexit_1;

        /* Get the "struct file *" for the target file */
        tfile = fget(fd);
        if (!tfile)
                goto eexit_2;

        /* The target file descriptor must support poll */
        error = -EPERM;
        if (!tfile->f_op || !tfile->f_op->poll)
                goto eexit_3;

        /*
         * We have to check that the file structure underneath the file descriptor
         * the user passed to us _is_ an eventpoll file. And also we do not permit
         * adding an epoll file descriptor inside itself.
         */
        error = -EINVAL;
        if (file == tfile || !is_file_epoll(file))
                goto eexit_3;

        /*
         * At this point it is safe to assume that the "private_data" contains
         * our own data structure.
         */
        ep = file->private_data;

        down_write(&ep->sem);

        /* Try to lookup the file inside our RB tree */
        epi = ep_find(ep, tfile, fd);

        error = -EINVAL;
        switch (op) {
        case EPOLL_CTL_ADD:
                if (!epi) {
                        epds.events |= POLLERR | POLLHUP;
                        error = ep_insert(ep, &epds, tfile, fd);
                } else
                        error = -EEXIST;
                break;
        case EPOLL_CTL_DEL:
                if (epi)
                        error = ep_remove(ep, epi);
                else
                        error = -ENOENT;
                break;
        case EPOLL_CTL_MOD:
                if (epi) {
                        epds.events |= POLLERR | POLLHUP;
                        error = ep_modify(ep, epi, &epds);
                } else
                        error = -ENOENT;
                break;
        }

        /*
         * The function ep_find() increments the usage count of the structure
         * so, if this is not NULL, we need to release it.
         */
        if (epi)
                ep_release_epitem(epi);

        up_write(&ep->sem);

eexit_3:
        fput(tfile);
eexit_2:
        fput(file);
eexit_1:
        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
                     current, epfd, op, fd, event, error));

        return error;
}

/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_wait(2).
 */
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
                               int maxevents, int timeout)
{
        int error;
        struct file *file;
        struct eventpoll *ep;

        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
                     current, epfd, events, maxevents, timeout));

        /* The maximum number of events must be greater than zero */
        if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
                return -EINVAL;

        /* Verify that the area passed by the user is writeable */
        if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
                error = -EFAULT;
                goto eexit_1;
        }

        /* Get the "struct file *" for the eventpoll file */
        error = -EBADF;
        file = fget(epfd);
        if (!file)
                goto eexit_1;

        /*
         * We have to check that the file structure underneath the fd
         * the user passed to us _is_ an eventpoll file.
         */
        error = -EINVAL;
        if (!is_file_epoll(file))
                goto eexit_2;

        /*
         * At this point it is safe to assume that the "private_data" contains
         * our own data structure.
         */
        ep = file->private_data;

        /* Time to fish for events ... */
        error = ep_poll(ep, events, maxevents, timeout);

eexit_2:
        fput(file);
eexit_1:
        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
                     current, epfd, events, maxevents, timeout, error));

        return error;
}
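
/*
 * For reference, a minimal sketch of how the three syscalls implemented
 * above are used from user space (illustrative only, not part of this
 * file; error handling trimmed, "listen_fd" and "handle_fd" are
 * hypothetical):
 *
 *      int epfd = epoll_create(16);            // "16" is only a sizing hint
 *
 *      struct epoll_event ev;
 *      ev.events = EPOLLIN;                    // level-triggered read events
 *      ev.data.fd = listen_fd;
 *      epoll_ctl(epfd, EPOLL_CTL_ADD, listen_fd, &ev);
 *
 *      struct epoll_event ready[16];
 *      int n = epoll_wait(epfd, ready, 16, 1000);      // wait up to 1000 ms
 *      for (int i = 0; i < n; i++)
 *              handle_fd(ready[i].data.fd);
 */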

#ifdef TIF_RESTORE_SIGMASK

/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_pwait(2).
 */
asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
                int maxevents, int timeout, const sigset_t __user *sigmask,
                size_t sigsetsize)
{
        int error;
        sigset_t ksigmask, sigsaved;

        /*
         * If the caller wants a certain signal mask to be set during the wait,
         * we apply it here.
         */
        if (sigmask) {
                if (sigsetsize != sizeof(sigset_t))
                        return -EINVAL;
                if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
                        return -EFAULT;
                sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
                sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
        }

        error = sys_epoll_wait(epfd, events, maxevents, timeout);

        /*
         * If we changed the signal mask, we need to restore the original one.
         * In case we've got a signal while waiting, we do not restore the
         * signal mask yet, and we allow do_signal() to deliver the signal on
         * the way back to userspace, before the signal mask is restored.
         */
        if (sigmask) {
                if (error == -EINTR) {
                        memcpy(&current->saved_sigmask, &sigsaved,
                               sizeof(sigsaved));
                        set_thread_flag(TIF_RESTORE_SIGMASK);
                } else
                        sigprocmask(SIG_SETMASK, &sigsaved, NULL);
        }

        return error;
}

#endif /* #ifdef TIF_RESTORE_SIGMASK */

/*
 * Creates the file descriptor to be used by the epoll interface.
 */
static int ep_getfd(int *efd, struct inode **einode, struct file **efile,
                    struct eventpoll *ep)
{
        struct qstr this;
        char name[32];
        struct dentry *dentry;
        struct inode *inode;
        struct file *file;
        int error, fd;

        /* Get a ready to use file */
        error = -ENFILE;
        file = get_empty_filp();
        if (!file)
                goto eexit_1;

        /* Allocates an inode from the eventpoll file system */
        inode = ep_eventpoll_inode();
        if (IS_ERR(inode)) {
                error = PTR_ERR(inode);
                goto eexit_2;
        }

        /* Allocates a free descriptor to plug the file onto */
        error = get_unused_fd();
        if (error < 0)
                goto eexit_3;
        fd = error;

        /*
         * Link the inode to a directory entry by creating a unique name
         * using the inode number.
         */
        error = -ENOMEM;
        sprintf(name, "[%lu]", inode->i_ino);
        this.name = name;
        this.len = strlen(name);
        this.hash = inode->i_ino;
        dentry = d_alloc(eventpoll_mnt->mnt_sb->s_root, &this);
        if (!dentry)
                goto eexit_4;
        dentry->d_op = &eventpollfs_dentry_operations;
        d_add(dentry, inode);
        file->f_path.mnt = mntget(eventpoll_mnt);
        file->f_path.dentry = dentry;
        file->f_mapping = inode->i_mapping;

        file->f_pos = 0;
        file->f_flags = O_RDONLY;
        file->f_op = &eventpoll_fops;
        file->f_mode = FMODE_READ;
        file->f_version = 0;
        file->private_data = ep;

        /* Install the new setup file into the allocated fd. */
        fd_install(fd, file);

        *efd = fd;
        *einode = inode;
        *efile = file;
        return 0;

eexit_4:
        put_unused_fd(fd);
eexit_3:
        iput(inode);
eexit_2:
        put_filp(file);
eexit_1:
        return error;
}

static int ep_alloc(struct eventpoll **pep)
{
        struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL);

        if (!ep)
                return -ENOMEM;

        rwlock_init(&ep->lock);
        init_rwsem(&ep->sem);
        init_waitqueue_head(&ep->wq);
        init_waitqueue_head(&ep->poll_wait);
        INIT_LIST_HEAD(&ep->rdllist);
        ep->rbr = RB_ROOT;

        *pep = ep;

        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
                     current, ep));
        return 0;
}

static void ep_free(struct eventpoll *ep)
{
        struct rb_node *rbp;
        struct epitem *epi;

        /* We need to release all tasks waiting for this file */
        if (waitqueue_active(&ep->poll_wait))
                ep_poll_safewake(&psw, &ep->poll_wait);

        /*
         * We need to lock this because we could be hit by
         * eventpoll_release_file() while we're freeing the "struct eventpoll".
         * We do not need to hold "ep->sem" here because the epoll file
         * is on the way to be removed and no one has references to it
         * anymore. The only hit might come from eventpoll_release_file(), but
         * holding "epmutex" is sufficient here.
         */
        mutex_lock(&epmutex);

        /*
         * Walks through the whole tree, unregistering poll callbacks.
         */
        for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
                epi = rb_entry(rbp, struct epitem, rbn);

                ep_unregister_pollwait(ep, epi);
        }

        /*
         * Walks through the whole tree, freeing each "struct epitem". At this
         * point we are sure no poll callbacks will be lingering around, and also
         * by holding "epmutex" we can be sure that no file cleanup code will hit
         * us during this operation. So we can avoid the lock on "ep->lock".
         */
        while ((rbp = rb_first(&ep->rbr)) != 0) {
                epi = rb_entry(rbp, struct epitem, rbn);
                ep_remove(ep, epi);
        }

        mutex_unlock(&epmutex);
}

/*
 * Search the file inside the eventpoll tree. It adds a usage count to
 * the returned item, so the caller must call ep_release_epitem()
 * when finished with the "struct epitem".
 */
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
        int kcmp;
        unsigned long flags;
        struct rb_node *rbp;
        struct epitem *epi, *epir = NULL;
        struct epoll_filefd ffd;

        ep_set_ffd(&ffd, file, fd);
        read_lock_irqsave(&ep->lock, flags);
        for (rbp = ep->rbr.rb_node; rbp; ) {
                epi = rb_entry(rbp, struct epitem, rbn);
                kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
                if (kcmp > 0)
                        rbp = rbp->rb_right;
                else if (kcmp < 0)
                        rbp = rbp->rb_left;
                else {
                        ep_use_epitem(epi);
                        epir = epi;
                        break;
                }
        }
        read_unlock_irqrestore(&ep->lock, flags);

        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n",
                     current, file, epir));

        return epir;
}

/*
 * Increment the usage count of the "struct epitem" to ensure that the
 * user will have a valid pointer to reference.
 */
static void ep_use_epitem(struct epitem *epi)
{
        atomic_inc(&epi->usecnt);
}

/*
 * Decrement (release) the usage count by signaling that the user
 * has finished using the structure. It might lead to freeing the
 * structure itself if the count goes to zero.
 */
static void ep_release_epitem(struct epitem *epi)
{
        if (atomic_dec_and_test(&epi->usecnt))
                kmem_cache_free(epi_cache, epi);
}

/*
 * This is the callback that is used to add our wait queue to the
 * target file wakeup lists.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                                 poll_table *pt)
{
        struct epitem *epi = ep_item_from_epqueue(pt);
        struct eppoll_entry *pwq;

        if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
                init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
                pwq->whead = whead;
                pwq->base = epi;
                add_wait_queue(whead, &pwq->wait);
                list_add_tail(&pwq->llink, &epi->pwqlist);
                epi->nwait++;
        } else {
                /* We have to signal that an error occurred */
                epi->nwait = -1;
        }
}

static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
{
        int kcmp;
        struct rb_node **p = &ep->rbr.rb_node, *parent = NULL;
        struct epitem *epic;

        while (*p) {
                parent = *p;
                epic = rb_entry(parent, struct epitem, rbn);
                kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
                if (kcmp > 0)
                        p = &parent->rb_right;
                else
                        p = &parent->rb_left;
        }
        rb_link_node(&epi->rbn, parent, p);
        rb_insert_color(&epi->rbn, &ep->rbr);
}

static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
                     struct file *tfile, int fd)
{
        int error, revents, pwake = 0;
        unsigned long flags;
        struct epitem *epi;
        struct ep_pqueue epq;

        error = -ENOMEM;
        if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
                goto eexit_1;

        /* Item initialization follows here ... */
        ep_rb_initnode(&epi->rbn);
        INIT_LIST_HEAD(&epi->rdllink);
        INIT_LIST_HEAD(&epi->fllink);
        INIT_LIST_HEAD(&epi->pwqlist);
        epi->ep = ep;
        ep_set_ffd(&epi->ffd, tfile, fd);
        epi->event = *event;
        atomic_set(&epi->usecnt, 1);
        epi->nwait = 0;

        /* Initialize the poll table using the queue callback */
        epq.epi = epi;
        init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

        /*
         * Attach the item to the poll hooks and get current event bits.
         * We can safely use the file* here because its usage count has
         * been increased by the caller of this function.
         */
        revents = tfile->f_op->poll(tfile, &epq.pt);

        /*
         * We have to check if something went wrong during the poll wait queue
         * install process. Namely, an allocation for a wait queue failed due
         * to high memory pressure.
         */
        if (epi->nwait < 0)
                goto eexit_2;

        /* Add the current item to the list of active epoll hooks for this file */
        spin_lock(&tfile->f_ep_lock);
        list_add_tail(&epi->fllink, &tfile->f_ep_links);
        spin_unlock(&tfile->f_ep_lock);

        /* We have to drop the new item inside our item list to keep track of it */
        write_lock_irqsave(&ep->lock, flags);

        /* Add the current item to the RB tree */
        ep_rbtree_insert(ep, epi);

        /* If the file is already "ready" we drop it inside the ready list */
        if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
                list_add_tail(&epi->rdllink, &ep->rdllist);

                /* Notify waiting tasks that events are available */
                if (waitqueue_active(&ep->wq))
                        __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE);
                if (waitqueue_active(&ep->poll_wait))
                        pwake++;
        }

        write_unlock_irqrestore(&ep->lock, flags);

        /* We have to call this outside the lock */
        if (pwake)
                ep_poll_safewake(&psw, &ep->poll_wait);

        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
                     current, ep, tfile, fd));

        return 0;

eexit_2:
        ep_unregister_pollwait(ep, epi);

        /*
         * We need to do this because an event could have arrived on some
         * allocated wait queue.
         */
        write_lock_irqsave(&ep->lock, flags);
        if (ep_is_linked(&epi->rdllink))
                list_del_init(&epi->rdllink);
        write_unlock_irqrestore(&ep->lock, flags);

        kmem_cache_free(epi_cache, epi);
eexit_1:
        return error;
}

/*
 * Modify the interest event mask by dropping an event if the new mask
 * has a match in the current file status.
 */
static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
{
        int pwake = 0;
        unsigned int revents;
        unsigned long flags;

        /*
         * Set the new event interest mask before calling f_op->poll(), otherwise
         * a potential race might occur. In fact, if we did this operation inside
         * the lock, an event might happen between the f_op->poll() call and the
         * new event set registering.
         */
        epi->event.events = event->events;

        /*
         * Get current event bits. We can safely use the file* here because
         * its usage count has been increased by the caller of this function.
         */
        revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);

        write_lock_irqsave(&ep->lock, flags);

        /* Copy the data member from inside the lock */
        epi->event.data = event->data;

        /*
         * If the item is not linked to the RB tree it means that it's on its
         * way toward removal. Do nothing in this case.
         */
        if (ep_rb_linked(&epi->rbn)) {
                /*
                 * If the item is "hot" and it is not registered inside the ready
                 * list, push it inside. If the item is not "hot" and it is currently
                 * registered inside the ready list, unlink it.
                 */
                if (revents & event->events) {
                        if (!ep_is_linked(&epi->rdllink)) {
                                list_add_tail(&epi->rdllink, &ep->rdllist);

                                /* Notify waiting tasks that events are available */
                                if (waitqueue_active(&ep->wq))
                                        __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
                                                         TASK_INTERRUPTIBLE);
                                if (waitqueue_active(&ep->poll_wait))
                                        pwake++;
                        }
                }
        }

        write_unlock_irqrestore(&ep->lock, flags);

        /* We have to call this outside the lock */
        if (pwake)
                ep_poll_safewake(&psw, &ep->poll_wait);

        return 0;
}

/*
 * This function unregisters poll callbacks from the associated file descriptor.
 * Since this must be called without holding "ep->lock", the atomic exchange trick
 * protects us from multiple unregisters.
 */
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
{
        int nwait;
        struct list_head *lsthead = &epi->pwqlist;
        struct eppoll_entry *pwq;

        /* This is called without locks, so we need the atomic exchange */
        nwait = xchg(&epi->nwait, 0);

        if (nwait) {
                while (!list_empty(lsthead)) {
                        pwq = list_entry(lsthead->next, struct eppoll_entry, llink);

                        list_del_init(&pwq->llink);
                        remove_wait_queue(pwq->whead, &pwq->wait);
                        kmem_cache_free(pwq_cache, pwq);
                }
        }
}

/*
 * Unlink the "struct epitem" from all places it might have been hooked up.
 * This function must be called with the write IRQ lock on "ep->lock" held.
 */
static int ep_unlink(struct eventpoll *ep, struct epitem *epi)
{
        int error;

        /*
         * It can happen that this one is called for an item already unlinked.
         * The check protects us from doing a double unlink (crash).
         */
        error = -ENOENT;
        if (!ep_rb_linked(&epi->rbn))
                goto eexit_1;

        /*
         * Clear the event mask for the unlinked item. This will avoid item
         * notifications being sent after the unlink operation from inside
         * the kernel->userspace event transfer loop.
         */
        epi->event.events = 0;

        /*
         * At this point it is safe to do the job: unlink the item from our RB tree.
         * This operation, together with the above check, closes the door to
         * double unlinks.
         */
        ep_rb_erase(&epi->rbn, &ep->rbr);

        /*
         * If the item we are going to remove is inside the ready file descriptors
         * list, we want to remove it from that list to avoid stale events.
         */
        if (ep_is_linked(&epi->rdllink))
                list_del_init(&epi->rdllink);

        error = 0;
eexit_1:
        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_unlink(%p, %p) = %d\n",
                     current, ep, epi->ffd.file, error));

        return error;
}

/*
 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
 * all the associated resources.
 */
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
{
        int error;
        unsigned long flags;
        struct file *file = epi->ffd.file;

        /*
         * Removes poll wait queue hooks. We _have_ to do this without holding
         * the "ep->lock" otherwise a deadlock might occur. This is because of the
         * lock acquisition sequence: here we would take "ep->lock" and then the
         * wait queue head lock when unregistering the wait queue, whereas the
         * wakeup callback runs holding the wait queue head lock and then calls
         * our callback, which tries to take "ep->lock".
         */
        ep_unregister_pollwait(ep, epi);

        /* Remove the current item from the list of epoll hooks */
        spin_lock(&file->f_ep_lock);
        if (ep_is_linked(&epi->fllink))
                list_del_init(&epi->fllink);
        spin_unlock(&file->f_ep_lock);

        /* We need to acquire the write IRQ lock before calling ep_unlink() */
        write_lock_irqsave(&ep->lock, flags);

        /* Really unlink the item from the RB tree */
        error = ep_unlink(ep, epi);

        write_unlock_irqrestore(&ep->lock, flags);

        if (error)
                goto eexit_1;

        /* At this point it is safe to free the eventpoll item */
        ep_release_epitem(epi);

        error = 0;
eexit_1:
        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p) = %d\n",
                     current, ep, file, error));

        return error;
}

/*
 * This is the callback that is passed to the wait queue wakeup
 * mechanism. It is called by the stored file descriptors when they
 * have events to report.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
        int pwake = 0;
        unsigned long flags;
        struct epitem *epi = ep_item_from_wait(wait);
        struct eventpoll *ep = epi->ep;

        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
                     current, epi->ffd.file, epi, ep));

        write_lock_irqsave(&ep->lock, flags);

        /*
         * If the event mask does not contain any poll(2) event, we consider the
         * descriptor to be disabled. This condition is likely the effect of the
         * EPOLLONESHOT bit that disables the descriptor when an event is received,
         * until the next EPOLL_CTL_MOD is issued.
         */
        if (!(epi->event.events & ~EP_PRIVATE_BITS))
                goto is_disabled;

        /* If this file is already in the ready list we exit soon */
        if (ep_is_linked(&epi->rdllink))
                goto is_linked;

        list_add_tail(&epi->rdllink, &ep->rdllist);

is_linked:
        /*
         * Wake up (if active) both the eventpoll wait list and the ->poll()
         * wait list.
         */
        if (waitqueue_active(&ep->wq))
                __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
                                 TASK_INTERRUPTIBLE);
        if (waitqueue_active(&ep->poll_wait))
                pwake++;

is_disabled:
        write_unlock_irqrestore(&ep->lock, flags);

        /* We have to call this outside the lock */
        if (pwake)
                ep_poll_safewake(&psw, &ep->poll_wait);

        return 1;
}

static int ep_eventpoll_close(struct inode *inode, struct file *file)
{
        struct eventpoll *ep = file->private_data;

        if (ep) {
                ep_free(ep);
                kfree(ep);
        }

        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
        return 0;
}

static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
{
        unsigned int pollflags = 0;
        unsigned long flags;
        struct eventpoll *ep = file->private_data;

        /* Insert inside our poll wait queue */
        poll_wait(file, &ep->poll_wait, wait);

        /* Check our condition */
        read_lock_irqsave(&ep->lock, flags);
        if (!list_empty(&ep->rdllist))
                pollflags = POLLIN | POLLRDNORM;
        read_unlock_irqrestore(&ep->lock, flags);

        return pollflags;
}

/*
 * This function is called without holding the "ep->lock" since the call to
 * __copy_to_user() might sleep, and also f_op->poll() might re-enable IRQs
 * because of the way poll() is traditionally implemented in Linux.
 */
static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
                          struct epoll_event __user *events, int maxevents)
{
        int eventcnt, error = -EFAULT, pwake = 0;
        unsigned int revents;
        unsigned long flags;
        struct epitem *epi;
        struct list_head injlist;

        INIT_LIST_HEAD(&injlist);

        /*
         * We can loop without lock because this is a task private list.
         * We just splice'd out the ep->rdllist in ep_events_transfer().
         * Items cannot vanish during the loop because we are holding "sem" in
         * read.
         */
        for (eventcnt = 0; !list_empty(txlist) && eventcnt < maxevents;) {
                epi = list_entry(txlist->next, struct epitem, rdllink);
                prefetch(epi->rdllink.next);

                /*
                 * Get the ready file event set. We can safely use the file
                 * because we are holding the "sem" in read and this will
                 * guarantee that both the file and the item will not vanish.
                 */
                revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
                revents &= epi->event.events;

                /*
                 * If the event mask intersects the caller-requested one,
                 * deliver the event to userspace. Again, we are holding
                 * "sem" in read, so no operations coming from userspace
                 * can change the item.
                 */
                if (revents) {
                        if (__put_user(revents,
                                       &events[eventcnt].events) ||
                            __put_user(epi->event.data,
                                       &events[eventcnt].data))
                                goto errxit;
                        if (epi->event.events & EPOLLONESHOT)
                                epi->event.events &= EP_PRIVATE_BITS;
                        eventcnt++;
                }

                /*
                 * This is tricky. We are holding the "sem" in read, and this
                 * means that the operations that can change the "linked" status
                 * of the epoll item (epi->rbn and epi->rdllink), cannot touch
                 * them. Also, since we are "linked" from an epi->rdllink POV
                 * (the item is linked to the transmission list we just
                 * spliced), the ep_poll_callback() cannot touch us either,
                 * because of the check present in there. Another parallel
                 * epoll_wait() will not get the same result set, since we
                 * spliced the ready list before. Note that list_del() still
                 * shows the item as linked to the test in ep_poll_callback().
                 */
                list_del(&epi->rdllink);
                if (!(epi->event.events & EPOLLET) &&
                    (revents & epi->event.events))
                        list_add_tail(&epi->rdllink, &injlist);
                else {
                        /*
                         * Be sure the item is totally detached before re-initing
                         * the list_head. After INIT_LIST_HEAD() is committed,
                         * the ep_poll_callback() can requeue the item again,
                         * but we don't care since we are already past it.
                         */
                        smp_mb();
                        INIT_LIST_HEAD(&epi->rdllink);
                }
        }
        error = 0;

errxit:

        /*
         * If the re-injection list or the txlist are not empty, re-splice
         * them to the ready list and do proper wakeups.
         */
        if (!list_empty(&injlist) || !list_empty(txlist)) {
                write_lock_irqsave(&ep->lock, flags);

                list_splice(txlist, &ep->rdllist);
                list_splice(&injlist, &ep->rdllist);
                /*
                 * Wake up (if active) both the eventpoll wait list and the ->poll()
                 * wait list.
                 */
                if (waitqueue_active(&ep->wq))
                        __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
                                         TASK_INTERRUPTIBLE);
                if (waitqueue_active(&ep->poll_wait))
                        pwake++;

                write_unlock_irqrestore(&ep->lock, flags);
        }

        /* We have to call this outside the lock */
        if (pwake)
                ep_poll_safewake(&psw, &ep->poll_wait);

        return eventcnt == 0 ? error: eventcnt;
}

/*
 * Perform the transfer of events to user space.
 */
static int ep_events_transfer(struct eventpoll *ep,
                              struct epoll_event __user *events, int maxevents)
{
        int eventcnt;
        unsigned long flags;
        struct list_head txlist;

        INIT_LIST_HEAD(&txlist);

        /*
         * We need to lock this because we could be hit by
         * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
         */
        down_read(&ep->sem);

        /*
         * Steal the ready list, and re-init the original one to the
         * empty list.
         */
        write_lock_irqsave(&ep->lock, flags);
        list_splice(&ep->rdllist, &txlist);
        INIT_LIST_HEAD(&ep->rdllist);
        write_unlock_irqrestore(&ep->lock, flags);

        /* Build result set in userspace */
        eventcnt = ep_send_events(ep, &txlist, events, maxevents);

        up_read(&ep->sem);

        return eventcnt;
}

static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
                   int maxevents, long timeout)
{
        int res, eavail;
        unsigned long flags;
        long jtimeout;
        wait_queue_t wait;

        /*
         * Calculate the timeout by checking for the "infinite" value (-1)
         * and the overflow condition. The passed timeout is in milliseconds,
         * that is why (t * HZ) / 1000.
         */
        jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
                   MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;
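
        /*
         * For example (illustrative only): with HZ == 250, a 10 ms timeout
         * becomes (10 * 250 + 999) / 1000 = 3 jiffies, i.e. 2.5 jiffies
         * rounded up, so we never sleep for less than the requested time.
         */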

retry:
        write_lock_irqsave(&ep->lock, flags);

        res = 0;
        if (list_empty(&ep->rdllist)) {
                /*
                 * We don't have any available event to return to the caller.
                 * We need to sleep here, and we will be woken up by
                 * ep_poll_callback() when events become available.
                 */
                init_waitqueue_entry(&wait, current);
                __add_wait_queue(&ep->wq, &wait);

                for (;;) {
                        /*
                         * We don't want to sleep if the ep_poll_callback() sends us
                         * a wakeup in between. That's why we set the task state
                         * to TASK_INTERRUPTIBLE before doing the checks.
                         */
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (!list_empty(&ep->rdllist) || !jtimeout)
                                break;
                        if (signal_pending(current)) {
                                res = -EINTR;
                                break;
                        }

                        write_unlock_irqrestore(&ep->lock, flags);
                        jtimeout = schedule_timeout(jtimeout);
                        write_lock_irqsave(&ep->lock, flags);
                }
                __remove_wait_queue(&ep->wq, &wait);

                set_current_state(TASK_RUNNING);
        }

        /* Is it worth trying to dig for events? */
        eavail = !list_empty(&ep->rdllist);

        write_unlock_irqrestore(&ep->lock, flags);

        /*
         * Try to transfer events to user space. In case we get 0 events and
         * there's still timeout left over, we go and try again in search of
         * more luck.
         */
        if (!res && eavail &&
            !(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)
                goto retry;

        return res;
}

static int eventpollfs_delete_dentry(struct dentry *dentry)
{
        return 1;
}

static struct inode *ep_eventpoll_inode(void)
{
        int error = -ENOMEM;
        struct inode *inode = new_inode(eventpoll_mnt->mnt_sb);

        if (!inode)
                goto eexit_1;

        inode->i_fop = &eventpoll_fops;

        /*
         * Mark the inode dirty from the very beginning,
         * that way it will never be moved to the dirty
         * list because mark_inode_dirty() will think
         * that it already _is_ on the dirty list.
         */
        inode->i_state = I_DIRTY;
        inode->i_mode = S_IRUSR | S_IWUSR;
        inode->i_uid = current->fsuid;
        inode->i_gid = current->fsgid;
        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
        return inode;

eexit_1:
        return ERR_PTR(error);
}

static int
eventpollfs_get_sb(struct file_system_type *fs_type, int flags,
                   const char *dev_name, void *data, struct vfsmount *mnt)
{
        return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC,
                             mnt);
}

static int __init eventpoll_init(void)
{
        int error;

        mutex_init(&epmutex);

        /* Initialize the structure used to perform safe poll wait head wake ups */
        ep_poll_safewake_init(&psw);

        /* Allocates the slab cache used to allocate "struct epitem" items */
        epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
                        0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC,
                        NULL, NULL);

        /* Allocates the slab cache used to allocate "struct eppoll_entry" */
        pwq_cache = kmem_cache_create("eventpoll_pwq",
                        sizeof(struct eppoll_entry), 0,
                        EPI_SLAB_DEBUG|SLAB_PANIC, NULL, NULL);

        /*
         * Register the virtual file system that will be the source of inodes
         * for the eventpoll files
         */
        error = register_filesystem(&eventpoll_fs_type);
        if (error)
                goto epanic;

        /* Mount the above commented virtual file system */
        eventpoll_mnt = kern_mount(&eventpoll_fs_type);
        error = PTR_ERR(eventpoll_mnt);
        if (IS_ERR(eventpoll_mnt))
                goto epanic;

        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: successfully initialized.\n",
                     current));

        return 0;

epanic:
        panic("eventpoll_init() failed\n");
}

static void __exit eventpoll_exit(void)
{
        /* Undo all operations done inside eventpoll_init() */
        unregister_filesystem(&eventpoll_fs_type);
        mntput(eventpoll_mnt);
        kmem_cache_destroy(pwq_cache);
        kmem_cache_destroy(epi_cache);
}

module_init(eventpoll_init);
module_exit(eventpoll_exit);

MODULE_LICENSE("GPL");