13 years ago · 28d82dc1c4
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -197,6 +197,12 @@ struct eventpoll {
 
				 
			
 
				 	/* The user that created the eventpoll descriptor */
			
 
				 	struct user_struct *user;
			
 
				+
			
 
				+	struct file *file;
			
 
				+
			
 
				+	/* used to optimize loop detection check */
			
 
				+	int visited;
			
 
				+	struct list_head visited_list_link;
			
 
				 };
			
 
				 
			
 
				 /* Wait structure used by the poll hooks */
			
@@ -255,6 +261,15 @@ static struct kmem_cache *epi_cache __read_mostly;
 
				 /* Slab cache used to allocate "struct eppoll_entry" */
			
 
				 static struct kmem_cache *pwq_cache __read_mostly;
			
 
				 
			
 
				+/* Visited nodes during ep_loop_check(), so we can unset them when we finish */
			
 
				+static LIST_HEAD(visited_list);
			
 
				+
			
 
				+/*
			
 
				+ * List of files with newly added links, where we may need to limit the number
			
 
				+ * of emanating paths. Protected by the epmutex.
			
 
				+ */
			
 
				+static LIST_HEAD(tfile_check_list);
			
 
				+
			
 
				 #ifdef CONFIG_SYSCTL
			
 
				 
			
 
				 #include <linux/sysctl.h>
			
@@ -276,6 +291,12 @@ ctl_table epoll_table[] = {
 
				 };
			
 
				 #endif /* CONFIG_SYSCTL */
			
 
				 
			
 
				+static const struct file_operations eventpoll_fops;
			
 
				+
			
 
				+static inline int is_file_epoll(struct file *f)
			
 
				+{
			
 
				+	return f->f_op == &eventpoll_fops;
			
 
				+}
			
 
				 
			
 
				 /* Setup the structure that is used as key for the RB tree */
			
 
				 static inline void ep_set_ffd(struct epoll_filefd *ffd,
			
@@ -711,12 +732,6 @@ static const struct file_operations eventpoll_fops = {
 
				 	.llseek		= noop_llseek,
			
 
				 };
			
 
				 
			
 
				-/* Fast test to see if the file is an eventpoll file */
			
 
				-static inline int is_file_epoll(struct file *f)
			
 
				-{
			
 
				-	return f->f_op == &eventpoll_fops;
			
 
				-}
			
 
				-
			
 
				 /*
			
 
				  * This is called from eventpoll_release() to unlink files from the eventpoll
			
 
				  * interface. We need to have this facility to cleanup correctly files that are
			
@@ -926,6 +941,99 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
 
				 	rb_insert_color(&epi->rbn, &ep->rbr);
			
 
				 }
			
 
				 
			
 
				+
			
 
				+
			
 
				+#define PATH_ARR_SIZE 5
			
 
				+/*
			
 
				+ * These are the number paths of length 1 to 5, that we are allowing to emanate
			
 
				+ * from a single file of interest. For example, we allow 1000 paths of length
			
 
				+ * 1, to emanate from each file of interest. This essentially represents the
			
 
				+ * potential wakeup paths, which need to be limited in order to avoid massive
			
 
				+ * uncontrolled wakeup storms. The common use case should be a single ep which
			
 
				+ * is connected to n file sources. In this case each file source has 1 path
			
 
				+ * of length 1. Thus, the numbers below should be more than sufficient. These
			
 
				+ * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
			
 
				+ * and delete can't add additional paths. Protected by the epmutex.
			
 
				+ */
			
 
				+static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
			
 
				+static int path_count[PATH_ARR_SIZE];
			
 
				+
			
 
				+static int path_count_inc(int nests)
			
 
				+{
			
 
				+	if (++path_count[nests] > path_limits[nests])
			
 
				+		return -1;
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static void path_count_init(void)
			
 
				+{
			
 
				+	int i;
			
 
				+
			
 
				+	for (i = 0; i < PATH_ARR_SIZE; i++)
			
 
				+		path_count[i] = 0;
			
 
				+}
			
 
				+
			
 
				+static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
			
 
				+{
			
 
				+	int error = 0;
			
 
				+	struct file *file = priv;
			
 
				+	struct file *child_file;
			
 
				+	struct epitem *epi;
			
 
				+
			
 
				+	list_for_each_entry(epi, &file->f_ep_links, fllink) {
			
 
				+		child_file = epi->ep->file;
			
 
				+		if (is_file_epoll(child_file)) {
			
 
				+			if (list_empty(&child_file->f_ep_links)) {
			
 
				+				if (path_count_inc(call_nests)) {
			
 
				+					error = -1;
			
 
				+					break;
			
 
				+				}
			
 
				+			} else {
			
 
				+				error = ep_call_nested(&poll_loop_ncalls,
			
 
				+							EP_MAX_NESTS,
			
 
				+							reverse_path_check_proc,
			
 
				+							child_file, child_file,
			
 
				+							current);
			
 
				+			}
			
 
				+			if (error != 0)
			
 
				+				break;
			
 
				+		} else {
			
 
				+			printk(KERN_ERR "reverse_path_check_proc: "
			
 
				+				"file is not an ep!\n");
			
 
				+		}
			
 
				+	}
			
 
				+	return error;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * reverse_path_check - The tfile_check_list is list of file *, which have
			
 
				+ *                      links that are proposed to be newly added. We need to
			
 
				+ *                      make sure that those added links don't add too many
			
 
				+ *                      paths such that we will spend all our time waking up
			
 
				+ *                      eventpoll objects.
			
 
				+ *
			
 
				+ * Returns: Returns zero if the proposed links don't create too many paths,
			
 
				+ *	    -1 otherwise.
			
 
				+ */
			
 
				+static int reverse_path_check(void)
			
 
				+{
			
 
				+	int length = 0;
			
 
				+	int error = 0;
			
 
				+	struct file *current_file;
			
 
				+
			
 
				+	/* let's call this for all tfiles */
			
 
				+	list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
			
 
				+		length++;
			
 
				+		path_count_init();
			
 
				+		error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
			
 
				+					reverse_path_check_proc, current_file,
			
 
				+					current_file, current);
			
 
				+		if (error)
			
 
				+			break;
			
 
				+	}
			
 
				+	return error;
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * Must be called with "mtx" held.
			
 
				  */
			
@@ -987,6 +1095,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
				 	 */
			
 
				 	ep_rbtree_insert(ep, epi);
			
 
				 
			
 
				+	/* now check if we've created too many backpaths */
			
 
				+	error = -EINVAL;
			
 
				+	if (reverse_path_check())
			
 
				+		goto error_remove_epi;
			
 
				+
			
 
				 	/* We have to drop the new item inside our item list to keep track of it */
			
 
				 	spin_lock_irqsave(&ep->lock, flags);
			
 
				 
			
@@ -1011,6 +1124,14 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
				 
			
 
				 	return 0;
			
 
				 
			
 
				+error_remove_epi:
			
 
				+	spin_lock(&tfile->f_lock);
			
 
				+	if (ep_is_linked(&epi->fllink))
			
 
				+		list_del_init(&epi->fllink);
			
 
				+	spin_unlock(&tfile->f_lock);
			
 
				+
			
 
				+	rb_erase(&epi->rbn, &ep->rbr);
			
 
				+
			
 
				 error_unregister:
			
 
				 	ep_unregister_pollwait(ep, epi);
			
 
				 
			
@@ -1275,18 +1396,36 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
 
				 	int error = 0;
			
 
				 	struct file *file = priv;
			
 
				 	struct eventpoll *ep = file->private_data;
			
 
				+	struct eventpoll *ep_tovisit;
			
 
				 	struct rb_node *rbp;
			
 
				 	struct epitem *epi;
			
 
				 
			
 
				 	mutex_lock_nested(&ep->mtx, call_nests + 1);
			
 
				+	ep->visited = 1;
			
 
				+	list_add(&ep->visited_list_link, &visited_list);
			
 
				 	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
			
 
				 		epi = rb_entry(rbp, struct epitem, rbn);
			
 
				 		if (unlikely(is_file_epoll(epi->ffd.file))) {
			
 
				+			ep_tovisit = epi->ffd.file->private_data;
			
 
				+			if (ep_tovisit->visited)
			
 
				+				continue;
			
 
				 			error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
			
 
				-					       ep_loop_check_proc, epi->ffd.file,
			
 
				-					       epi->ffd.file->private_data, current);
			
 
				+					ep_loop_check_proc, epi->ffd.file,
			
 
				+					ep_tovisit, current);
			
 
				 			if (error != 0)
			
 
				 				break;
			
 
				+		} else {
			
 
				+			/*
			
 
				+			 * If we've reached a file that is not associated with
			
 
				+			 * an ep, then we need to check if the newly added
			
 
				+			 * links are going to add too many wakeup paths. We do
			
 
				+			 * this by adding it to the tfile_check_list, if it's
			
 
				+			 * not already there, and calling reverse_path_check()
			
 
				+			 * during ep_insert().
			
 
				+			 */
			
 
				+			if (list_empty(&epi->ffd.file->f_tfile_llink))
			
 
				+				list_add(&epi->ffd.file->f_tfile_llink,
			
 
				+					 &tfile_check_list);
			
 
				 		}
			
 
				 	}
			
 
				 	mutex_unlock(&ep->mtx);
			
@@ -1307,8 +1446,31 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
 
				  */
			
 
				 static int ep_loop_check(struct eventpoll *ep, struct file *file)
			
 
				 {
			
 
				-	return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
			
 
				+	int ret;
			
 
				+	struct eventpoll *ep_cur, *ep_next;
			
 
				+
			
 
				+	ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
			
 
				 			      ep_loop_check_proc, file, ep, current);
			
 
				+	/* clear visited list */
			
 
				+	list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
			
 
				+							visited_list_link) {
			
 
				+		ep_cur->visited = 0;
			
 
				+		list_del(&ep_cur->visited_list_link);
			
 
				+	}
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+static void clear_tfile_check_list(void)
			
 
				+{
			
 
				+	struct file *file;
			
 
				+
			
 
				+	/* first clear the tfile_check_list */
			
 
				+	while (!list_empty(&tfile_check_list)) {
			
 
				+		file = list_first_entry(&tfile_check_list, struct file,
			
 
				+					f_tfile_llink);
			
 
				+		list_del_init(&file->f_tfile_llink);
			
 
				+	}
			
 
				+	INIT_LIST_HEAD(&tfile_check_list);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -1316,8 +1478,9 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file)
 
				  */
			
 
				 SYSCALL_DEFINE1(epoll_create1, int, flags)
			
 
				 {
			
 
				-	int error;
			
 
				+	int error, fd;
			
 
				 	struct eventpoll *ep = NULL;
			
 
				+	struct file *file;
			
 
				 
			
 
				 	/* Check the EPOLL_* constant for consistency.  */
			
 
				 	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
			
@@ -1334,11 +1497,25 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
 
				 	 * Creates all the items needed to setup an eventpoll file. That is,
			
 
				 	 * a file structure and a free file descriptor.
			
 
				 	 */
			
 
				-	error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
			
 
				+	fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
			
 
				+	if (fd < 0) {
			
 
				+		error = fd;
			
 
				+		goto out_free_ep;
			
 
				+	}
			
 
				+	file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
			
 
				 				 O_RDWR | (flags & O_CLOEXEC));
			
 
				-	if (error < 0)
			
 
				-		ep_free(ep);
			
 
				-
			
 
				+	if (IS_ERR(file)) {
			
 
				+		error = PTR_ERR(file);
			
 
				+		goto out_free_fd;
			
 
				+	}
			
 
				+	fd_install(fd, file);
			
 
				+	ep->file = file;
			
 
				+	return fd;
			
 
				+
			
 
				+out_free_fd:
			
 
				+	put_unused_fd(fd);
			
 
				+out_free_ep:
			
 
				+	ep_free(ep);
			
 
				 	return error;
			
 
				 }
			
 
				 
			
@@ -1404,21 +1581,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 
				 	/*
			
 
				 	 * When we insert an epoll file descriptor, inside another epoll file
			
 
				 	 * descriptor, there is the change of creating closed loops, which are
			
 
				-	 * better be handled here, than in more critical paths.
			
 
				+	 * better be handled here, than in more critical paths. While we are
			
 
				+	 * checking for loops we also determine the list of files reachable
			
 
				+	 * and hang them on the tfile_check_list, so we can check that we
			
 
				+	 * haven't created too many possible wakeup paths.
			
 
				 	 *
			
 
				-	 * We hold epmutex across the loop check and the insert in this case, in
			
 
				-	 * order to prevent two separate inserts from racing and each doing the
			
 
				-	 * insert "at the same time" such that ep_loop_check passes on both
			
 
				-	 * before either one does the insert, thereby creating a cycle.
			
 
				+	 * We need to hold the epmutex across both ep_insert and ep_remove
			
 
				+	 * b/c we want to make sure we are looking at a coherent view of
			
 
				+	 * epoll network.
			
 
				 	 */
			
 
				-	if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
			
 
				+	if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
			
 
				 		mutex_lock(&epmutex);
			
 
				 		did_lock_epmutex = 1;
			
 
				-		error = -ELOOP;
			
 
				-		if (ep_loop_check(ep, tfile) != 0)
			
 
				-			goto error_tgt_fput;
			
 
				 	}
			
 
				-
			
 
				+	if (op == EPOLL_CTL_ADD) {
			
 
				+		if (is_file_epoll(tfile)) {
			
 
				+			error = -ELOOP;
			
 
				+			if (ep_loop_check(ep, tfile) != 0)
			
 
				+				goto error_tgt_fput;
			
 
				+		} else
			
 
				+			list_add(&tfile->f_tfile_llink, &tfile_check_list);
			
 
				+	}
			
 
				 
			
 
				 	mutex_lock_nested(&ep->mtx, 0);
			
 
				 
			
@@ -1437,6 +1620,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 
				 			error = ep_insert(ep, &epds, tfile, fd);
			
 
				 		} else
			
 
				 			error = -EEXIST;
			
 
				+		clear_tfile_check_list();
			
 
				 		break;
			
 
				 	case EPOLL_CTL_DEL:
			
 
				 		if (epi)
			
@@ -1455,7 +1639,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 
				 	mutex_unlock(&ep->mtx);
			
 
				 
			
 
				 error_tgt_fput:
			
 
				-	if (unlikely(did_lock_epmutex))
			
 
				+	if (did_lock_epmutex)
			
 
				 		mutex_unlock(&epmutex);
			
 
				 
			
 
				 	fput(tfile);
			
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -61,6 +61,7 @@ struct file;
 
				 static inline void eventpoll_init_file(struct file *file)
			
 
				 {
			
 
				 	INIT_LIST_HEAD(&file->f_ep_links);
			
 
				+	INIT_LIST_HEAD(&file->f_tfile_llink);
			
 
				 }
			
 
				 
			
 
				 
			
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1001,6 +1001,7 @@ struct file {
 
				 #ifdef CONFIG_EPOLL
			
 
				 	/* Used by fs/eventpoll.c to link all the hooks to this file */
			
 
				 	struct list_head	f_ep_links;
			
 
				+	struct list_head	f_tfile_llink;
			
 
				 #endif /* #ifdef CONFIG_EPOLL */
			
 
				 	struct address_space	*f_mapping;
			
 
				 #ifdef CONFIG_DEBUG_WRITECOUNT