aboutsummaryrefslogtreecommitdiff
path: root/fs/eventpoll.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/eventpoll.c')
-rw-r--r--fs/eventpoll.c354
1 files changed, 256 insertions, 98 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 9fec1836057..b10b48c2a7a 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -40,6 +40,8 @@
#include <linux/atomic.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/compat.h>
+#include <linux/rculist.h>
/*
* LOCKING:
@@ -104,7 +106,7 @@
struct epoll_filefd {
struct file *file;
int fd;
-};
+} __packed;
/*
* Structure used to track possible nested calls, for too deep recursions
@@ -128,10 +130,16 @@ struct nested_calls {
/*
* Each file descriptor added to the eventpoll interface will
* have an entry of this type linked to the "rbr" RB tree.
+ * Avoid increasing the size of this struct, there can be many thousands
+ * of these on a server and we do not want this to take another cache line.
*/
struct epitem {
- /* RB tree node used to link this structure to the eventpoll RB tree */
- struct rb_node rbn;
+ union {
+ /* RB tree node links this structure to the eventpoll RB tree */
+ struct rb_node rbn;
+ /* Used to free the struct epitem */
+ struct rcu_head rcu;
+ };
/* List header used to link this structure to the eventpoll ready list */
struct list_head rdllink;
@@ -158,7 +166,7 @@ struct epitem {
struct list_head fllink;
/* wakeup_source used when EPOLLWAKEUP is set */
- struct wakeup_source *ws;
+ struct wakeup_source __rcu *ws;
/* The structure that describe the interested events and the source fd */
struct epoll_event event;
@@ -285,7 +293,7 @@ static LIST_HEAD(tfile_check_list);
static long zero;
static long long_max = LONG_MAX;
-ctl_table epoll_table[] = {
+struct ctl_table epoll_table[] = {
{
.procname = "max_user_watches",
.data = &max_user_watches,
@@ -536,6 +544,38 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
}
}
+/* call only when ep->mtx is held */
+static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
+{
+ return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
+}
+
+/* call only when ep->mtx is held */
+static inline void ep_pm_stay_awake(struct epitem *epi)
+{
+ struct wakeup_source *ws = ep_wakeup_source(epi);
+
+ if (ws)
+ __pm_stay_awake(ws);
+}
+
+static inline bool ep_has_wakeup_source(struct epitem *epi)
+{
+ return rcu_access_pointer(epi->ws) ? true : false;
+}
+
+/* call when ep->mtx cannot be held (ep_poll_callback) */
+static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
+{
+ struct wakeup_source *ws;
+
+ rcu_read_lock();
+ ws = rcu_dereference(epi->ws);
+ if (ws)
+ __pm_stay_awake(ws);
+ rcu_read_unlock();
+}
+
/**
* ep_scan_ready_list - Scans the ready list in a way that makes possible for
* the scan code, to call f_op->poll(). Also allows for
@@ -545,14 +585,14 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
* @sproc: Pointer to the scan callback.
* @priv: Private opaque data passed to the @sproc callback.
* @depth: The current depth of recursive f_op->poll calls.
+ * @ep_locked: caller already holds ep->mtx
*
* Returns: The same integer error code returned by the @sproc callback.
*/
static int ep_scan_ready_list(struct eventpoll *ep,
int (*sproc)(struct eventpoll *,
struct list_head *, void *),
- void *priv,
- int depth)
+ void *priv, int depth, bool ep_locked)
{
int error, pwake = 0;
unsigned long flags;
@@ -563,7 +603,9 @@ static int ep_scan_ready_list(struct eventpoll *ep,
* We need to lock this because we could be hit by
* eventpoll_release_file() and epoll_ctl().
*/
- mutex_lock_nested(&ep->mtx, depth);
+
+ if (!ep_locked)
+ mutex_lock_nested(&ep->mtx, depth);
/*
* Steal the ready list, and re-init the original one to the
@@ -599,7 +641,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
*/
if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
- __pm_stay_awake(epi->ws);
+ ep_pm_stay_awake(epi);
}
}
/*
@@ -627,7 +669,8 @@ static int ep_scan_ready_list(struct eventpoll *ep,
}
spin_unlock_irqrestore(&ep->lock, flags);
- mutex_unlock(&ep->mtx);
+ if (!ep_locked)
+ mutex_unlock(&ep->mtx);
/* We have to call this outside the lock */
if (pwake)
@@ -636,6 +679,12 @@ static int ep_scan_ready_list(struct eventpoll *ep,
return error;
}
+static void epi_rcu_free(struct rcu_head *head)
+{
+ struct epitem *epi = container_of(head, struct epitem, rcu);
+ kmem_cache_free(epi_cache, epi);
+}
+
/*
* Removes a "struct epitem" from the eventpoll RB tree and deallocates
* all the associated resources. Must be called with "mtx" held.
@@ -657,8 +706,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
/* Remove the current item from the list of epoll hooks */
spin_lock(&file->f_lock);
- if (ep_is_linked(&epi->fllink))
- list_del_init(&epi->fllink);
+ list_del_rcu(&epi->fllink);
spin_unlock(&file->f_lock);
rb_erase(&epi->rbn, &ep->rbr);
@@ -668,10 +716,15 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
list_del_init(&epi->rdllink);
spin_unlock_irqrestore(&ep->lock, flags);
- wakeup_source_unregister(epi->ws);
-
- /* At this point it is safe to free the eventpoll item */
- kmem_cache_free(epi_cache, epi);
+ wakeup_source_unregister(ep_wakeup_source(epi));
+ /*
+ * At this point it is safe to free the eventpoll item. Use the union
+ * field epi->rcu, since we are trying to minimize the size of
+ * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
+ * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
+ * use of the rbn field.
+ */
+ call_rcu(&epi->rcu, epi_rcu_free);
atomic_long_dec(&ep->user->epoll_watches);
@@ -704,6 +757,7 @@ static void ep_free(struct eventpoll *ep)
epi = rb_entry(rbp, struct epitem, rbn);
ep_unregister_pollwait(ep, epi);
+ cond_resched();
}
/*
@@ -711,11 +765,16 @@ static void ep_free(struct eventpoll *ep)
* point we are sure no poll callbacks will be lingering around, and also by
* holding "epmutex" we can be sure that no file cleanup code will hit
* us during this operation. So we can avoid the lock on "ep->lock".
+ * We do not need to lock ep->mtx, either, we only do it to prevent
+ * a lockdep warning.
*/
+ mutex_lock(&ep->mtx);
while ((rbp = rb_first(&ep->rbr)) != NULL) {
epi = rb_entry(rbp, struct epitem, rbn);
ep_remove(ep, epi);
+ cond_resched();
}
+ mutex_unlock(&ep->mtx);
mutex_unlock(&epmutex);
mutex_destroy(&ep->mtx);
@@ -734,6 +793,13 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
return 0;
}
+static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt)
+{
+ pt->_key = epi->event.events;
+
+ return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events;
+}
+
static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
void *priv)
{
@@ -741,10 +807,9 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
poll_table pt;
init_poll_funcptr(&pt, NULL);
+
list_for_each_entry_safe(epi, tmp, head, rdllink) {
- pt._key = epi->event.events;
- if (epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
- epi->event.events)
+ if (ep_item_poll(epi, &pt))
return POLLIN | POLLRDNORM;
else {
/*
@@ -752,7 +817,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
* callback, but it's not actually ready, as far as
* caller requested events goes. We can remove it here.
*/
- __pm_relax(epi->ws);
+ __pm_relax(ep_wakeup_source(epi));
list_del_init(&epi->rdllink);
}
}
@@ -760,15 +825,34 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
return 0;
}
+static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
+ poll_table *pt);
+
+struct readyevents_arg {
+ struct eventpoll *ep;
+ bool locked;
+};
+
static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
{
- return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1);
+ struct readyevents_arg *arg = priv;
+
+ return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL,
+ call_nests + 1, arg->locked);
}
static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
{
int pollflags;
struct eventpoll *ep = file->private_data;
+ struct readyevents_arg arg;
+
+ /*
+ * During ep_insert() we already hold the ep->mtx for the tfile.
+ * Prevent re-aquisition.
+ */
+ arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc);
+ arg.ep = ep;
/* Insert inside our poll wait queue */
poll_wait(file, &ep->poll_wait, wait);
@@ -780,7 +864,7 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
* could re-enter here.
*/
pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
- ep_poll_readyevents_proc, ep, ep, current);
+ ep_poll_readyevents_proc, &arg, ep, current);
return pollflags != -1 ? pollflags : 0;
}
@@ -825,9 +909,8 @@ static const struct file_operations eventpoll_fops = {
*/
void eventpoll_release_file(struct file *file)
{
- struct list_head *lsthead = &file->f_ep_links;
struct eventpoll *ep;
- struct epitem *epi;
+ struct epitem *epi, *next;
/*
* We don't want to get "file->f_lock" because it is not
@@ -843,17 +926,12 @@ void eventpoll_release_file(struct file *file)
* Besides, ep_remove() acquires the lock, so we can't hold it here.
*/
mutex_lock(&epmutex);
-
- while (!list_empty(lsthead)) {
- epi = list_first_entry(lsthead, struct epitem, fllink);
-
+ list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
ep = epi->ep;
- list_del_init(&epi->fllink);
mutex_lock_nested(&ep->mtx, 0);
ep_remove(ep, epi);
mutex_unlock(&ep->mtx);
}
-
mutex_unlock(&epmutex);
}
@@ -984,7 +1062,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
/* If this file is already in the ready list we exit soon */
if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
- __pm_stay_awake(epi->ws);
+ ep_pm_stay_awake_rcu(epi);
}
/*
@@ -1091,7 +1169,9 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
struct file *child_file;
struct epitem *epi;
- list_for_each_entry(epi, &file->f_ep_links, fllink) {
+ /* CTL_DEL can remove links here, but that can't increase our count */
+ rcu_read_lock();
+ list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
child_file = epi->ep->file;
if (is_file_epoll(child_file)) {
if (list_empty(&child_file->f_ep_links)) {
@@ -1113,6 +1193,7 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
"file is not an ep!\n");
}
}
+ rcu_read_unlock();
return error;
}
@@ -1146,6 +1227,7 @@ static int reverse_path_check(void)
static int ep_create_wakeup_source(struct epitem *epi)
{
const char *name;
+ struct wakeup_source *ws;
if (!epi->ep->ws) {
epi->ep->ws = wakeup_source_register("eventpoll");
@@ -1154,24 +1236,36 @@ static int ep_create_wakeup_source(struct epitem *epi)
}
name = epi->ffd.file->f_path.dentry->d_name.name;
- epi->ws = wakeup_source_register(name);
- if (!epi->ws)
+ ws = wakeup_source_register(name);
+
+ if (!ws)
return -ENOMEM;
+ rcu_assign_pointer(epi->ws, ws);
return 0;
}
-static void ep_destroy_wakeup_source(struct epitem *epi)
+/* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */
+static noinline void ep_destroy_wakeup_source(struct epitem *epi)
{
- wakeup_source_unregister(epi->ws);
- epi->ws = NULL;
+ struct wakeup_source *ws = ep_wakeup_source(epi);
+
+ RCU_INIT_POINTER(epi->ws, NULL);
+
+ /*
+ * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is
+ * used internally by wakeup_source_remove, too (called by
+ * wakeup_source_unregister), so we cannot use call_rcu
+ */
+ synchronize_rcu();
+ wakeup_source_unregister(ws);
}
/*
* Must be called with "mtx" held.
*/
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
- struct file *tfile, int fd)
+ struct file *tfile, int fd, int full_check)
{
int error, revents, pwake = 0;
unsigned long flags;
@@ -1199,13 +1293,12 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
if (error)
goto error_create_wakeup_source;
} else {
- epi->ws = NULL;
+ RCU_INIT_POINTER(epi->ws, NULL);
}
/* Initialize the poll table using the queue callback */
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
- epq.pt._key = event->events;
/*
* Attach the item to the poll hooks and get current event bits.
@@ -1214,7 +1307,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
* this operation completes, the poll callback can start hitting
* the new item.
*/
- revents = tfile->f_op->poll(tfile, &epq.pt);
+ revents = ep_item_poll(epi, &epq.pt);
/*
* We have to check if something went wrong during the poll wait queue
@@ -1227,7 +1320,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
/* Add the current item to the list of active epoll hook for this file */
spin_lock(&tfile->f_lock);
- list_add_tail(&epi->fllink, &tfile->f_ep_links);
+ list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
spin_unlock(&tfile->f_lock);
/*
@@ -1238,7 +1331,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
/* now check if we've created too many backpaths */
error = -EINVAL;
- if (reverse_path_check())
+ if (full_check && reverse_path_check())
goto error_remove_epi;
/* We have to drop the new item inside our item list to keep track of it */
@@ -1247,7 +1340,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
/* If the file is already "ready" we drop it inside the ready list */
if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
- __pm_stay_awake(epi->ws);
+ ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
@@ -1268,8 +1361,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
error_remove_epi:
spin_lock(&tfile->f_lock);
- if (ep_is_linked(&epi->fllink))
- list_del_init(&epi->fllink);
+ list_del_rcu(&epi->fllink);
spin_unlock(&tfile->f_lock);
rb_erase(&epi->rbn, &ep->rbr);
@@ -1288,7 +1380,7 @@ error_unregister:
list_del_init(&epi->rdllink);
spin_unlock_irqrestore(&ep->lock, flags);
- wakeup_source_unregister(epi->ws);
+ wakeup_source_unregister(ep_wakeup_source(epi));
error_create_wakeup_source:
kmem_cache_free(epi_cache, epi);
@@ -1314,12 +1406,11 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
* f_op->poll() call and the new event set registering.
*/
epi->event.events = event->events; /* need barrier below */
- pt._key = event->events;
epi->event.data = event->data; /* protected by mtx */
if (epi->event.events & EPOLLWAKEUP) {
- if (!epi->ws)
+ if (!ep_has_wakeup_source(epi))
ep_create_wakeup_source(epi);
- } else if (epi->ws) {
+ } else if (ep_has_wakeup_source(epi)) {
ep_destroy_wakeup_source(epi);
}
@@ -1347,7 +1438,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
* Get current event bits. We can safely use the file* here because
* its usage count has been increased by the caller of this function.
*/
- revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt);
+ revents = ep_item_poll(epi, &pt);
/*
* If the item is "hot" and it is not registered inside the ready
@@ -1357,7 +1448,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
spin_lock_irq(&ep->lock);
if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
- __pm_stay_awake(epi->ws);
+ ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
@@ -1383,6 +1474,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
unsigned int revents;
struct epitem *epi;
struct epoll_event __user *uevent;
+ struct wakeup_source *ws;
poll_table pt;
init_poll_funcptr(&pt, NULL);
@@ -1405,14 +1497,16 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
* instead, but then epi->ws would temporarily be out of sync
* with ep_is_linked().
*/
- if (epi->ws && epi->ws->active)
- __pm_stay_awake(ep->ws);
- __pm_relax(epi->ws);
+ ws = ep_wakeup_source(epi);
+ if (ws) {
+ if (ws->active)
+ __pm_stay_awake(ep->ws);
+ __pm_relax(ws);
+ }
+
list_del_init(&epi->rdllink);
- pt._key = epi->event.events;
- revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
- epi->event.events;
+ revents = ep_item_poll(epi, &pt);
/*
* If the event mask intersect the caller-requested one,
@@ -1424,7 +1518,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
if (__put_user(revents, &uevent->events) ||
__put_user(epi->event.data, &uevent->data)) {
list_add(&epi->rdllink, head);
- __pm_stay_awake(epi->ws);
+ ep_pm_stay_awake(epi);
return eventcnt ? eventcnt : -EFAULT;
}
eventcnt++;
@@ -1444,7 +1538,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
* poll callback will queue them in ep->ovflist.
*/
list_add_tail(&epi->rdllink, &ep->rdllist);
- __pm_stay_awake(epi->ws);
+ ep_pm_stay_awake(epi);
}
}
}
@@ -1460,7 +1554,7 @@ static int ep_send_events(struct eventpoll *ep,
esed.maxevents = maxevents;
esed.events = events;
- return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0);
+ return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
}
static inline struct timespec ep_set_mstimeout(long ms)
@@ -1730,36 +1824,35 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct epoll_event __user *, event)
{
int error;
- int did_lock_epmutex = 0;
- struct file *file, *tfile;
+ int full_check = 0;
+ struct fd f, tf;
struct eventpoll *ep;
struct epitem *epi;
struct epoll_event epds;
+ struct eventpoll *tep = NULL;
error = -EFAULT;
if (ep_op_has_event(op) &&
copy_from_user(&epds, event, sizeof(struct epoll_event)))
goto error_return;
- /* Get the "struct file *" for the eventpoll file */
error = -EBADF;
- file = fget(epfd);
- if (!file)
+ f = fdget(epfd);
+ if (!f.file)
goto error_return;
/* Get the "struct file *" for the target file */
- tfile = fget(fd);
- if (!tfile)
+ tf = fdget(fd);
+ if (!tf.file)
goto error_fput;
/* The target file descriptor must support poll */
error = -EPERM;
- if (!tfile->f_op || !tfile->f_op->poll)
+ if (!tf.file->f_op->poll)
goto error_tgt_fput;
/* Check if EPOLLWAKEUP is allowed */
- if ((epds.events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
- epds.events &= ~EPOLLWAKEUP;
+ ep_take_care_of_epollwakeup(&epds);
/*
* We have to check that the file structure underneath the file descriptor
@@ -1767,14 +1860,14 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
* adding an epoll file descriptor inside itself.
*/
error = -EINVAL;
- if (file == tfile || !is_file_epoll(file))
+ if (f.file == tf.file || !is_file_epoll(f.file))
goto error_tgt_fput;
/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
- ep = file->private_data;
+ ep = f.file->private_data;
/*
* When we insert an epoll file descriptor, inside another epoll file
@@ -1784,43 +1877,54 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
* and hang them on the tfile_check_list, so we can check that we
* haven't created too many possible wakeup paths.
*
- * We need to hold the epmutex across both ep_insert and ep_remove
- * b/c we want to make sure we are looking at a coherent view of
- * epoll network.
+ * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
+ * the epoll file descriptor is attaching directly to a wakeup source,
+ * unless the epoll file descriptor is nested. The purpose of taking the
+ * 'epmutex' on add is to prevent complex toplogies such as loops and
+ * deep wakeup paths from forming in parallel through multiple
+ * EPOLL_CTL_ADD operations.
*/
- if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
- mutex_lock(&epmutex);
- did_lock_epmutex = 1;
- }
+ mutex_lock_nested(&ep->mtx, 0);
if (op == EPOLL_CTL_ADD) {
- if (is_file_epoll(tfile)) {
- error = -ELOOP;
- if (ep_loop_check(ep, tfile) != 0) {
- clear_tfile_check_list();
- goto error_tgt_fput;
+ if (!list_empty(&f.file->f_ep_links) ||
+ is_file_epoll(tf.file)) {
+ full_check = 1;
+ mutex_unlock(&ep->mtx);
+ mutex_lock(&epmutex);
+ if (is_file_epoll(tf.file)) {
+ error = -ELOOP;
+ if (ep_loop_check(ep, tf.file) != 0) {
+ clear_tfile_check_list();
+ goto error_tgt_fput;
+ }
+ } else
+ list_add(&tf.file->f_tfile_llink,
+ &tfile_check_list);
+ mutex_lock_nested(&ep->mtx, 0);
+ if (is_file_epoll(tf.file)) {
+ tep = tf.file->private_data;
+ mutex_lock_nested(&tep->mtx, 1);
}
- } else
- list_add(&tfile->f_tfile_llink, &tfile_check_list);
+ }
}
- mutex_lock_nested(&ep->mtx, 0);
-
/*
* Try to lookup the file inside our RB tree, Since we grabbed "mtx"
* above, we can be sure to be able to use the item looked up by
* ep_find() till we release the mutex.
*/
- epi = ep_find(ep, tfile, fd);
+ epi = ep_find(ep, tf.file, fd);
error = -EINVAL;
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
epds.events |= POLLERR | POLLHUP;
- error = ep_insert(ep, &epds, tfile, fd);
+ error = ep_insert(ep, &epds, tf.file, fd, full_check);
} else
error = -EEXIST;
- clear_tfile_check_list();
+ if (full_check)
+ clear_tfile_check_list();
break;
case EPOLL_CTL_DEL:
if (epi)
@@ -1836,15 +1940,17 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
error = -ENOENT;
break;
}
+ if (tep != NULL)
+ mutex_unlock(&tep->mtx);
mutex_unlock(&ep->mtx);
error_tgt_fput:
- if (did_lock_epmutex)
+ if (full_check)
mutex_unlock(&epmutex);
- fput(tfile);
+ fdput(tf);
error_fput:
- fput(file);
+ fdput(f);
error_return:
return error;
@@ -1916,8 +2022,8 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
return -EINVAL;
if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
return -EFAULT;
- sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
- sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+ sigsaved = current->blocked;
+ set_current_blocked(&ksigmask);
}
error = sys_epoll_wait(epfd, events, maxevents, timeout);
@@ -1934,12 +2040,58 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
sizeof(sigsaved));
set_restore_sigmask();
} else
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+ set_current_blocked(&sigsaved);
}
return error;
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
+ struct epoll_event __user *, events,
+ int, maxevents, int, timeout,
+ const compat_sigset_t __user *, sigmask,
+ compat_size_t, sigsetsize)
+{
+ long err;
+ compat_sigset_t csigmask;
+ sigset_t ksigmask, sigsaved;
+
+ /*
+ * If the caller wants a certain signal mask to be set during the wait,
+ * we apply it here.
+ */
+ if (sigmask) {
+ if (sigsetsize != sizeof(compat_sigset_t))
+ return -EINVAL;
+ if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
+ return -EFAULT;
+ sigset_from_compat(&ksigmask, &csigmask);
+ sigsaved = current->blocked;
+ set_current_blocked(&ksigmask);
+ }
+
+ err = sys_epoll_wait(epfd, events, maxevents, timeout);
+
+ /*
+ * If we changed the signal mask, we need to restore the original one.
+ * In case we've got a signal while waiting, we do not restore the
+ * signal mask yet, and we allow do_signal() to deliver the signal on
+ * the way back to userspace, before the signal mask is restored.
+ */
+ if (sigmask) {
+ if (err == -EINTR) {
+ memcpy(&current->saved_sigmask, &sigsaved,
+ sizeof(sigsaved));
+ set_restore_sigmask();
+ } else
+ set_current_blocked(&sigsaved);
+ }
+
+ return err;
+}
+#endif
+
static int __init eventpoll_init(void)
{
struct sysinfo si;
@@ -1964,6 +2116,12 @@ static int __init eventpoll_init(void)
/* Initialize the structure used to perform file's f_op->poll() calls */
ep_nested_calls_init(&poll_readywalk_ncalls);
+ /*
+ * We can have many thousands of epitems, so prevent this from
+ * using an extra cache line on 64-bit (and smaller) CPUs
+ */
+ BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);
+
/* Allocates slab cache used to allocate "struct epitem" items */
epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);