/*
* fs/eventpoll.c ( Efficent event polling implementation )
- * Copyright (C) 2001,...,2003 Davide Libenzi
+ * Copyright (C) 2001,...,2006 Davide Libenzi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
#include <linux/eventpoll.h>
#include <linux/mount.h>
#include <linux/bitops.h>
+#include <linux/mutex.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/io.h>
* LOCKING:
* There are three level of locking required by epoll :
*
- * 1) epsem (semaphore)
+ * 1) epmutex (mutex)
* 2) ep->sem (rw_semaphore)
* 3) ep->lock (rw_lock)
*
* if a file has been pushed inside an epoll set and it is then
* close()d without a previous call toepoll_ctl(EPOLL_CTL_DEL).
* It is possible to drop the "ep->sem" and to use the global
- * semaphore "epsem" (together with "ep->lock") to have it working,
+ * semaphore "epmutex" (together with "ep->lock") to have it working,
* but having "ep->sem" will make the interface more scalable.
- * Events that require holding "epsem" are very rare, while for
+ * Events that require holding "epmutex" are very rare, while for
* normal operations the epoll private "ep->sem" will guarantee
* a greater scalability.
*/
/* Maximum msec timeout value storeable in a long int */
#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
+#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
+
struct epoll_filefd {
struct file *file;
*/
struct wake_task_node {
struct list_head llink;
- task_t *task;
+ struct task_struct *task;
wait_queue_head_t *wq;
};
int maxevents, long timeout);
static int eventpollfs_delete_dentry(struct dentry *dentry);
static struct inode *ep_eventpoll_inode(void);
-static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type,
- int flags, const char *dev_name,
- void *data);
+static int eventpollfs_get_sb(struct file_system_type *fs_type,
+ int flags, const char *dev_name,
+ void *data, struct vfsmount *mnt);
/*
* This semaphore is used to serialize ep_free() and eventpoll_release_file().
*/
-static struct semaphore epsem;
+static struct mutex epmutex;
/* Safe wake up implementation */
static struct poll_safewake psw;
/* Slab cache used to allocate "struct epitem" */
-static kmem_cache_t *epi_cache;
+static struct kmem_cache *epi_cache __read_mostly;
/* Slab cache used to allocate "struct eppoll_entry" */
-static kmem_cache_t *pwq_cache;
+static struct kmem_cache *pwq_cache __read_mostly;
/* Virtual fs used to allocate inodes for eventpoll files */
-static struct vfsmount *eventpoll_mnt;
+static struct vfsmount *eventpoll_mnt __read_mostly;
/* File callbacks that implement the eventpoll file behaviour */
-static struct file_operations eventpoll_fops = {
+static const struct file_operations eventpoll_fops = {
.release = ep_eventpoll_close,
.poll = ep_eventpoll_poll
};
/* Special initialization for the rb-tree node to detect linkage */
static inline void ep_rb_initnode(struct rb_node *n)
{
- n->rb_parent = n;
+ rb_set_parent(n, n);
}
/* Removes a node from the rb-tree and marks it for a fast is-linked check */
static inline void ep_rb_erase(struct rb_node *n, struct rb_root *r)
{
rb_erase(n, r);
- n->rb_parent = n;
+ rb_set_parent(n, n);
}
/* Fast check to verify that the item is linked to the main rb-tree */
static inline int ep_rb_linked(struct rb_node *n)
{
- return n->rb_parent != n;
+ return rb_parent(n) != n;
}
/*
{
int wake_nests = 0;
unsigned long flags;
- task_t *this_task = current;
+ struct task_struct *this_task = current;
struct list_head *lsthead = &psw->wake_task_list, *lnk;
struct wake_task_node *tncur;
struct wake_task_node tnode;
}
-/* Used to initialize the epoll bits inside the "struct file" */
-void eventpoll_init_file(struct file *file)
-{
-
- INIT_LIST_HEAD(&file->f_ep_links);
- spin_lock_init(&file->f_ep_lock);
-}
-
-
/*
* This is called from eventpoll_release() to unlink files from the eventpoll
* interface. We need to have this facility to cleanup correctly files that are
* cleanup path, and this means that noone is using this file anymore.
* The only hit might come from ep_free() but by holding the semaphore
* will correctly serialize the operation. We do need to acquire
- * "ep->sem" after "epsem" because ep_remove() requires it when called
+ * "ep->sem" after "epmutex" because ep_remove() requires it when called
* from anywhere but ep_free().
*/
- down(&epsem);
+ mutex_lock(&epmutex);
while (!list_empty(lsthead)) {
epi = list_entry(lsthead->next, struct epitem, fllink);
up_write(&ep->sem);
}
- up(&epsem);
+ mutex_unlock(&epmutex);
}
*/
asmlinkage long sys_epoll_create(int size)
{
- int error, fd;
+ int error, fd = -1;
struct eventpoll *ep;
struct inode *inode;
struct file *file;
return error;
}
-#define MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
/*
* Implement the event wait interface for the eventpoll file. It is the kernel
current, epfd, events, maxevents, timeout));
/* The maximum number of event must be greater than zero */
- if (maxevents <= 0 || maxevents > MAX_EVENTS)
+ if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
return -EINVAL;
/* Verify that the area passed by the user is writeable */
}
+#ifdef TIF_RESTORE_SIGMASK
+
+/*
+ * Implement the event wait interface for the eventpoll file. It is the kernel
+ * part of the user space epoll_pwait(2).
+ */
+asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
+ int maxevents, int timeout, const sigset_t __user *sigmask,
+ size_t sigsetsize)
+{
+ int error;
+ sigset_t ksigmask, sigsaved;
+
+ /*
+ * If the caller wants a certain signal mask to be set during the wait,
+ * we apply it here.
+ */
+ if (sigmask) {
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+ if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
+ return -EFAULT;
+ sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+ sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+ }
+
+ error = sys_epoll_wait(epfd, events, maxevents, timeout);
+
+ /*
+ * If we changed the signal mask, we need to restore the original one.
+ * In case we've got a signal while waiting, we do not restore the
+ * signal mask yet, and we allow do_signal() to deliver the signal on
+ * the way back to userspace, before the signal mask is restored.
+ */
+ if (sigmask) {
+ if (error == -EINTR) {
+ memcpy(¤t->saved_sigmask, &sigsaved,
+ sizeof(sigsaved));
+ set_thread_flag(TIF_RESTORE_SIGMASK);
+ } else
+ sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+ }
+
+ return error;
+}
+
+#endif /* #ifdef TIF_RESTORE_SIGMASK */
+
+
/*
* Creates the file descriptor to be used by the epoll interface.
*/
/* Allocates an inode from the eventpoll file system */
inode = ep_eventpoll_inode();
- error = PTR_ERR(inode);
- if (IS_ERR(inode))
+ if (IS_ERR(inode)) {
+ error = PTR_ERR(inode);
goto eexit_2;
+ }
/* Allocates a free descriptor to plug the file onto */
error = get_unused_fd();
goto eexit_4;
dentry->d_op = &eventpollfs_dentry_operations;
d_add(dentry, inode);
- file->f_vfsmnt = mntget(eventpoll_mnt);
- file->f_dentry = dentry;
+ file->f_path.mnt = mntget(eventpoll_mnt);
+ file->f_path.dentry = dentry;
file->f_mapping = inode->i_mapping;
file->f_pos = 0;
* We do not need to hold "ep->sem" here because the epoll file
* is on the way to be removed and no one has references to it
* anymore. The only hit might come from eventpoll_release_file() but
- * holding "epsem" is sufficent here.
+ * holding "epmutex" is sufficent here.
*/
- down(&epsem);
+ mutex_lock(&epmutex);
/*
* Walks through the whole tree by unregistering poll callbacks.
ep_remove(ep, epi);
}
- up(&epsem);
+ mutex_unlock(&epmutex);
}
struct epitem *epi = ep_item_from_epqueue(pt);
struct eppoll_entry *pwq;
- if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, SLAB_KERNEL))) {
+ if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
pwq->whead = whead;
pwq->base = epi;
struct ep_pqueue epq;
error = -ENOMEM;
- if (!(epi = kmem_cache_alloc(epi_cache, SLAB_KERNEL)))
+ if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
goto eexit_1;
/* Item initialization follow here ... */
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
- wake_up(&ep->wq);
+ __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
- wake_up(&ep->wq);
+ __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+ TASK_INTERRUPTIBLE);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
eexit_1:
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_unlink(%p, %p) = %d\n",
- current, ep, epi->file, error));
+ current, ep, epi->ffd.file, error));
return error;
}
struct eventpoll *ep = epi->ep;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
- current, epi->file, epi, ep));
+ current, epi->ffd.file, epi, ep));
write_lock_irqsave(&ep->lock, flags);
* wait list.
*/
if (waitqueue_active(&ep->wq))
- wake_up(&ep->wq);
+ __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+ TASK_INTERRUPTIBLE);
if (waitqueue_active(&ep->poll_wait))
pwake++;
* wait list.
*/
if (waitqueue_active(&ep->wq))
- wake_up(&ep->wq);
+ __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+ TASK_INTERRUPTIBLE);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
* ep_poll_callback() when events will become available.
*/
init_waitqueue_entry(&wait, current);
- add_wait_queue(&ep->wq, &wait);
+ __add_wait_queue(&ep->wq, &wait);
for (;;) {
/*
jtimeout = schedule_timeout(jtimeout);
write_lock_irqsave(&ep->lock, flags);
}
- remove_wait_queue(&ep->wq, &wait);
+ __remove_wait_queue(&ep->wq, &wait);
set_current_state(TASK_RUNNING);
}
inode->i_uid = current->fsuid;
inode->i_gid = current->fsgid;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- inode->i_blksize = PAGE_SIZE;
return inode;
eexit_1:
}
-static struct super_block *
+static int
eventpollfs_get_sb(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data)
+ const char *dev_name, void *data, struct vfsmount *mnt)
{
- return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC);
+ return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC,
+ mnt);
}
{
int error;
- init_MUTEX(&epsem);
+ mutex_init(&epmutex);
/* Initialize the structure used to perform safe poll wait head wake ups */
ep_poll_safewake_init(&psw);