X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=fs%2Feventpoll.c;h=3ae644e7e8606cddcdc1cb02791e51dfc73cb941;hb=refs%2Fheads%2Fvserver;hp=dbec5f9d59921c39ba36cd14ddbccde877558b93;hpb=5273a3df6485dc2ad6aa7ddd441b9a21970f003b;p=linux-2.6.git

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index dbec5f9d5..3ae644e7e 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1,6 +1,6 @@
 /*
 * fs/eventpoll.c ( Efficient event polling implementation )
- * Copyright (C) 2001,...,2003 Davide Libenzi
+ * Copyright (C) 2001,...,2006 Davide Libenzi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -29,10 +29,12 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
-#include
+#include
+#include
 #include
 #include
 #include
@@ -45,7 +47,7 @@
 * LOCKING:
 * There are three levels of locking required by epoll:
 *
- * 1) epsem (semaphore)
+ * 1) epmutex (mutex)
 * 2) ep->sem (rw_semaphore)
 * 3) ep->lock (rw_lock)
 *
@@ -66,9 +68,9 @@
 * if a file has been pushed inside an epoll set and it is then
 * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
 * It is possible to drop the "ep->sem" and to use the global
- * semaphore "epsem" (together with "ep->lock") to have it working,
+ * mutex "epmutex" (together with "ep->lock") to have it working,
 * but having "ep->sem" will make the interface more scalable.
- * Events that require holding "epsem" are very rare, while for
+ * Events that require holding "epmutex" are very rare, while for
 * normal operations the epoll private "ep->sem" will guarantee
 * a greater scalability.
 */
@@ -100,60 +102,16 @@
 /* Maximum number of poll wake up nests we are allowing */
 #define EP_MAX_POLLWAKE_NESTS 4
 
-/* Maximum size of the hash in bits ( 2^N ) */
-#define EP_MAX_HASH_BITS 17
+/* Maximum msec timeout value storable in a long int */
+#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
 
-/* Minimum size of the hash in bits ( 2^N ) */
-#define EP_MIN_HASH_BITS 9
-
-/* Number of hash entries ( "struct list_head" ) inside a page */
-#define EP_HENTRY_X_PAGE (PAGE_SIZE / sizeof(struct list_head))
-
-/* Maximum size of the hash in pages */
-#define EP_MAX_HPAGES ((1 << EP_MAX_HASH_BITS) / EP_HENTRY_X_PAGE + 1)
-
-/* Number of pages allocated for an "hbits" sized hash table */
-#define EP_HASH_PAGES(hbits) ((int) ((1 << (hbits)) / EP_HENTRY_X_PAGE + \
- ((1 << (hbits)) % EP_HENTRY_X_PAGE ? 1: 0)))
-
-/* Macro to allocate a "struct epitem" from the slab cache */
-#define EPI_MEM_ALLOC() (struct epitem *) kmem_cache_alloc(epi_cache, SLAB_KERNEL)
-
-/* Macro to free a "struct epitem" to the slab cache */
-#define EPI_MEM_FREE(p) kmem_cache_free(epi_cache, p)
-
-/* Macro to allocate a "struct eppoll_entry" from the slab cache */
-#define PWQ_MEM_ALLOC() (struct eppoll_entry *) kmem_cache_alloc(pwq_cache, SLAB_KERNEL)
-
-/* Macro to free a "struct eppoll_entry" to the slab cache */
-#define PWQ_MEM_FREE(p) kmem_cache_free(pwq_cache, p)
-
-/* Fast test to see if the file is an eventpoll file */
-#define IS_FILE_EPOLL(f) ((f)->f_op == &eventpoll_fops)
-
-/*
- * Remove the item from the list and perform its initialization.
- * This is useful for us because we can test if the item is linked
- * using "EP_IS_LINKED(p)".
- */
-#define EP_LIST_DEL(p) do { list_del(p); INIT_LIST_HEAD(p); } while (0)
-
-/* Tells us if the item is currently linked */
-#define EP_IS_LINKED(p) (!list_empty(p))
-
-/* Get the "struct epitem" from a wait queue pointer */
-#define EP_ITEM_FROM_WAIT(p) ((struct epitem *) container_of(p, struct eppoll_entry, wait)->base)
-
-/* Get the "struct epitem" from an epoll queue wrapper */
-#define EP_ITEM_FROM_EPQUEUE(p) (container_of(p, struct ep_pqueue, pt)->epi)
-
-/*
- * This is used to optimize the event transfer to userspace. Since this
- * is kept on stack, it should be pretty small.
- */
-#define EP_MAX_BUF_EVENTS 32
+#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
 
+struct epoll_filefd {
+ struct file *file;
+ int fd;
+};
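The hunk above replaces the hash-sizing macros with two bounds: EP_MAX_EVENTS caps the per-call event count, and EP_MAX_MSTIMEO caps the millisecond timeout so the jiffies conversion in ep_poll() cannot overflow a long. A minimal user-space sketch of that arithmetic (the HZ value is an assumption; this is illustrative C, not kernel code):

#include <limits.h>
#include <stdio.h>

#define HZ 250UL        /* assumed tick rate; kernels of this era used 100-1000 */

int main(void)
{
        /* Largest msec value for which (ms * HZ + 999) still fits in a long;
         * this is the second operand of the EP_MAX_MSTIMEO min() above. */
        unsigned long long max_ms = (LONG_MAX - 999ULL) / HZ;

        long ms = 1300;
        /* The ep_poll() conversion: milliseconds to jiffies, rounding up so
         * a 1 ms request never truncates to a zero-jiffy (busy) wait. */
        long jif = (ms * HZ + 999) / 1000;

        printf("clamp at %llu ms\n", max_ms);
        printf("%ld ms -> %ld jiffies\n", ms, jif);      /* 1300 ms -> 325 jiffies */
        return 0;
}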
 
 /*
 * Node that is linked into the "wake_task_list" member of the "struct poll_safewake".
@@ -164,7 +122,7 @@
 */
 struct wake_task_node {
 struct list_head llink;
- task_t *task;
+ struct task_struct *task;
 wait_queue_head_t *wq;
 };
 
@@ -203,11 +161,8 @@ struct eventpoll {
 /* List of ready file descriptors */
 struct list_head rdllist;
 
- /* Size of the hash */
- unsigned int hashbits;
-
- /* Pages for the "struct epitem" hash */
- char *hpages[EP_MAX_HPAGES];
+ /* RB-Tree root used to store monitored fd structs */
+ struct rb_root rbr;
 };
 
 /* Wait structure used by the poll hooks */
@@ -233,14 +188,14 @@ struct eppoll_entry {
 * have an entry of this type linked to the hash.
 */
 struct epitem {
- /* List header used to link this structure to the eventpoll hash */
- struct list_head llink;
+ /* RB-Tree node used to link this structure to the eventpoll rb-tree */
+ struct rb_node rbn;
 
 /* List header used to link this structure to the eventpoll ready list */
 struct list_head rdllink;
 
- /* The file descriptor this item refers to */
- int fd;
+ /* The file descriptor information this item refers to */
+ struct epoll_filefd ffd;
 
 /* Number of active wait queues attached to poll operations */
 int nwait;
@@ -251,9 +206,6 @@ struct epitem {
 /* The "container" of this item */
 struct eventpoll *ep;
 
- /* The file this item refers to */
- struct file *file;
-
 /* The structure that describes the interested events and the source fd */
 struct epoll_event event;
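struct epitem now embeds its rb_node directly, and rb_entry() (a container_of() wrapper) recovers the item from the node. A self-contained user-space sketch of that intrusive-node pattern (all names here are illustrative stand-ins, not the kernel types):

#include <stddef.h>
#include <stdio.h>

struct node { struct node *left, *right; };     /* stand-in for rb_node */

struct item {
        struct node n;                          /* embedded link, like epitem.rbn */
        int fd;                                 /* payload, like epitem.ffd.fd */
};

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
        struct item it = { { NULL, NULL }, 42 };
        struct node *np = &it.n;                /* what the tree actually links */
        struct item *back = container_of(np, struct item, n);

        printf("fd = %d\n", back->fd);          /* prints 42 */
        return 0;
}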
@@ -286,22 +238,16 @@ struct ep_pqueue {
 static void ep_poll_safewake_init(struct poll_safewake *psw);
 static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq);
-static unsigned int ep_get_hash_bits(unsigned int hintsize);
-static int ep_getfd(int *efd, struct inode **einode, struct file **efile);
-static int ep_alloc_pages(char **pages, int numpages);
-static int ep_free_pages(char **pages, int numpages);
-static int ep_file_init(struct file *file, unsigned int hashbits);
-static unsigned int ep_hash_index(struct eventpoll *ep, struct file *file,
- int fd);
-static struct list_head *ep_hash_entry(struct eventpoll *ep,
- unsigned int index);
-static int ep_init(struct eventpoll *ep, unsigned int hashbits);
+static int ep_getfd(int *efd, struct inode **einode, struct file **efile,
+ struct eventpoll *ep);
+static int ep_alloc(struct eventpoll **pep);
 static void ep_free(struct eventpoll *ep);
 static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
 static void ep_use_epitem(struct epitem *epi);
 static void ep_release_epitem(struct epitem *epi);
 static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
 poll_table *pt);
+static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi);
 static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 struct file *tfile, int fd);
 static int ep_modify(struct eventpoll *ep, struct epitem *epi,
@@ -309,7 +255,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
 static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi);
 static int ep_unlink(struct eventpoll *ep, struct epitem *epi);
 static int ep_remove(struct eventpoll *ep, struct epitem *epi);
-static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync);
+static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key);
 static int ep_eventpoll_close(struct inode *inode, struct file *file);
 static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait);
 static int ep_collect_ready_items(struct eventpoll *ep,
@@ -324,29 +270,29 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 int maxevents, long timeout);
 static int eventpollfs_delete_dentry(struct dentry *dentry);
 static struct inode *ep_eventpoll_inode(void);
-static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type,
- int flags, const char *dev_name,
- void *data);
+static int eventpollfs_get_sb(struct file_system_type *fs_type,
+ int flags, const char *dev_name,
+ void *data, struct vfsmount *mnt);
 
 /*
 * This mutex is used to serialize ep_free() and eventpoll_release_file().
 */
-struct semaphore epsem;
+static struct mutex epmutex;
 
 /* Safe wake up implementation */
 static struct poll_safewake psw;
 
 /* Slab cache used to allocate "struct epitem" */
-static kmem_cache_t *epi_cache;
+static struct kmem_cache *epi_cache __read_mostly;
 
 /* Slab cache used to allocate "struct eppoll_entry" */
-static kmem_cache_t *pwq_cache;
+static struct kmem_cache *pwq_cache __read_mostly;
 
 /* Virtual fs used to allocate inodes for eventpoll files */
-static struct vfsmount *eventpoll_mnt;
+static struct vfsmount *eventpoll_mnt __read_mostly;
 
 /* File callbacks that implement the eventpoll file behaviour */
-static struct file_operations eventpoll_fops = {
+static const struct file_operations eventpoll_fops = {
 .release = ep_eventpoll_close,
 .poll = ep_eventpoll_poll
 };
@@ -368,6 +314,82 @@ static struct dentry_operations eventpollfs_dentry_operations = {
 
 
 
+/* Fast test to see if the file is an eventpoll file */
+static inline int is_file_epoll(struct file *f)
+{
+ return f->f_op == &eventpoll_fops;
+}
+
+/* Setup the structure that is used as key for the rb-tree */
+static inline void ep_set_ffd(struct epoll_filefd *ffd,
+ struct file *file, int fd)
+{
+ ffd->file = file;
+ ffd->fd = fd;
+}
+
+/* Compare rb-tree keys */
+static inline int ep_cmp_ffd(struct epoll_filefd *p1,
+ struct epoll_filefd *p2)
+{
+ return (p1->file > p2->file ? +1:
+ (p1->file < p2->file ? -1 : p1->fd - p2->fd));
+}
+
+/* Special initialization for the rb-tree node to detect linkage */
+static inline void ep_rb_initnode(struct rb_node *n)
+{
+ rb_set_parent(n, n);
+}
+
+/* Removes a node from the rb-tree and marks it for a fast is-linked check */
+static inline void ep_rb_erase(struct rb_node *n, struct rb_root *r)
+{
+ rb_erase(n, r);
+ rb_set_parent(n, n);
+}
+
+/* Fast check to verify that the item is linked to the main rb-tree */
+static inline int ep_rb_linked(struct rb_node *n)
+{
+ return rb_parent(n) != n;
+}
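ep_cmp_ffd() gives the rb-tree a total order over {file pointer, descriptor} keys: compare file pointers first, fall back to fd numbers only for the same file. A user-space sketch of the same ordering (stand-in types, not the kernel structures; pointers compared through uintptr_t for portability):

#include <stdint.h>
#include <stdio.h>

struct ffd { const void *file; int fd; };       /* stand-in for epoll_filefd */

static int cmp_ffd(const struct ffd *a, const struct ffd *b)
{
        if (a->file != b->file)
                return (uintptr_t)a->file > (uintptr_t)b->file ? 1 : -1;
        return a->fd - b->fd;                   /* same file: order by fd */
}

int main(void)
{
        int x, y;
        struct ffd a = { &x, 3 }, b = { &x, 7 }, c = { &y, 3 };

        printf("%d\n", cmp_ffd(&a, &b));        /* negative: same file, lower fd */
        printf("%d\n", cmp_ffd(&a, &a));        /* 0: duplicate key detected */
        printf("%d\n", cmp_ffd(&b, &c) != 0);   /* 1: distinct files never tie */
        return 0;
}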
+
+/*
+ * Remove the item from the list and perform its initialization.
+ * This is useful for us because we can test if the item is linked
+ * using "ep_is_linked(p)".
+ */
+static inline void ep_list_del(struct list_head *p)
+{
+ list_del(p);
+ INIT_LIST_HEAD(p);
+}
+
+/* Tells us if the item is currently linked */
+static inline int ep_is_linked(struct list_head *p)
+{
+ return !list_empty(p);
+}
+
+/* Get the "struct epitem" from a wait queue pointer */
+static inline struct epitem * ep_item_from_wait(wait_queue_t *p)
+{
+ return container_of(p, struct eppoll_entry, wait)->base;
+}
+
+/* Get the "struct epitem" from an epoll queue wrapper */
+static inline struct epitem * ep_item_from_epqueue(poll_table *p)
+{
+ return container_of(p, struct ep_pqueue, pt)->epi;
+}
+
+/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
+static inline int ep_op_hash_event(int op)
+{
+ return op != EPOLL_CTL_DEL;
+}
+
 /* Initialize the poll safe wake up structure */
 static void ep_poll_safewake_init(struct poll_safewake *psw)
 {
@@ -393,7 +415,7 @@ static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
 {
 int wake_nests = 0;
 unsigned long flags;
- task_t *this_task = current;
+ struct task_struct *this_task = current;
 struct list_head *lsthead = &psw->wake_task_list, *lnk;
 struct wake_task_node *tncur;
 struct wake_task_node tnode;
@@ -432,28 +454,6 @@ static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
 }
 
 
-/*
- * Calculate the size of the hash in bits. The returned size will be
- * bounded between EP_MIN_HASH_BITS and EP_MAX_HASH_BITS.
- */
-static unsigned int ep_get_hash_bits(unsigned int hintsize)
-{
- unsigned int i, val;
-
- for (i = 0, val = 1; val < hintsize && i < EP_MAX_HASH_BITS; i++, val <<= 1);
- return i < EP_MIN_HASH_BITS ? EP_MIN_HASH_BITS: i;
-}
-
-
-/* Used to initialize the epoll bits inside the "struct file" */
-void eventpoll_init_file(struct file *file)
-{
-
- INIT_LIST_HEAD(&file->f_ep_links);
- spin_lock_init(&file->f_ep_lock);
-}
-
-
 /*
 * This is called from eventpoll_release() to unlink files from the eventpoll
 * interface. We need to have this facility to cleanup correctly files that are
@@ -471,22 +471,22 @@ void eventpoll_release_file(struct file *file)
 * cleanup path, and this means that no one is using this file anymore.
 * The only hit might come from ep_free(), but holding the mutex
 * will correctly serialize the operation. We do need to acquire
- * "ep->sem" after "epsem" because ep_remove() requires it when called
+ * "ep->sem" after "epmutex" because ep_remove() requires it when called
 * from anywhere but ep_free().
 */
- down(&epsem);
+ mutex_lock(&epmutex);
 
 while (!list_empty(lsthead)) {
 epi = list_entry(lsthead->next, struct epitem, fllink);
 
 ep = epi->ep;
- EP_LIST_DEL(&epi->fllink);
+ ep_list_del(&epi->fllink);
 down_write(&ep->sem);
 ep_remove(ep, epi);
 up_write(&ep->sem);
 }
 
- up(&epsem);
+ mutex_unlock(&epmutex);
 }
 
 
@@ -499,38 +499,38 @@ void eventpoll_release_file(struct file *file)
 */
asmlinkage long sys_epoll_create(int size)
{
- int error, fd;
- unsigned int hashbits;
+ int error, fd = -1;
+ struct eventpoll *ep;
 struct inode *inode;
 struct file *file;
 
 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
 current, size));
 
- /* Correctly size the hash */
- hashbits = ep_get_hash_bits((unsigned int) size);
+ /*
+ * Sanity check on the size parameter, and create the internal data
+ * structure ( "struct eventpoll" ).
+ */
+ error = -EINVAL;
+ if (size <= 0 || (error = ep_alloc(&ep)) != 0)
+ goto eexit_1;
 
 /*
 * Creates all the items needed to setup an eventpoll file. That is,
 * a file structure, an inode and a free file descriptor.
 */
- error = ep_getfd(&fd, &inode, &file);
- if (error)
- goto eexit_1;
-
- /* Setup the file internal data structure ( "struct eventpoll" ) */
- error = ep_file_init(file, hashbits);
+ error = ep_getfd(&fd, &inode, &file, ep);
 if (error)
 goto eexit_2;
 
 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
 current, size, fd));
 
 return fd;
 
eexit_2:
- sys_close(fd);
+ ep_free(ep);
+ kfree(ep);
eexit_1:
 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
 current, size, error));
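sys_epoll_create() keeps this file's eexit_1/eexit_2 unwind style: resources are acquired in order and released in reverse through goto labels, so each failure point jumps past exactly the cleanup it does not need. A minimal user-space sketch of the idiom (hypothetical allocations, illustrative only):

#include <stdio.h>
#include <stdlib.h>

static int setup(void)
{
        int error = -1;                 /* pessimistic, like "error = -ENOMEM" */
        char *a, *b;

        a = malloc(16);
        if (!a)
                goto eexit_1;
        b = malloc(16);
        if (!b)
                goto eexit_2;

        printf("both resources acquired\n");
        free(b);
        free(a);
        return 0;

eexit_2:
        free(a);                        /* undo step 1 only */
eexit_1:
        return error;
}

int main(void)
{
        return setup() ? 1 : 0;
}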
 
@@ -557,7 +557,8 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
 current, epfd, op, fd, event));
 
 error = -EFAULT;
- if (copy_from_user(&epds, event, sizeof(struct epoll_event)))
+ if (ep_op_hash_event(op) &&
+ copy_from_user(&epds, event, sizeof(struct epoll_event)))
 goto eexit_1;
 
 /* Get the "struct file *" for the eventpoll file */
@@ -582,7 +583,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
 * adding an epoll file descriptor inside itself.
 */
 error = -EINVAL;
- if (file == tfile || !IS_FILE_EPOLL(file))
+ if (file == tfile || !is_file_epoll(file))
 goto eexit_3;
 
 /*
@@ -657,12 +658,14 @@ asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
 current, epfd, events, maxevents, timeout));
 
 /* The maximum number of events must be greater than zero */
- if (maxevents <= 0)
+ if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
 return -EINVAL;
 
 /* Verify that the area passed by the user is writeable */
- if ((error = verify_area(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))))
+ if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
+ error = -EFAULT;
 goto eexit_1;
+ }
 
 /* Get the "struct file *" for the eventpoll file */
 error = -EBADF;
@@ -675,7 +678,7 @@ asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
 * the user passed to us _is_ an eventpoll file.
 */
 error = -EINVAL;
- if (!IS_FILE_EPOLL(file))
+ if (!is_file_epoll(file))
 goto eexit_2;
 
 /*
@@ -697,10 +700,60 @@ eexit_1:
 }
 
 
+#ifdef TIF_RESTORE_SIGMASK
+
+/*
+ * Implement the event wait interface for the eventpoll file. It is the kernel
+ * part of the user space epoll_pwait(2).
+ */
+asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
+ int maxevents, int timeout, const sigset_t __user *sigmask,
+ size_t sigsetsize)
+{
+ int error;
+ sigset_t ksigmask, sigsaved;
+
+ /*
+ * If the caller wants a certain signal mask to be set during the wait,
+ * we apply it here.
+ */
+ if (sigmask) {
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+ if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
+ return -EFAULT;
+ sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+ sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+ }
+
+ error = sys_epoll_wait(epfd, events, maxevents, timeout);
+
+ /*
+ * If we changed the signal mask, we need to restore the original one.
+ * In case we've got a signal while waiting, we do not restore the
+ * signal mask yet, and we allow do_signal() to deliver the signal on
+ * the way back to userspace, before the signal mask is restored.
+ */
+ if (sigmask) {
+ if (error == -EINTR) {
+ memcpy(&current->saved_sigmask, &sigsaved,
+ sizeof(sigsaved));
+ set_thread_flag(TIF_RESTORE_SIGMASK);
+ } else
+ sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+ }
+
+ return error;
+}
+
+#endif /* #ifdef TIF_RESTORE_SIGMASK */
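The new sys_epoll_pwait() atomically swaps the signal mask around the wait, closing the race a separate sigprocmask() + epoll_wait() pair would leave open. User space reaches it through epoll_pwait(2); a minimal usage sketch (error handling trimmed for brevity):

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
        int epfd = epoll_create(16);    /* size is only a hint here */
        struct epoll_event ev = { 0 }, out[8];
        sigset_t mask;

        ev.events = EPOLLIN;
        ev.data.fd = STDIN_FILENO;
        epoll_ctl(epfd, EPOLL_CTL_ADD, STDIN_FILENO, &ev);

        sigfillset(&mask);
        sigdelset(&mask, SIGINT);       /* only SIGINT may interrupt the wait */

        int n = epoll_pwait(epfd, out, 8, 5000 /* ms */, &mask);
        if (n > 0)
                printf("fd %d ready\n", out[0].data.fd);
        else if (n == 0)
                printf("timed out\n");

        close(epfd);
        return 0;
}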
 
 /*
 * Creates the file descriptor to be used by the epoll interface.
 */
-static int ep_getfd(int *efd, struct inode **einode, struct file **efile)
+static int ep_getfd(int *efd, struct inode **einode, struct file **efile,
+ struct eventpoll *ep)
 {
 struct qstr this;
 char name[32];
@@ -717,9 +770,10 @@ static int ep_getfd(int *efd, struct inode **einode, struct file **efile)
 
 /* Allocates an inode from the eventpoll file system */
 inode = ep_eventpoll_inode();
- error = PTR_ERR(inode);
- if (IS_ERR(inode))
+ if (IS_ERR(inode)) {
+ error = PTR_ERR(inode);
 goto eexit_2;
+ }
 
 /* Allocates a free descriptor to plug the file onto */
 error = get_unused_fd();
@@ -741,8 +795,8 @@ static int ep_getfd(int *efd, struct inode **einode, struct file **efile)
 goto eexit_4;
 dentry->d_op = &eventpollfs_dentry_operations;
 d_add(dentry, inode);
- file->f_vfsmnt = mntget(eventpoll_mnt);
- file->f_dentry = dget(dentry);
+ file->f_path.mnt = mntget(eventpoll_mnt);
+ file->f_path.dentry = dentry;
 file->f_mapping = inode->i_mapping;
 
 file->f_pos = 0;
@@ -750,7 +804,7 @@ static int ep_getfd(int *efd, struct inode **einode, struct file **efile)
 file->f_op = &eventpoll_fops;
 file->f_mode = FMODE_READ;
 file->f_version = 0;
- file->private_data = NULL;
+ file->private_data = ep;
 
 /* Install the new setup file into the allocated fd. */
 fd_install(fd, file);
@@ -771,114 +825,31 @@ eexit_1:
 }
 
 
-static int ep_alloc_pages(char **pages, int numpages)
+static int ep_alloc(struct eventpoll **pep)
 {
- int i;
-
- for (i = 0; i < numpages; i++) {
- pages[i] = (char *) __get_free_pages(GFP_KERNEL, 0);
- if (!pages[i]) {
- for (--i; i >= 0; i--) {
- ClearPageReserved(virt_to_page(pages[i]));
- free_pages((unsigned long) pages[i], 0);
- }
- return -ENOMEM;
- }
- SetPageReserved(virt_to_page(pages[i]));
- }
- return 0;
-}
-
-
-static int ep_free_pages(char **pages, int numpages)
-{
- int i;
-
- for (i = 0; i < numpages; i++) {
- ClearPageReserved(virt_to_page(pages[i]));
- free_pages((unsigned long) pages[i], 0);
- }
- return 0;
-}
-
+ struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL);
 
-static int ep_file_init(struct file *file, unsigned int hashbits)
-{
- int error;
- struct eventpoll *ep;
-
- if (!(ep = kmalloc(sizeof(struct eventpoll), GFP_KERNEL)))
+ if (!ep)
 return -ENOMEM;
 
- memset(ep, 0, sizeof(*ep));
-
- error = ep_init(ep, hashbits);
- if (error) {
- kfree(ep);
- return error;
- }
-
- file->private_data = ep;
-
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_file_init() ep=%p\n",
- current, ep));
- return 0;
-}
-
-
-/*
- * Calculate the index of the hash relative to "file".
- */
-static unsigned int ep_hash_index(struct eventpoll *ep, struct file *file, int fd)
-{
- unsigned long ptr = (unsigned long) file ^ (fd << ep->hashbits);
-
- return (unsigned int) hash_ptr((void *) ptr, ep->hashbits);
-}
-
-
-/*
- * Returns the hash entry ( struct list_head * ) of the passed index.
- */
-static struct list_head *ep_hash_entry(struct eventpoll *ep, unsigned int index)
-{
-
- return (struct list_head *) (ep->hpages[index / EP_HENTRY_X_PAGE] +
- (index % EP_HENTRY_X_PAGE) * sizeof(struct list_head));
-}
-
-
-static int ep_init(struct eventpoll *ep, unsigned int hashbits)
-{
- int error;
- unsigned int i, hsize;
-
 rwlock_init(&ep->lock);
 init_rwsem(&ep->sem);
 init_waitqueue_head(&ep->wq);
 init_waitqueue_head(&ep->poll_wait);
 INIT_LIST_HEAD(&ep->rdllist);
+ ep->rbr = RB_ROOT;
 
- /* Hash allocation and setup */
- ep->hashbits = hashbits;
- error = ep_alloc_pages(ep->hpages, EP_HASH_PAGES(ep->hashbits));
- if (error)
- goto eexit_1;
-
- /* Initialize hash buckets */
- for (i = 0, hsize = 1 << hashbits; i < hsize; i++)
- INIT_LIST_HEAD(ep_hash_entry(ep, i));
+ *pep = ep;
 
+ DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
+ current, ep));
 return 0;
-
-eexit_1:
- return error;
}
 
 
static void ep_free(struct eventpoll *ep)
{
- unsigned int i, hsize;
- struct list_head *lsthead, *lnk;
+ struct rb_node *rbp;
 struct epitem *epi;
 
 /* We need to release all tasks waiting for this file */
@@ -891,21 +862,17 @@ static void ep_free(struct eventpoll *ep)
 * We do not need to hold "ep->sem" here because the epoll file
 * is on the way to be removed and no one has references to it
 * anymore. The only hit might come from eventpoll_release_file() but
- * holding "epsem" is sufficient here.
+ * holding "epmutex" is sufficient here.
 */
- down(&epsem);
+ mutex_lock(&epmutex);
 
 /*
- * Walks through the whole hash by unregistering poll callbacks.
+ * Walks through the whole tree by unregistering poll callbacks.
 */
- for (i = 0, hsize = 1 << ep->hashbits; i < hsize; i++) {
- lsthead = ep_hash_entry(ep, i);
-
- list_for_each(lnk, lsthead) {
- epi = list_entry(lnk, struct epitem, llink);
+ for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+ epi = rb_entry(rbp, struct epitem, rbn);
 
- ep_unregister_pollwait(ep, epi);
- }
+ ep_unregister_pollwait(ep, epi);
 }
 
 /*
@@ -914,20 +881,12 @@ static void ep_free(struct eventpoll *ep)
 * write-holding "sem" we can be sure that no file cleanup code will hit
 * us during this operation. So we can avoid the lock on "ep->lock".
 */
- for (i = 0, hsize = 1 << ep->hashbits; i < hsize; i++) {
- lsthead = ep_hash_entry(ep, i);
-
- while (!list_empty(lsthead)) {
- epi = list_entry(lsthead->next, struct epitem, llink);
-
- ep_remove(ep, epi);
- }
+ while ((rbp = rb_first(&ep->rbr)) != 0) {
+ epi = rb_entry(rbp, struct epitem, rbn);
+ ep_remove(ep, epi);
 }
 
- up(&epsem);
-
- /* Free hash pages */
- ep_free_pages(ep->hpages, EP_HASH_PAGES(ep->hashbits));
+ mutex_unlock(&epmutex);
}
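Both loops in ep_free() matter: the first may iterate with rb_next() because it only unregisters callbacks, while the second must re-fetch rb_first() on every pass because ep_remove() erases the very node the iterator stands on. A user-space sketch of that take-the-first-and-remove idiom, on a plain list rather than an rb-tree (illustrative, not kernel code):

#include <stdio.h>
#include <stdlib.h>

struct item { struct item *next; int fd; };

int main(void)
{
        struct item *head = NULL, *it;

        for (int fd = 0; fd < 3; fd++) {
                it = malloc(sizeof(*it));
                it->fd = fd;
                it->next = head;
                head = it;
        }

        /* Mirror of: while ((rbp = rb_first(&ep->rbr)) != 0) ep_remove(...);
         * always re-fetch the first item, since removal invalidates it. */
        while ((it = head) != NULL) {
                head = it->next;
                printf("removing fd %d\n", it->fd);
                free(it);
        }
        return 0;
}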
 
 
@@ -938,29 +897,33 @@ static void ep_free(struct eventpoll *ep)
 */
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
+ int kcmp;
 unsigned long flags;
- struct list_head *lsthead, *lnk;
- struct epitem *epi = NULL;
+ struct rb_node *rbp;
+ struct epitem *epi, *epir = NULL;
+ struct epoll_filefd ffd;
 
+ ep_set_ffd(&ffd, file, fd);
 read_lock_irqsave(&ep->lock, flags);
-
- lsthead = ep_hash_entry(ep, ep_hash_index(ep, file, fd));
- list_for_each(lnk, lsthead) {
- epi = list_entry(lnk, struct epitem, llink);
-
- if (epi->file == file && epi->fd == fd) {
+ for (rbp = ep->rbr.rb_node; rbp; ) {
+ epi = rb_entry(rbp, struct epitem, rbn);
+ kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
+ if (kcmp > 0)
+ rbp = rbp->rb_right;
+ else if (kcmp < 0)
+ rbp = rbp->rb_left;
+ else {
 ep_use_epitem(epi);
+ epir = epi;
 break;
 }
- epi = NULL;
 }
-
 read_unlock_irqrestore(&ep->lock, flags);
 
 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n",
- current, file, epi));
+ current, file, epir));
 
- return epi;
+ return epir;
}
 
 
@@ -984,7 +947,7 @@ static void ep_release_epitem(struct epitem *epi)
{
 
 if (atomic_dec_and_test(&epi->usecnt))
- EPI_MEM_FREE(epi);
+ kmem_cache_free(epi_cache, epi);
}
 
 
@@ -995,10 +958,10 @@ static void ep_release_epitem(struct epitem *epi)
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
 poll_table *pt)
{
- struct epitem *epi = EP_ITEM_FROM_EPQUEUE(pt);
+ struct epitem *epi = ep_item_from_epqueue(pt);
 struct eppoll_entry *pwq;
 
- if (epi->nwait >= 0 && (pwq = PWQ_MEM_ALLOC())) {
+ if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
 init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
 pwq->whead = whead;
 pwq->base = epi;
@@ -1012,6 +975,26 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
}
 
 
+static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
+{
+ int kcmp;
+ struct rb_node **p = &ep->rbr.rb_node, *parent = NULL;
+ struct epitem *epic;
+
+ while (*p) {
+ parent = *p;
+ epic = rb_entry(parent, struct epitem, rbn);
+ kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
+ if (kcmp > 0)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&epi->rbn, parent, p);
+ rb_insert_color(&epi->rbn, &ep->rbr);
+}
+
+
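ep_find() and ep_rbtree_insert() are the two halves of the same descent: walk left or right by the key comparison until a match or a NULL link. A user-space sketch of that walk on a plain, unbalanced binary tree (the kernel gets balance from rb_insert_color(); frees omitted for brevity):

#include <stdio.h>
#include <stdlib.h>

struct tnode { struct tnode *l, *r; int key; };

/* Walk down to the link where `key` lives (or would live) -- the same
 * loop shape as ep_find() and ep_rbtree_insert(). */
static struct tnode **find_link(struct tnode **root, int key)
{
        struct tnode **p = root;

        while (*p) {
                if (key > (*p)->key)
                        p = &(*p)->r;
                else if (key < (*p)->key)
                        p = &(*p)->l;
                else
                        break;  /* duplicate: ep_find() returns it, insert refuses */
        }
        return p;
}

static void insert(struct tnode **root, int key)
{
        struct tnode **p = find_link(root, key);

        if (*p)
                return;         /* already present */
        *p = calloc(1, sizeof(**p));
        (*p)->key = key;
}

int main(void)
{
        struct tnode *root = NULL;

        insert(&root, 7); insert(&root, 3); insert(&root, 9);
        printf("3 %s\n", *find_link(&root, 3) ? "found" : "missing");
        printf("5 %s\n", *find_link(&root, 5) ? "found" : "missing");
        return 0;
}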
 static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 struct file *tfile, int fd)
{
@@ -1021,18 +1004,17 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 struct ep_pqueue epq;
 
 error = -ENOMEM;
- if (!(epi = EPI_MEM_ALLOC()))
+ if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
 goto eexit_1;
 
 /* Item initialization follows here ... */
- INIT_LIST_HEAD(&epi->llink);
+ ep_rb_initnode(&epi->rbn);
 INIT_LIST_HEAD(&epi->rdllink);
 INIT_LIST_HEAD(&epi->fllink);
 INIT_LIST_HEAD(&epi->txlink);
 INIT_LIST_HEAD(&epi->pwqlist);
 epi->ep = ep;
- epi->file = tfile;
- epi->fd = fd;
+ ep_set_ffd(&epi->ffd, tfile, fd);
 epi->event = *event;
 atomic_set(&epi->usecnt, 1);
 epi->nwait = 0;
@@ -1064,16 +1046,16 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 /* We have to drop the new item inside our item list to keep track of it */
 write_lock_irqsave(&ep->lock, flags);
 
- /* Add the current item to the hash table */
- list_add(&epi->llink, ep_hash_entry(ep, ep_hash_index(ep, tfile, fd)));
+ /* Add the current item to the rb-tree */
+ ep_rbtree_insert(ep, epi);
 
 /* If the file is already "ready" we drop it inside the ready list */
- if ((revents & event->events) && !EP_IS_LINKED(&epi->rdllink)) {
+ if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
 list_add_tail(&epi->rdllink, &ep->rdllist);
 
 /* Notify waiting tasks that events are available */
 if (waitqueue_active(&ep->wq))
- wake_up(&ep->wq);
+ __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE);
 if (waitqueue_active(&ep->poll_wait))
 pwake++;
 }
@@ -1097,11 +1079,11 @@ eexit_2:
 * allocated wait queue.
 */
 write_lock_irqsave(&ep->lock, flags);
- if (EP_IS_LINKED(&epi->rdllink))
- EP_LIST_DEL(&epi->rdllink);
+ if (ep_is_linked(&epi->rdllink))
+ ep_list_del(&epi->rdllink);
 write_unlock_irqrestore(&ep->lock, flags);
 
- EPI_MEM_FREE(epi);
+ kmem_cache_free(epi_cache, epi);
 
eexit_1:
 return error;
}
@@ -1129,7 +1111,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 * Get current event bits. We can safely use the file* here because
 * its usage count has been increased by the caller of this function.
 */
- revents = epi->file->f_op->poll(epi->file, NULL);
+ revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
 
 write_lock_irqsave(&ep->lock, flags);
 
@@ -1140,19 +1122,20 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 * If the item is not linked to the hash it means that it's on its
 * way toward removal. Do nothing in this case.
 */
- if (EP_IS_LINKED(&epi->llink)) {
+ if (ep_rb_linked(&epi->rbn)) {
 /*
 * If the item is "hot" and it is not registered inside the ready
 * list, push it inside. If the item is not "hot" and it is currently
 * registered inside the ready list, unlink it.
 */
 if (revents & event->events) {
- if (!EP_IS_LINKED(&epi->rdllink)) {
+ if (!ep_is_linked(&epi->rdllink)) {
 list_add_tail(&epi->rdllink, &ep->rdllist);
 
 /* Notify waiting tasks that events are available */
 if (waitqueue_active(&ep->wq))
- wake_up(&ep->wq);
+ __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+ TASK_INTERRUPTIBLE);
 if (waitqueue_active(&ep->poll_wait))
 pwake++;
 }
@@ -1187,9 +1170,9 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
 while (!list_empty(lsthead)) {
 pwq = list_entry(lsthead->next, struct eppoll_entry, llink);
 
- EP_LIST_DEL(&pwq->llink);
+ ep_list_del(&pwq->llink);
 remove_wait_queue(pwq->whead, &pwq->wait);
- PWQ_MEM_FREE(pwq);
+ kmem_cache_free(pwq_cache, pwq);
 }
}
 
@@ -1208,7 +1191,7 @@ static int ep_unlink(struct eventpoll *ep, struct epitem *epi)
 * The check protects us from doing a double unlink ( crash ).
 */
 error = -ENOENT;
- if (!EP_IS_LINKED(&epi->llink))
+ if (!ep_rb_linked(&epi->rbn))
 goto eexit_1;
 
 /*
@@ -1219,24 +1202,24 @@ static int ep_unlink(struct eventpoll *ep, struct epitem *epi)
 epi->event.events = 0;
 
 /*
- * At this point it is safe to do the job: unlink the item from our list.
+ * At this point it is safe to do the job: unlink the item from our rb-tree.
 * This operation together with the above check closes the door to
 * double unlinks.
 */
- EP_LIST_DEL(&epi->llink);
+ ep_rb_erase(&epi->rbn, &ep->rbr);
 
 /*
 * If the item we are going to remove is inside the ready file descriptors
 * we want to remove it from this list to avoid stale events.
 */
- if (EP_IS_LINKED(&epi->rdllink))
- EP_LIST_DEL(&epi->rdllink);
+ if (ep_is_linked(&epi->rdllink))
+ ep_list_del(&epi->rdllink);
 
 error = 0;
eexit_1:
 
 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_unlink(%p, %p) = %d\n",
- current, ep, epi->file, error));
+ current, ep, epi->ffd.file, error));
 
 return error;
}
 
@@ -1250,7 +1233,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
{
 int error;
 unsigned long flags;
- struct file *file = epi->file;
+ struct file *file = epi->ffd.file;
 
 /*
 * Removes poll wait queue hooks. We _have_ to do this without holding
@@ -1264,8 +1247,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 
 /* Remove the current item from the list of epoll hooks */
 spin_lock(&file->f_ep_lock);
- if (EP_IS_LINKED(&epi->fllink))
- EP_LIST_DEL(&epi->fllink);
+ if (ep_is_linked(&epi->fllink))
+ ep_list_del(&epi->fllink);
 spin_unlock(&file->f_ep_lock);
 
 /* We need to acquire the write IRQ lock before calling ep_unlink() */
@@ -1296,15 +1279,15 @@ eexit_1:
 * mechanism. It is called by the stored file descriptors when they
 * have events to report.
 */
-static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync)
+static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
 int pwake = 0;
 unsigned long flags;
- struct epitem *epi = EP_ITEM_FROM_WAIT(wait);
+ struct epitem *epi = ep_item_from_wait(wait);
 struct eventpoll *ep = epi->ep;
 
 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
- current, epi->file, epi, ep));
+ current, epi->ffd.file, epi, ep));
 
 write_lock_irqsave(&ep->lock, flags);
 
@@ -1318,7 +1301,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync)
 goto is_disabled;
 
 /* If this file is already in the ready list we exit soon */
- if (EP_IS_LINKED(&epi->rdllink))
+ if (ep_is_linked(&epi->rdllink))
 goto is_linked;
 
 list_add_tail(&epi->rdllink, &ep->rdllist);
@@ -1329,7 +1312,8 @@ is_linked:
 * wait list.
 */
 if (waitqueue_active(&ep->wq))
- wake_up(&ep->wq);
+ __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+ TASK_INTERRUPTIBLE);
 if (waitqueue_active(&ep->poll_wait))
 pwake++;
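ep_poll_callback() is the producer half of the wakeup pattern: link the item on the ready list under the lock, then wake the sleeper; ep_poll() is the consumer half. A user-space analogue using a condition variable (an analogy only, not the kernel wait-queue API; build with -lpthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wq = PTHREAD_COND_INITIALIZER;
static int nready;                       /* stands in for ep->rdllist length */

static void *poll_callback(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);       /* write_lock_irqsave(&ep->lock, ...) */
        nready++;                        /* list_add_tail(&epi->rdllink, ...) */
        pthread_cond_signal(&wq);        /* wake the task sleeping in ep_poll() */
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, poll_callback, NULL);

        pthread_mutex_lock(&lock);
        while (nready == 0)              /* the for (;;) sleep loop in ep_poll() */
                pthread_cond_wait(&wq, &lock);
        printf("%d event(s) ready\n", nready);
        pthread_mutex_unlock(&lock);

        pthread_join(t, NULL);
        return 0;
}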
@@ -1397,7 +1381,7 @@ static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist
 lnk = lnk->next;
 
 /* If this file is already in the ready list we exit soon */
- if (!EP_IS_LINKED(&epi->txlink)) {
+ if (!ep_is_linked(&epi->txlink)) {
 /*
 * This is initialized in this way so that the default
 * behaviour of the reinjecting code will be to push back
@@ -1412,7 +1396,7 @@ static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist
 /*
 * Unlink the item from the ready list.
 */
- EP_LIST_DEL(&epi->rdllink);
+ ep_list_del(&epi->rdllink);
 }
 }
 
@@ -1430,11 +1414,10 @@ static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist
static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
 struct epoll_event __user *events)
{
- int eventcnt = 0, eventbuf = 0;
+ int eventcnt = 0;
 unsigned int revents;
 struct list_head *lnk;
 struct epitem *epi;
- struct epoll_event event[EP_MAX_BUF_EVENTS];
 
 /*
 * We can loop without lock because this is a task private list.
@@ -1450,7 +1433,7 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
 * because we are holding the "sem" in read and this will
 * guarantee that both the file and the item will not vanish.
 */
- revents = epi->file->f_op->poll(epi->file, NULL);
+ revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
 
 /*
 * Set the return event set for the current file descriptor.
@@ -1460,28 +1443,16 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
 epi->revents = revents & epi->event.events;
 
 if (epi->revents) {
- event[eventbuf] = epi->event;
- event[eventbuf].events &= revents;
- eventbuf++;
- if (eventbuf == EP_MAX_BUF_EVENTS) {
- if (__copy_to_user(&events[eventcnt], event,
- eventbuf * sizeof(struct epoll_event)))
- return -EFAULT;
- eventcnt += eventbuf;
- eventbuf = 0;
- }
+ if (__put_user(epi->revents,
+ &events[eventcnt].events) ||
+ __put_user(epi->event.data,
+ &events[eventcnt].data))
+ return -EFAULT;
 if (epi->event.events & EPOLLONESHOT)
 epi->event.events &= EP_PRIVATE_BITS;
+ eventcnt++;
 }
 }
-
- if (eventbuf) {
- if (__copy_to_user(&events[eventcnt], event,
- eventbuf * sizeof(struct epoll_event)))
- return -EFAULT;
- eventcnt += eventbuf;
- }
-
 return eventcnt;
}
 
@@ -1504,7 +1475,7 @@ static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
 epi = list_entry(txlist->next, struct epitem, txlink);
 
 /* Unlink the current item from the transfer list */
- EP_LIST_DEL(&epi->txlink);
+ ep_list_del(&epi->txlink);
 
 /*
 * If the item is no longer linked to the interest set, we don't
 * have to push it inside the ready list either. Also, if the
 * item is set to have an Edge Triggered behaviour, we don't have
 * to push it back either.
 */
- if (EP_IS_LINKED(&epi->llink) && !(epi->event.events & EPOLLET) &&
- (epi->revents & epi->event.events) && !EP_IS_LINKED(&epi->rdllink)) {
+ if (ep_rb_linked(&epi->rbn) && !(epi->event.events & EPOLLET) &&
+ (epi->revents & epi->event.events) && !ep_is_linked(&epi->rdllink)) {
 list_add_tail(&epi->rdllink, &ep->rdllist);
 ricnt++;
 }
@@ -1526,7 +1497,8 @@ static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
 * wait list.
 */
 if (waitqueue_active(&ep->wq))
- wake_up(&ep->wq);
+ __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+ TASK_INTERRUPTIBLE);
 if (waitqueue_active(&ep->poll_wait))
 pwake++;
 }
@@ -1584,8 +1556,8 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 * and the overflow condition. The passed timeout is in milliseconds,
 * that's why (t * HZ) / 1000.
 */
- jtimeout = timeout == -1 || timeout > (MAX_SCHEDULE_TIMEOUT - 1000) / HZ ?
- MAX_SCHEDULE_TIMEOUT: (timeout * HZ + 999) / 1000;
+ jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
+ MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;
 
retry:
 write_lock_irqsave(&ep->lock, flags);
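The reinjection logic above is what gives level-triggered descriptors their behaviour: an undrained fd goes back on the ready list, while an EPOLLET one does not. A user-space demonstration (expected output noted in comments, assuming nothing else touches the pipe):

#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
        int p[2];
        int epfd = epoll_create(1);
        struct epoll_event ev = { 0 }, out;

        pipe(p);
        ev.events = EPOLLIN;            /* add | EPOLLET to see 1 then 0 */
        ev.data.fd = p[0];
        epoll_ctl(epfd, EPOLL_CTL_ADD, p[0], &ev);

        write(p[1], "x", 1);            /* make the read end ready */

        for (int i = 0; i < 2; i++) {
                int n = epoll_wait(epfd, &out, 1, 0);
                /* Level-triggered with unread data: prints 1 both times. */
                printf("pass %d: %d event(s)\n", i, n);
        }
        return 0;
}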
@@ -1598,7 +1570,7 @@ retry:
 * ep_poll_callback() when events will become available.
 */
 init_waitqueue_entry(&wait, current);
- add_wait_queue(&ep->wq, &wait);
+ __add_wait_queue(&ep->wq, &wait);
 
 for (;;) {
 /*
@@ -1618,7 +1590,7 @@ retry:
 jtimeout = schedule_timeout(jtimeout);
 write_lock_irqsave(&ep->lock, flags);
 }
- remove_wait_queue(&ep->wq, &wait);
+ __remove_wait_queue(&ep->wq, &wait);
 
 set_current_state(TASK_RUNNING);
 }
@@ -1669,7 +1641,6 @@ static struct inode *ep_eventpoll_inode(void)
 inode->i_uid = current->fsuid;
 inode->i_gid = current->fsgid;
 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- inode->i_blksize = PAGE_SIZE;
 
 return inode;
 
eexit_1:
@@ -1677,11 +1648,12 @@ eexit_1:
}
 
 
-static struct super_block *
+static int
eventpollfs_get_sb(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data)
+ const char *dev_name, void *data, struct vfsmount *mnt)
{
- return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC);
+ return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC,
+ mnt);
}
 
 
@@ -1689,28 +1661,20 @@
static int __init eventpoll_init(void)
{
 int error;
 
- init_MUTEX(&epsem);
+ mutex_init(&epmutex);
 
 /* Initialize the structure used to perform safe poll wait head wake ups */
 ep_poll_safewake_init(&psw);
 
 /* Allocates slab cache used to allocate "struct epitem" items */
- error = -ENOMEM;
- epi_cache = kmem_cache_create("eventpoll_epi",
- sizeof(struct epitem),
- 0,
- SLAB_HWCACHE_ALIGN | EPI_SLAB_DEBUG, NULL, NULL);
- if (!epi_cache)
- goto eexit_1;
+ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
+ 0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC,
+ NULL, NULL);
 
 /* Allocates slab cache used to allocate "struct eppoll_entry" */
- error = -ENOMEM;
 pwq_cache = kmem_cache_create("eventpoll_pwq",
- sizeof(struct eppoll_entry),
- 0,
- EPI_SLAB_DEBUG, NULL, NULL);
- if (!pwq_cache)
- goto eexit_2;
+ sizeof(struct eppoll_entry), 0,
+ EPI_SLAB_DEBUG|SLAB_PANIC, NULL, NULL);
 
 /*
 * Register the virtual file system that will be the source of inodes
@@ -1718,27 +1682,20 @@ static int __init eventpoll_init(void)
 */
 error = register_filesystem(&eventpoll_fs_type);
 if (error)
- goto eexit_3;
+ goto epanic;
 
 /* Mount the above commented virtual file system */
 eventpoll_mnt = kern_mount(&eventpoll_fs_type);
 error = PTR_ERR(eventpoll_mnt);
 if (IS_ERR(eventpoll_mnt))
- goto eexit_4;
-
- DNPRINTK(3, (KERN_INFO "[%p] eventpoll: successfully initialized.\n", current));
+ goto epanic;
 
+ DNPRINTK(3, (KERN_INFO "[%p] eventpoll: successfully initialized.\n",
+ current));
 return 0;
 
-eexit_4:
- unregister_filesystem(&eventpoll_fs_type);
-eexit_3:
- kmem_cache_destroy(pwq_cache);
-eexit_2:
- kmem_cache_destroy(epi_cache);
-eexit_1:
-
- return error;
+epanic:
+ panic("eventpoll_init() failed\n");
}
 
 
@@ -1755,4 +1712,3 @@ module_init(eventpoll_init);
module_exit(eventpoll_exit);
 
MODULE_LICENSE("GPL");
-