X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=fs%2Feventpoll.c;h=c81ddcd31abae1beb51672f7405230470bffa6d7;hb=9bf4aaab3e101692164d49b7ca357651eb691cb6;hp=bc30a3cdd168ec5133d094c1d0348c6bac9cabd9;hpb=db216c3d5e4c040e557a50f8f5d35d5c415e8c1c;p=linux-2.6.git
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index bc30a3cdd..c81ddcd31 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include <linux/rbtree.h>
 #include 
 #include 
 #include 
@@ -100,22 +101,6 @@
 /* Maximum number of poll wake up nests we are allowing */
 #define EP_MAX_POLLWAKE_NESTS 4
 
-/* Maximum size of the hash in bits ( 2^N ) */
-#define EP_MAX_HASH_BITS 17
-
-/* Minimum size of the hash in bits ( 2^N ) */
-#define EP_MIN_HASH_BITS 9
-
-/* Number of hash entries ( "struct list_head" ) inside a page */
-#define EP_HENTRY_X_PAGE (PAGE_SIZE / sizeof(struct list_head))
-
-/* Maximum size of the hash in pages */
-#define EP_MAX_HPAGES ((1 << EP_MAX_HASH_BITS) / EP_HENTRY_X_PAGE + 1)
-
-/* Number of pages allocated for an "hbits" sized hash table */
-#define EP_HASH_PAGES(hbits) ((int) ((1 << (hbits)) / EP_HENTRY_X_PAGE + \
-				     ((1 << (hbits)) % EP_HENTRY_X_PAGE ? 1: 0)))
-
 /* Macro to allocate a "struct epitem" from the slab cache */
 #define EPI_MEM_ALLOC() (struct epitem *) kmem_cache_alloc(epi_cache, SLAB_KERNEL)
 
@@ -131,6 +116,22 @@
 /* Fast test to see if the file is an evenpoll file */
 #define IS_FILE_EPOLL(f) ((f)->f_op == &eventpoll_fops)
 
+/* Setup the structure that is used as key for the rb-tree */
+#define EP_SET_FFD(p, f, d) do { (p)->file = (f); (p)->fd = (d); } while (0)
+
+/* Compare rb-tree keys */
+#define EP_CMP_FFD(p1, p2) ((p1)->file > (p2)->file ? +1: \
+			    ((p1)->file < (p2)->file ? -1: (p1)->fd - (p2)->fd))
+
+/* Special initialization for the rb-tree node to detect linkage */
+#define EP_RB_INITNODE(n) (n)->rb_parent = (n)
+
+/* Removes a node from the rb-tree and marks it for a fast is-linked check */
+#define EP_RB_ERASE(n, r) do { rb_erase(n, r); (n)->rb_parent = (n); } while (0)
+
+/* Fast check to verify that the item is linked to the main rb-tree */
+#define EP_RB_LINKED(n) ((n)->rb_parent != (n))
+
 /*
  * Remove the item from the list and perform its initialization.
  * This is useful for us because we can test if the item is linked
@@ -147,6 +148,12 @@
 /* Get the "struct epitem" from an epoll queue wrapper */
 #define EP_ITEM_FROM_EPQUEUE(p) (container_of(p, struct ep_pqueue, pt)->epi)
 
+
+struct epoll_filefd {
+	struct file *file;
+	int fd;
+};
+
 /*
  * Node that is linked into the "wake_task_list" member of the "struct poll_safewake".
  * It is used to keep track on all tasks that are currently inside the wake_up() code
@@ -195,11 +202,8 @@ struct eventpoll {
 	/* List of ready file descriptors */
 	struct list_head rdllist;
 
-	/* Size of the hash */
-	unsigned int hashbits;
-
-	/* Pages for the "struct epitem" hash */
-	char *hpages[EP_MAX_HPAGES];
+	/* RB-Tree root used to store monitored fd structs */
+	struct rb_root rbr;
 };
 
 /* Wait structure used by the poll hooks */
@@ -225,14 +229,14 @@ struct eppoll_entry {
  * have an entry of this type linked to the hash.
  */
 struct epitem {
-	/* List header used to link this structure to the eventpoll hash */
-	struct list_head llink;
+	/* RB-Tree node used to link this structure to the eventpoll rb-tree */
+	struct rb_node rbn;
 
 	/* List header used to link this structure to the eventpoll ready list */
 	struct list_head rdllink;
 
-	/* The file descriptor this item refers to */
-	int fd;
+	/* The file descriptor information this item refers to */
+	struct epoll_filefd ffd;
 
 	/* Number of active wait queue attached to poll operations */
 	int nwait;
@@ -243,9 +247,6 @@ struct epitem {
 	/* The "container" of this item */
 	struct eventpoll *ep;
 
-	/* The file this item refers to */
-	struct file *file;
-
 	/* The structure that describe the interested events and the source fd */
 	struct epoll_event event;
 
@@ -278,22 +279,15 @@ struct ep_pqueue {
 
 static void ep_poll_safewake_init(struct poll_safewake *psw);
 static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq);
-static unsigned int ep_get_hash_bits(unsigned int hintsize);
 static int ep_getfd(int *efd, struct inode **einode, struct file **efile);
-static int ep_alloc_pages(char **pages, int numpages);
-static int ep_free_pages(char **pages, int numpages);
-static int ep_file_init(struct file *file, unsigned int hashbits);
-static unsigned int ep_hash_index(struct eventpoll *ep, struct file *file,
-				  int fd);
-static struct list_head *ep_hash_entry(struct eventpoll *ep,
-				       unsigned int index);
-static int ep_init(struct eventpoll *ep, unsigned int hashbits);
+static int ep_file_init(struct file *file);
 static void ep_free(struct eventpoll *ep);
 static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
 static void ep_use_epitem(struct epitem *epi);
 static void ep_release_epitem(struct epitem *epi);
 static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
 				 poll_table *pt);
+static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi);
 static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 		     struct file *tfile, int fd);
 static int ep_modify(struct eventpoll *ep, struct epitem *epi,
@@ -424,19 +418,6 @@ static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
 }
 
 
-/*
- * Calculate the size of the hash in bits. The returned size will be
- * bounded between EP_MIN_HASH_BITS and EP_MAX_HASH_BITS.
- */
-static unsigned int ep_get_hash_bits(unsigned int hintsize)
-{
-	unsigned int i, val;
-
-	for (i = 0, val = 1; val < hintsize && i < EP_MAX_HASH_BITS; i++, val <<= 1);
-	return i < EP_MIN_HASH_BITS ? EP_MIN_HASH_BITS: i;
-}
-
-
 /* Used to initialize the epoll bits inside the "struct file" */
 void eventpoll_init_file(struct file *file)
 {
@@ -492,7 +473,6 @@ void eventpoll_release_file(struct file *file)
 asmlinkage long sys_epoll_create(int size)
 {
 	int error, fd;
-	unsigned int hashbits;
 	struct inode *inode;
 	struct file *file;
 
@@ -504,9 +484,6 @@ asmlinkage long sys_epoll_create(int size)
 	if (size <= 0)
 		goto eexit_1;
 
-	/* Correctly size the hash */
-	hashbits = ep_get_hash_bits((unsigned int) size);
-
 	/*
 	 * Creates all the items needed to setup an eventpoll file. That is,
 	 * a file structure, and inode and a free file descriptor.
@@ -516,7 +493,7 @@
 		goto eexit_1;
 
 	/* Setup the file internal data structure ( "struct eventpoll" ) */
-	error = ep_file_init(file, hashbits);
+	error = ep_file_init(file);
 	if (error)
 		goto eexit_2;
 
@@ -739,7 +716,7 @@ static int ep_getfd(int *efd, struct inode **einode, struct file **efile)
 	dentry->d_op = &eventpollfs_dentry_operations;
 	d_add(dentry, inode);
 	file->f_vfsmnt = mntget(eventpoll_mnt);
-	file->f_dentry = dget(dentry);
+	file->f_dentry = dentry;
 	file->f_mapping = inode->i_mapping;
 	file->f_pos = 0;
 
@@ -768,114 +745,32 @@ eexit_1:
 }
 
 
-static int ep_alloc_pages(char **pages, int numpages)
+static int ep_file_init(struct file *file)
 {
-	int i;
-
-	for (i = 0; i < numpages; i++) {
-		pages[i] = (char *) __get_free_pages(GFP_KERNEL, 0);
-		if (!pages[i]) {
-			for (--i; i >= 0; i--) {
-				ClearPageReserved(virt_to_page(pages[i]));
-				free_pages((unsigned long) pages[i], 0);
-			}
-			return -ENOMEM;
-		}
-		SetPageReserved(virt_to_page(pages[i]));
-	}
-	return 0;
-}
-
-
-static int ep_free_pages(char **pages, int numpages)
-{
-	int i;
-
-	for (i = 0; i < numpages; i++) {
-		ClearPageReserved(virt_to_page(pages[i]));
-		free_pages((unsigned long) pages[i], 0);
-	}
-	return 0;
-}
-
-
-static int ep_file_init(struct file *file, unsigned int hashbits)
-{
-	int error;
 	struct eventpoll *ep;
 
 	if (!(ep = kmalloc(sizeof(struct eventpoll), GFP_KERNEL)))
 		return -ENOMEM;
 
 	memset(ep, 0, sizeof(*ep));
-
-	error = ep_init(ep, hashbits);
-	if (error) {
-		kfree(ep);
-		return error;
-	}
-
-	file->private_data = ep;
-
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_file_init() ep=%p\n",
-		     current, ep));
-	return 0;
-}
-
-
-/*
- * Calculate the index of the hash relative to "file".
- */
-static unsigned int ep_hash_index(struct eventpoll *ep, struct file *file, int fd)
-{
-	unsigned long ptr = (unsigned long) file ^ (fd << ep->hashbits);
-
-	return (unsigned int) hash_ptr((void *) ptr, ep->hashbits);
-}
-
-
-/*
- * Returns the hash entry ( struct list_head * ) of the passed index.
- */
-static struct list_head *ep_hash_entry(struct eventpoll *ep, unsigned int index)
-{
-
-	return (struct list_head *) (ep->hpages[index / EP_HENTRY_X_PAGE] +
-				     (index % EP_HENTRY_X_PAGE) * sizeof(struct list_head));
-}
-
-
-static int ep_init(struct eventpoll *ep, unsigned int hashbits)
-{
-	int error;
-	unsigned int i, hsize;
-
 	rwlock_init(&ep->lock);
 	init_rwsem(&ep->sem);
 	init_waitqueue_head(&ep->wq);
 	init_waitqueue_head(&ep->poll_wait);
 	INIT_LIST_HEAD(&ep->rdllist);
+	ep->rbr = RB_ROOT;
 
-	/* Hash allocation and setup */
-	ep->hashbits = hashbits;
-	error = ep_alloc_pages(ep->hpages, EP_HASH_PAGES(ep->hashbits));
-	if (error)
-		goto eexit_1;
-
-	/* Initialize hash buckets */
-	for (i = 0, hsize = 1 << hashbits; i < hsize; i++)
-		INIT_LIST_HEAD(ep_hash_entry(ep, i));
+	file->private_data = ep;
 
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_file_init() ep=%p\n",
+		     current, ep));
 	return 0;
-eexit_1:
-	return error;
 }
 
 
 static void ep_free(struct eventpoll *ep)
 {
-	unsigned int i, hsize;
-	struct list_head *lsthead, *lnk;
+	struct rb_node *rbp;
 	struct epitem *epi;
 
 	/* We need to release all tasks waiting for these file */
@@ -893,16 +788,12 @@ static void ep_free(struct eventpoll *ep)
 	down(&epsem);
 
 	/*
-	 * Walks through the whole hash by unregistering poll callbacks.
+	 * Walks through the whole tree by unregistering poll callbacks.
 	 */
-	for (i = 0, hsize = 1 << ep->hashbits; i < hsize; i++) {
-		lsthead = ep_hash_entry(ep, i);
+	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+		epi = rb_entry(rbp, struct epitem, rbn);
 
-		list_for_each(lnk, lsthead) {
-			epi = list_entry(lnk, struct epitem, llink);
-
-			ep_unregister_pollwait(ep, epi);
-		}
+		ep_unregister_pollwait(ep, epi);
 	}
 
 	/*
@@ -911,20 +802,12 @@ static void ep_free(struct eventpoll *ep)
 	 * write-holding "sem" we can be sure that no file cleanup code will hit
 	 * us during this operation. So we can avoid the lock on "ep->lock".
 	 */
-	for (i = 0, hsize = 1 << ep->hashbits; i < hsize; i++) {
-		lsthead = ep_hash_entry(ep, i);
-
-		while (!list_empty(lsthead)) {
-			epi = list_entry(lsthead->next, struct epitem, llink);
-
-			ep_remove(ep, epi);
-		}
+	while ((rbp = rb_first(&ep->rbr)) != 0) {
+		epi = rb_entry(rbp, struct epitem, rbn);
+		ep_remove(ep, epi);
 	}
 
 	up(&epsem);
-
-	/* Free hash pages */
-	ep_free_pages(ep->hpages, EP_HASH_PAGES(ep->hashbits));
 }
 
 
@@ -935,29 +818,33 @@ static void ep_free(struct eventpoll *ep)
  */
 static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
 {
+	int kcmp;
 	unsigned long flags;
-	struct list_head *lsthead, *lnk;
-	struct epitem *epi = NULL;
+	struct rb_node *rbp;
+	struct epitem *epi, *epir = NULL;
+	struct epoll_filefd ffd;
 
+	EP_SET_FFD(&ffd, file, fd);
 	read_lock_irqsave(&ep->lock, flags);
-
-	lsthead = ep_hash_entry(ep, ep_hash_index(ep, file, fd));
-	list_for_each(lnk, lsthead) {
-		epi = list_entry(lnk, struct epitem, llink);
-
-		if (epi->file == file && epi->fd == fd) {
+	for (rbp = ep->rbr.rb_node; rbp; ) {
+		epi = rb_entry(rbp, struct epitem, rbn);
+		kcmp = EP_CMP_FFD(&ffd, &epi->ffd);
+		if (kcmp > 0)
+			rbp = rbp->rb_right;
+		else if (kcmp < 0)
+			rbp = rbp->rb_left;
+		else {
 			ep_use_epitem(epi);
+			epir = epi;
 			break;
 		}
-		epi = NULL;
 	}
-
 	read_unlock_irqrestore(&ep->lock, flags);
 
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n",
-		     current, file, epi));
+		     current, file, epir));
 
-	return epi;
+	return epir;
 }
 
 
@@ -1009,6 +896,26 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
 }
 
 
+static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
+{
+	int kcmp;
+	struct rb_node **p = &ep->rbr.rb_node, *parent = NULL;
+	struct epitem *epic;
+
+	while (*p) {
+		parent = *p;
+		epic = rb_entry(parent, struct epitem, rbn);
+		kcmp = EP_CMP_FFD(&epi->ffd, &epic->ffd);
+		if (kcmp > 0)
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
+	}
+	rb_link_node(&epi->rbn, parent, p);
+	rb_insert_color(&epi->rbn, &ep->rbr);
+}
+
+
 static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 		     struct file *tfile, int fd)
 {
@@ -1022,14 +929,13 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 		goto eexit_1;
 
 	/* Item initialization follow here ... */
-	INIT_LIST_HEAD(&epi->llink);
+	EP_RB_INITNODE(&epi->rbn);
 	INIT_LIST_HEAD(&epi->rdllink);
 	INIT_LIST_HEAD(&epi->fllink);
 	INIT_LIST_HEAD(&epi->txlink);
 	INIT_LIST_HEAD(&epi->pwqlist);
 	epi->ep = ep;
-	epi->file = tfile;
-	epi->fd = fd;
+	EP_SET_FFD(&epi->ffd, tfile, fd);
 	epi->event = *event;
 	atomic_set(&epi->usecnt, 1);
 	epi->nwait = 0;
@@ -1061,8 +967,8 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	/* We have to drop the new item inside our item list to keep track of it */
 	write_lock_irqsave(&ep->lock, flags);
 
-	/* Add the current item to the hash table */
-	list_add(&epi->llink, ep_hash_entry(ep, ep_hash_index(ep, tfile, fd)));
+	/* Add the current item to the rb-tree */
+	ep_rbtree_insert(ep, epi);
 
 	/* If the file is already "ready" we drop it inside the ready list */
 	if ((revents & event->events) && !EP_IS_LINKED(&epi->rdllink)) {
@@ -1126,7 +1032,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	 * Get current event bits. We can safely use the file* here because
 	 * its usage count has been increased by the caller of this function.
 	 */
-	revents = epi->file->f_op->poll(epi->file, NULL);
+	revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
 
 	write_lock_irqsave(&ep->lock, flags);
 
@@ -1137,7 +1043,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	 * If the item is not linked to the hash it means that it's on its
 	 * way toward the removal. Do nothing in this case.
 	 */
-	if (EP_IS_LINKED(&epi->llink)) {
+	if (EP_RB_LINKED(&epi->rbn)) {
 		/*
 		 * If the item is "hot" and it is not registered inside the ready
 		 * list, push it inside. If the item is not "hot" and it is currently
@@ -1205,7 +1111,7 @@ static int ep_unlink(struct eventpoll *ep, struct epitem *epi)
 	 * The check protect us from doing a double unlink ( crash ).
 	 */
 	error = -ENOENT;
-	if (!EP_IS_LINKED(&epi->llink))
+	if (!EP_RB_LINKED(&epi->rbn))
 		goto eexit_1;
 
 	/*
@@ -1216,11 +1122,11 @@ static int ep_unlink(struct eventpoll *ep, struct epitem *epi)
 	epi->event.events = 0;
 
 	/*
-	 * At this point is safe to do the job, unlink the item from our list.
+	 * At this point is safe to do the job, unlink the item from our rb-tree.
	 * This operation togheter with the above check closes the door to
	 * double unlinks.
	 */
-	EP_LIST_DEL(&epi->llink);
+	EP_RB_ERASE(&epi->rbn, &ep->rbr);
 
 	/*
 	 * If the item we are going to remove is inside the ready file descriptors
@@ -1247,7 +1153,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 {
 	int error;
 	unsigned long flags;
-	struct file *file = epi->file;
+	struct file *file = epi->ffd.file;
 
 	/*
	 * Removes poll wait queue hooks. We _have_ to do this without holding
@@ -1446,7 +1352,7 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
		 * because we are holding the "sem" in read and this will
		 * guarantee that both the file and the item will not vanish.
		 */
-		revents = epi->file->f_op->poll(epi->file, NULL);
+		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
 
		/*
		 * Set the return event set for the current file descriptor.
@@ -1497,7 +1403,7 @@ static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
		 * item is set to have an Edge Triggered behaviour, we don't have
		 * to push it back either.
		 */
-		if (EP_IS_LINKED(&epi->llink) && !(epi->event.events & EPOLLET) &&
+		if (EP_RB_LINKED(&epi->rbn) && !(epi->event.events & EPOLLET) &&
		    (epi->revents & epi->event.events) && !EP_IS_LINKED(&epi->rdllink)) {
			list_add_tail(&epi->rdllink, &ep->rdllist);
			ricnt++;
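
For illustration only, not part of the patch above: the conversion keys each "struct epitem" by the (file pointer, fd) pair held in "struct epoll_filefd", orders the rb-tree with EP_CMP_FFD (file pointer first, then fd), and flags a detached node by pointing its rb_parent at itself (EP_RB_INITNODE / EP_RB_LINKED), so no separate "is linked" flag is needed. The stand-alone user-space sketch below mimics those two ideas with stand-in types; the names cmp_ffd, NODE_INIT and NODE_LINKED are hypothetical and are not kernel APIs.

/* Illustrative sketch only -- stand-in types, not the kernel's rb_node/struct file. */
#include <stdio.h>

struct filefd {
	const void *file;	/* stands in for "struct file *" */
	int fd;
};

/* Same ordering rule as EP_CMP_FFD: compare file pointers, then descriptors. */
static int cmp_ffd(const struct filefd *a, const struct filefd *b)
{
	if (a->file != b->file)
		return a->file > b->file ? 1 : -1;
	return a->fd - b->fd;
}

struct node {
	struct node *parent;	/* stands in for the rb_node parent link */
};

#define NODE_INIT(n)	((n)->parent = (n))	/* mark unlinked, like EP_RB_INITNODE */
#define NODE_LINKED(n)	((n)->parent != (n))	/* fast linkage test, like EP_RB_LINKED */

int main(void)
{
	int x, y;
	struct filefd a = { &x, 5 }, b = { &y, 5 }, c = { &x, 7 };
	struct node root = { 0 }, item;

	printf("a vs b: %d\n", cmp_ffd(&a, &b));	/* decided by the file pointers */
	printf("a vs c: %d\n", cmp_ffd(&a, &c));	/* same file, decided by fd: -2 */

	NODE_INIT(&item);
	printf("linked: %d\n", NODE_LINKED(&item));	/* 0: detached */
	item.parent = &root;				/* what rb_link_node() would set */
	printf("linked: %d\n", NODE_LINKED(&item));	/* 1: in a tree */
	return 0;
}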