X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=fs%2Fdcache.c;h=cbf76e692013281de5852d569896ac1625bd96fb;hb=353442f4fa3f17a2e386191ca2af8705408fcc60;hp=b90d3813fed7506e6a63419eb7d01dd63a07a3e5;hpb=bc77d24c47b89f1e0efed0b8e4be5f8aad102883;p=linux-2.6.git diff --git a/fs/dcache.c b/fs/dcache.c index b90d3813f..cbf76e692 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -14,10 +14,11 @@ * the dcache entry is deleted or garbage collected. */ -#include +#include #include #include #include +#include #include #include #include @@ -30,16 +31,18 @@ #include #include #include +#include -#define DCACHE_PARANOIA 1 -/* #define DCACHE_DEBUG 1 */ -spinlock_t dcache_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; -seqlock_t rename_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; +int sysctl_vfs_cache_pressure __read_mostly = 100; +EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); + + __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock); +static __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(dcache_lock); -static kmem_cache_t *dentry_cache; +static kmem_cache_t *dentry_cache __read_mostly; #define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname)) @@ -54,9 +57,9 @@ static kmem_cache_t *dentry_cache; #define D_HASHBITS d_hash_shift #define D_HASHMASK d_hash_mask -static unsigned int d_hash_mask; -static unsigned int d_hash_shift; -static struct hlist_head *dentry_hashtable; +static unsigned int d_hash_mask __read_mostly; +static unsigned int d_hash_shift __read_mostly; +static struct hlist_head *dentry_hashtable __read_mostly; static LIST_HEAD(dentry_unused); /* Statistics gathering. */ @@ -64,9 +67,9 @@ struct dentry_stat_t dentry_stat = { .age_limit = 45, }; -static void d_callback(void *arg) +static void d_callback(struct rcu_head *head) { - struct dentry * dentry = (struct dentry *)arg; + struct dentry * dentry = container_of(head, struct dentry, d_u.d_rcu); if (dname_external(dentry)) kfree(dentry->d_name.name); @@ -81,11 +84,11 @@ static void d_free(struct dentry *dentry) { if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); - if (dentry->d_extra_attributes) { - kfree(dentry->d_extra_attributes); - dentry->d_extra_attributes = NULL; - } - call_rcu(&dentry->d_rcu, d_callback, dentry); + if (dentry->d_extra_attributes) { + kfree(dentry->d_extra_attributes); + dentry->d_extra_attributes = NULL; + } + call_rcu(&dentry->d_u.d_rcu, d_callback); } /* @@ -93,7 +96,7 @@ static void d_free(struct dentry *dentry) * d_iput() operation if defined. * Called with dcache_lock and per dentry lock held, drops both. */ -static inline void dentry_iput(struct dentry * dentry) +static void dentry_iput(struct dentry * dentry) { struct inode *inode = dentry->d_inode; if (inode) { @@ -101,6 +104,8 @@ static inline void dentry_iput(struct dentry * dentry) list_del_init(&dentry->d_alias); spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); + if (!inode->i_nlink) + fsnotify_inoderemove(inode); if (dentry->d_op && dentry->d_op->d_iput) dentry->d_op->d_iput(dentry, inode); else @@ -146,6 +151,8 @@ void dput(struct dentry *dentry) return; repeat: + if (atomic_read(&dentry->d_count) == 1) + might_sleep(); if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock)) return; @@ -155,7 +162,7 @@ repeat: spin_unlock(&dcache_lock); return; } - + /* * AV: ->d_delete() is _NOT_ allowed to block now. */ @@ -188,7 +195,7 @@ kill_it: { list_del(&dentry->d_lru); dentry_stat.nr_unused--; } - list_del(&dentry->d_child); + list_del(&dentry->d_u.d_child); dentry_stat.nr_dentry--; /* For d_free, below */ /*drops the locks, at that point nobody can reach this dentry */ dentry_iput(dentry); @@ -278,22 +285,25 @@ struct dentry * dget_locked(struct dentry *dentry) /** * d_find_alias - grab a hashed alias of inode * @inode: inode in question + * @want_discon: flag, used by d_splice_alias, to request + * that only a DISCONNECTED alias be returned. * - * If inode has a hashed alias - acquire the reference to alias and - * return it. Otherwise return NULL. Notice that if inode is a directory - * there can be only one alias and it can be unhashed only if it has - * no children. + * If inode has a hashed alias, or is a directory and has any alias, + * acquire the reference to alias and return it. Otherwise return NULL. + * Notice that if inode is a directory there can be only one alias and + * it can be unhashed only if it has no children, or if it is the root + * of a filesystem. * * If the inode has a DCACHE_DISCONNECTED alias, then prefer - * any other hashed alias over that one. + * any other hashed alias over that one unless @want_discon is set, + * in which case only return a DCACHE_DISCONNECTED alias. */ -struct dentry * d_find_alias(struct inode *inode) +static struct dentry * __d_find_alias(struct inode *inode, int want_discon) { struct list_head *head, *next, *tmp; struct dentry *alias, *discon_alias=NULL; - spin_lock(&dcache_lock); head = &inode->i_dentry; next = inode->i_dentry.next; while (next != head) { @@ -301,57 +311,69 @@ struct dentry * d_find_alias(struct inode *inode) next = tmp->next; prefetch(next); alias = list_entry(tmp, struct dentry, d_alias); - if (!d_unhashed(alias)) { + if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { if (alias->d_flags & DCACHE_DISCONNECTED) discon_alias = alias; - else { + else if (!want_discon) { __dget_locked(alias); - spin_unlock(&dcache_lock); return alias; } } } if (discon_alias) __dget_locked(discon_alias); - spin_unlock(&dcache_lock); return discon_alias; } +struct dentry * d_find_alias(struct inode *inode) +{ + struct dentry *de = NULL; + + if (!list_empty(&inode->i_dentry)) { + spin_lock(&dcache_lock); + de = __d_find_alias(inode, 0); + spin_unlock(&dcache_lock); + } + return de; +} + /* * Try to kill dentries associated with this inode. * WARNING: you must own a reference to inode. */ void d_prune_aliases(struct inode *inode) { - struct list_head *tmp, *head = &inode->i_dentry; + struct dentry *dentry; restart: spin_lock(&dcache_lock); - tmp = head; - while ((tmp = tmp->next) != head) { - struct dentry *dentry = list_entry(tmp, struct dentry, d_alias); + list_for_each_entry(dentry, &inode->i_dentry, d_alias) { + spin_lock(&dentry->d_lock); if (!atomic_read(&dentry->d_count)) { __dget_locked(dentry); __d_drop(dentry); + spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); dput(dentry); goto restart; } + spin_unlock(&dentry->d_lock); } spin_unlock(&dcache_lock); } /* - * Throw away a dentry - free the inode, dput the parent. - * This requires that the LRU list has already been - * removed. + * Throw away a dentry - free the inode, dput the parent. This requires that + * the LRU list has already been removed. + * * Called with dcache_lock, drops it and then regains. + * Called with dentry->d_lock held, drops it. */ -static inline void prune_one_dentry(struct dentry * dentry) +static void prune_one_dentry(struct dentry * dentry) { struct dentry * parent; __d_drop(dentry); - list_del(&dentry->d_child); + list_del(&dentry->d_u.d_child); dentry_stat.nr_dentry--; /* For d_free, below */ dentry_iput(dentry); parent = dentry->d_parent; @@ -364,6 +386,8 @@ static inline void prune_one_dentry(struct dentry * dentry) /** * prune_dcache - shrink the dcache * @count: number of entries to try and free + * @sb: if given, ignore dentries for other superblocks + * which are being unmounted. * * Shrink the dcache. This is done when we need * more memory, or simply when we need to unmount @@ -374,14 +398,29 @@ static inline void prune_one_dentry(struct dentry * dentry) * all the dentries are in use. */ -static void prune_dcache(int count) +static void prune_dcache(int count, struct super_block *sb) { spin_lock(&dcache_lock); for (; count ; count--) { struct dentry *dentry; struct list_head *tmp; + struct rw_semaphore *s_umount; + + cond_resched_lock(&dcache_lock); tmp = dentry_unused.prev; + if (sb) { + /* Try to find a dentry for this sb, but don't try + * too hard, if they aren't near the tail they will + * be moved down again soon + */ + int skip = count; + while (skip && tmp != &dentry_unused && + list_entry(tmp, struct dentry, d_lru)->d_sb != sb) { + skip--; + tmp = tmp->prev; + } + } if (tmp == &dentry_unused) break; list_del_init(tmp); @@ -407,7 +446,45 @@ static void prune_dcache(int count) spin_unlock(&dentry->d_lock); continue; } - prune_one_dentry(dentry); + /* + * If the dentry is not DCACHED_REFERENCED, it is time + * to remove it from the dcache, provided the super block is + * NULL (which means we are trying to reclaim memory) + * or this dentry belongs to the same super block that + * we want to shrink. + */ + /* + * If this dentry is for "my" filesystem, then I can prune it + * without taking the s_umount lock (I already hold it). + */ + if (sb && dentry->d_sb == sb) { + prune_one_dentry(dentry); + continue; + } + /* + * ...otherwise we need to be sure this filesystem isn't being + * unmounted, otherwise we could race with + * generic_shutdown_super(), and end up holding a reference to + * an inode while the filesystem is unmounted. + * So we try to get s_umount, and make sure s_root isn't NULL. + * (Take a local copy of s_umount to avoid a use-after-free of + * `dentry'). + */ + s_umount = &dentry->d_sb->s_umount; + if (down_read_trylock(s_umount)) { + if (dentry->d_sb->s_root != NULL) { + prune_one_dentry(dentry); + up_read(s_umount); + continue; + } + up_read(s_umount); + } + spin_unlock(&dentry->d_lock); + /* Cannot remove the first dentry, and it isn't appropriate + * to move it to the head of the list, so give up, and try + * later + */ + break; } spin_unlock(&dcache_lock); } @@ -444,25 +521,18 @@ void shrink_dcache_sb(struct super_block * sb) * superblock to the most recent end of the unused list. */ spin_lock(&dcache_lock); - next = dentry_unused.next; - while (next != &dentry_unused) { - tmp = next; - next = tmp->next; + list_for_each_safe(tmp, next, &dentry_unused) { dentry = list_entry(tmp, struct dentry, d_lru); if (dentry->d_sb != sb) continue; - list_del(tmp); - list_add(tmp, &dentry_unused); + list_move(tmp, &dentry_unused); } /* * Pass two ... free the dentries for this superblock. */ repeat: - next = dentry_unused.next; - while (next != &dentry_unused) { - tmp = next; - next = tmp->next; + list_for_each_safe(tmp, next, &dentry_unused) { dentry = list_entry(tmp, struct dentry, d_lru); if (dentry->d_sb != sb) continue; @@ -474,11 +544,142 @@ repeat: continue; } prune_one_dentry(dentry); + cond_resched_lock(&dcache_lock); goto repeat; } spin_unlock(&dcache_lock); } +/* + * destroy a single subtree of dentries for unmount + * - see the comments on shrink_dcache_for_umount() for a description of the + * locking + */ +static void shrink_dcache_for_umount_subtree(struct dentry *dentry) +{ + struct dentry *parent; + + BUG_ON(!IS_ROOT(dentry)); + + /* detach this root from the system */ + spin_lock(&dcache_lock); + if (!list_empty(&dentry->d_lru)) { + dentry_stat.nr_unused--; + list_del_init(&dentry->d_lru); + } + __d_drop(dentry); + spin_unlock(&dcache_lock); + + for (;;) { + /* descend to the first leaf in the current subtree */ + while (!list_empty(&dentry->d_subdirs)) { + struct dentry *loop; + + /* this is a branch with children - detach all of them + * from the system in one go */ + spin_lock(&dcache_lock); + list_for_each_entry(loop, &dentry->d_subdirs, + d_u.d_child) { + if (!list_empty(&loop->d_lru)) { + dentry_stat.nr_unused--; + list_del_init(&loop->d_lru); + } + + __d_drop(loop); + cond_resched_lock(&dcache_lock); + } + spin_unlock(&dcache_lock); + + /* move to the first child */ + dentry = list_entry(dentry->d_subdirs.next, + struct dentry, d_u.d_child); + } + + /* consume the dentries from this leaf up through its parents + * until we find one with children or run out altogether */ + do { + struct inode *inode; + + if (atomic_read(&dentry->d_count) != 0) { + printk(KERN_ERR + "BUG: Dentry %p{i=%lx,n=%s}" + " still in use (%d)" + " [unmount of %s %s]\n", + dentry, + dentry->d_inode ? + dentry->d_inode->i_ino : 0UL, + dentry->d_name.name, + atomic_read(&dentry->d_count), + dentry->d_sb->s_type->name, + dentry->d_sb->s_id); + BUG(); + } + + parent = dentry->d_parent; + if (parent == dentry) + parent = NULL; + else + atomic_dec(&parent->d_count); + + list_del(&dentry->d_u.d_child); + dentry_stat.nr_dentry--; /* For d_free, below */ + + inode = dentry->d_inode; + if (inode) { + dentry->d_inode = NULL; + list_del_init(&dentry->d_alias); + if (dentry->d_op && dentry->d_op->d_iput) + dentry->d_op->d_iput(dentry, inode); + else + iput(inode); + } + + d_free(dentry); + + /* finished when we fall off the top of the tree, + * otherwise we ascend to the parent and move to the + * next sibling if there is one */ + if (!parent) + return; + + dentry = parent; + + } while (list_empty(&dentry->d_subdirs)); + + dentry = list_entry(dentry->d_subdirs.next, + struct dentry, d_u.d_child); + } +} + +/* + * destroy the dentries attached to a superblock on unmounting + * - we don't need to use dentry->d_lock, and only need dcache_lock when + * removing the dentry from the system lists and hashes because: + * - the superblock is detached from all mountings and open files, so the + * dentry trees will not be rearranged by the VFS + * - s_umount is write-locked, so the memory pressure shrinker will ignore + * any dentries belonging to this superblock that it comes across + * - the filesystem itself is no longer permitted to rearrange the dentries + * in this superblock + */ +void shrink_dcache_for_umount(struct super_block *sb) +{ + struct dentry *dentry; + + if (down_read_trylock(&sb->s_umount)) + BUG(); + + dentry = sb->s_root; + sb->s_root = NULL; + atomic_dec(&dentry->d_count); + shrink_dcache_for_umount_subtree(dentry); + + while (!hlist_empty(&sb->s_anon)) { + dentry = hlist_entry(sb->s_anon.first, struct dentry, d_hash); + shrink_dcache_for_umount_subtree(dentry); + } +} + /* * Search for at least 1 mount point in the dentry's subdirs. * We descend to the next level whenever the d_subdirs @@ -506,7 +707,7 @@ repeat: resume: while (next != &this_parent->d_subdirs) { struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_child); + struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; /* Have we found a mount point ? */ if (d_mountpoint(dentry)) @@ -520,7 +721,7 @@ resume: * All done at this level ... ascend and resume the search. */ if (this_parent != parent) { - next = this_parent->d_child.next; + next = this_parent->d_u.d_child.next; this_parent = this_parent->d_parent; goto resume; } @@ -537,6 +738,13 @@ positive: * list for prune_dcache(). We descend to the next level * whenever the d_subdirs list is non-empty and continue * searching. + * + * It returns zero iff there are no unused children, + * otherwise it returns the number of children moved to + * the end of the unused list. This may not be the total + * number of unused children, because select_parent can + * drop the lock and return early due to latency + * constraints. */ static int select_parent(struct dentry * parent) { @@ -550,7 +758,7 @@ repeat: resume: while (next != &this_parent->d_subdirs) { struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_child); + struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; if (!list_empty(&dentry->d_lru)) { @@ -562,19 +770,24 @@ resume: * of the unused list for prune_dcache */ if (!atomic_read(&dentry->d_count)) { - list_add(&dentry->d_lru, dentry_unused.prev); + list_add_tail(&dentry->d_lru, &dentry_unused); dentry_stat.nr_unused++; found++; } + + /* + * We can return to the caller if we have found some (this + * ensures forward progress). We'll be coming back to find + * the rest. + */ + if (found && need_resched()) + goto out; + /* * Descend a level if the d_subdirs list is non-empty. */ if (!list_empty(&dentry->d_subdirs)) { this_parent = dentry; -#ifdef DCACHE_DEBUG -printk(KERN_DEBUG "select_parent: descending to %s/%s, found=%d\n", -dentry->d_parent->d_name.name, dentry->d_name.name, found); -#endif goto repeat; } } @@ -582,14 +795,11 @@ dentry->d_parent->d_name.name, dentry->d_name.name, found); * All done at this level ... ascend and resume the search. */ if (this_parent != parent) { - next = this_parent->d_child.next; + next = this_parent->d_u.d_child.next; this_parent = this_parent->d_parent; -#ifdef DCACHE_DEBUG -printk(KERN_DEBUG "select_parent: ascending to %s/%s, found=%d\n", -this_parent->d_parent->d_name.name, this_parent->d_name.name, found); -#endif goto resume; } +out: spin_unlock(&dcache_lock); return found; } @@ -606,46 +816,7 @@ void shrink_dcache_parent(struct dentry * parent) int found; while ((found = select_parent(parent)) != 0) - prune_dcache(found); -} - -/** - * shrink_dcache_anon - further prune the cache - * @head: head of d_hash list of dentries to prune - * - * Prune the dentries that are anonymous - * - * parsing d_hash list does not read_barrier_depends() as it - * done under dcache_lock. - * - */ -void shrink_dcache_anon(struct hlist_head *head) -{ - struct hlist_node *lp; - int found; - do { - found = 0; - spin_lock(&dcache_lock); - hlist_for_each(lp, head) { - struct dentry *this = hlist_entry(lp, struct dentry, d_hash); - if (!list_empty(&this->d_lru)) { - dentry_stat.nr_unused--; - list_del(&this->d_lru); - } - - /* - * move only zero ref count dentries to the end - * of the unused list for prune_dcache - */ - if (!atomic_read(&this->d_count)) { - list_add_tail(&this->d_lru, &dentry_unused); - dentry_stat.nr_unused++; - found++; - } - } - spin_unlock(&dcache_lock); - prune_dcache(found); - } while(found); + prune_dcache(found, parent->d_sb); } /* @@ -660,14 +831,14 @@ void shrink_dcache_anon(struct hlist_head *head) * * In this case we return -1 to tell the caller that we baled. */ -static int shrink_dcache_memory(int nr, unsigned int gfp_mask) +static int shrink_dcache_memory(int nr, gfp_t gfp_mask) { if (nr) { if (!(gfp_mask & __GFP_FS)) return -1; - prune_dcache(nr); + prune_dcache(nr, NULL); } - return dentry_stat.nr_unused; + return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; } /** @@ -707,7 +878,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) atomic_set(&dentry->d_count, 1); dentry->d_flags = DCACHE_UNHASHED; - dentry->d_lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&dentry->d_lock); dentry->d_inode = NULL; dentry->d_parent = NULL; dentry->d_sb = NULL; @@ -715,8 +886,9 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) dentry->d_fsdata = NULL; dentry->d_extra_attributes = NULL; dentry->d_mounted = 0; +#ifdef CONFIG_PROFILING dentry->d_cookie = NULL; - dentry->d_bucket = NULL; +#endif INIT_HLIST_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_LIST_HEAD(&dentry->d_subdirs); @@ -726,18 +898,28 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) dentry->d_parent = dget(parent); dentry->d_sb = parent->d_sb; } else { - INIT_LIST_HEAD(&dentry->d_child); + INIT_LIST_HEAD(&dentry->d_u.d_child); } spin_lock(&dcache_lock); if (parent) - list_add(&dentry->d_child, &parent->d_subdirs); + list_add(&dentry->d_u.d_child, &parent->d_subdirs); dentry_stat.nr_dentry++; spin_unlock(&dcache_lock); return dentry; } +struct dentry *d_alloc_name(struct dentry *parent, const char *name) +{ + struct qstr q; + + q.name = name; + q.len = strlen(name); + q.hash = full_name_hash(q.name, q.len); + return d_alloc(parent, &q); +} + /** * d_instantiate - fill in inode information for a dentry * @entry: dentry to complete @@ -755,15 +937,88 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) void d_instantiate(struct dentry *entry, struct inode * inode) { - if (!list_empty(&entry->d_alias)) BUG(); + BUG_ON(!list_empty(&entry->d_alias)); spin_lock(&dcache_lock); if (inode) list_add(&entry->d_alias, &inode->i_dentry); entry->d_inode = inode; + fsnotify_d_instantiate(entry, inode); spin_unlock(&dcache_lock); security_d_instantiate(entry, inode); } +/** + * d_instantiate_unique - instantiate a non-aliased dentry + * @entry: dentry to instantiate + * @inode: inode to attach to this dentry + * + * Fill in inode information in the entry. On success, it returns NULL. + * If an unhashed alias of "entry" already exists, then we return the + * aliased dentry instead and drop one reference to inode. + * + * Note that in order to avoid conflicts with rename() etc, the caller + * had better be holding the parent directory semaphore. + * + * This also assumes that the inode count has been incremented + * (or otherwise set) by the caller to indicate that it is now + * in use by the dcache. + */ +static struct dentry *__d_instantiate_unique(struct dentry *entry, + struct inode *inode) +{ + struct dentry *alias; + int len = entry->d_name.len; + const char *name = entry->d_name.name; + unsigned int hash = entry->d_name.hash; + + if (!inode) { + entry->d_inode = NULL; + return NULL; + } + + list_for_each_entry(alias, &inode->i_dentry, d_alias) { + struct qstr *qstr = &alias->d_name; + + if (qstr->hash != hash) + continue; + if (alias->d_parent != entry->d_parent) + continue; + if (qstr->len != len) + continue; + if (memcmp(qstr->name, name, len)) + continue; + dget_locked(alias); + return alias; + } + + list_add(&entry->d_alias, &inode->i_dentry); + entry->d_inode = inode; + fsnotify_d_instantiate(entry, inode); + return NULL; +} + +struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) +{ + struct dentry *result; + + BUG_ON(!list_empty(&entry->d_alias)); + + spin_lock(&dcache_lock); + result = __d_instantiate_unique(entry, inode); + spin_unlock(&dcache_lock); + + if (!result) { + security_d_instantiate(entry, inode); + return NULL; + } + + BUG_ON(!d_unhashed(result)); + iput(inode); + return result; +} + +EXPORT_SYMBOL(d_instantiate_unique); + /** * d_alloc_root - allocate root dentry * @root_inode: inode to allocate the root for @@ -836,33 +1091,21 @@ struct dentry * d_alloc_anon(struct inode *inode) tmp->d_parent = tmp; /* make sure dput doesn't croak */ spin_lock(&dcache_lock); - if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) { - /* A directory can only have one dentry. - * This (now) has one, so use it. - */ - res = list_entry(inode->i_dentry.next, struct dentry, d_alias); - __dget_locked(res); - } else { + res = __d_find_alias(inode, 0); + if (!res) { /* attach a disconnected dentry */ res = tmp; tmp = NULL; - if (res) { - spin_lock(&res->d_lock); - res->d_sb = inode->i_sb; - res->d_parent = res; - res->d_inode = inode; + spin_lock(&res->d_lock); + res->d_sb = inode->i_sb; + res->d_parent = res; + res->d_inode = inode; + res->d_flags |= DCACHE_DISCONNECTED; + res->d_flags &= ~DCACHE_UNHASHED; + list_add(&res->d_alias, &inode->i_dentry); + hlist_add_head(&res->d_hash, &inode->i_sb->s_anon); + spin_unlock(&res->d_lock); - /* - * Set d_bucket to an "impossible" bucket address so - * that d_move() doesn't get a false positive - */ - res->d_bucket = NULL; - res->d_flags |= DCACHE_DISCONNECTED; - res->d_flags &= ~DCACHE_UNHASHED; - list_add(&res->d_alias, &inode->i_dentry); - hlist_add_head(&res->d_hash, &inode->i_sb->s_anon); - spin_unlock(&res->d_lock); - } inode = NULL; /* don't drop reference */ } spin_unlock(&dcache_lock); @@ -884,7 +1127,7 @@ struct dentry * d_alloc_anon(struct inode *inode) * DCACHE_DISCONNECTED), then d_move that in place of the given dentry * and return it, else simply d_add the inode to the dentry and return NULL. * - * This is (will be) needed in the lookup routine of any filesystem that is exportable + * This is needed in the lookup routine of any filesystem that is exportable * (via knfsd) so that we can build dcache paths to directories effectively. * * If a dentry was found and moved, then it is returned. Otherwise NULL @@ -895,11 +1138,12 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) { struct dentry *new = NULL; - if (inode && S_ISDIR(inode->i_mode)) { + if (inode) { spin_lock(&dcache_lock); - if (!list_empty(&inode->i_dentry)) { - new = list_entry(inode->i_dentry.next, struct dentry, d_alias); - __dget_locked(new); + new = __d_find_alias(inode, 1); + if (new) { + BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); + fsnotify_d_instantiate(new, inode); spin_unlock(&dcache_lock); security_d_instantiate(new, inode); d_rehash(dentry); @@ -909,6 +1153,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) /* d_instantiate takes dcache_lock, so we do it by hand */ list_add(&dentry->d_alias, &inode->i_dentry); dentry->d_inode = inode; + fsnotify_d_instantiate(dentry, inode); spin_unlock(&dcache_lock); security_d_instantiate(dentry, inode); d_rehash(dentry); @@ -970,18 +1215,13 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) struct hlist_head *head = d_hash(parent,hash); struct dentry *found = NULL; struct hlist_node *node; + struct dentry *dentry; rcu_read_lock(); - hlist_for_each (node, head) { - struct dentry *dentry; + hlist_for_each_entry_rcu(dentry, node, head, d_hash) { struct qstr *qstr; - smp_read_barrier_depends(); - dentry = hlist_entry(node, struct dentry, d_hash); - - smp_rmb(); - if (dentry->d_name.hash != hash) continue; if (dentry->d_parent != parent) @@ -989,13 +1229,6 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) spin_lock(&dentry->d_lock); - /* - * If lookup ends up in a different bucket due to concurrent - * rename, fail it - */ - if (unlikely(dentry->d_bucket != head)) - goto terminate; - /* * Recheck the dentry after taking the lock - d_move may have * changed things. Don't bother checking the hash because we're @@ -1004,8 +1237,11 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) if (dentry->d_parent != parent) goto next; + /* + * It is safe to compare names since d_move() cannot + * change the qstr (protected by d_lock). + */ qstr = &dentry->d_name; - smp_read_barrier_depends(); if (parent->d_op && parent->d_op->d_compare) { if (parent->d_op->d_compare(parent, qstr, name)) goto next; @@ -1020,7 +1256,6 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) atomic_inc(&dentry->d_count); found = dentry; } -terminate: spin_unlock(&dentry->d_lock); break; next: @@ -1031,6 +1266,32 @@ next: return found; } +/** + * d_hash_and_lookup - hash the qstr then search for a dentry + * @dir: Directory to search in + * @name: qstr of name we wish to find + * + * On hash failure or on lookup failure NULL is returned. + */ +struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) +{ + struct dentry *dentry = NULL; + + /* + * Check for a fs-specific hash function. Note that we must + * calculate the standard hash first, as the d_op->d_hash() + * routine may choose to leave the hash value unchanged. + */ + name->hash = full_name_hash(name->name, name->len); + if (dir->d_op && dir->d_op->d_hash) { + if (dir->d_op->d_hash(dir, name) < 0) + goto out; + } + dentry = d_lookup(dir, name); +out: + return dentry; +} + /** * d_validate - verify dentry provided from insecure source * @dentry: The dentry alleged to be valid child of @dparent @@ -1058,7 +1319,7 @@ int d_validate(struct dentry *dentry, struct dentry *dparent) spin_lock(&dcache_lock); base = d_hash(dparent, dentry->d_name.hash); hlist_for_each(lhp,base) { - /* read_barrier_depends() not required for d_hash list + /* hlist_for_each_entry_rcu() not required for d_hash list * as it is parsed under dcache_lock */ if (dentry == hlist_entry(lhp, struct dentry, d_hash)) { @@ -1095,13 +1356,19 @@ out: void d_delete(struct dentry * dentry) { + int isdir = 0; /* * Are we the only user? */ spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); + isdir = S_ISDIR(dentry->d_inode->i_mode); if (atomic_read(&dentry->d_count) == 1) { dentry_iput(dentry); + fsnotify_nameremove(dentry, isdir); + + /* remove this and other inotify debug checks after 2.6.18 */ + dentry->d_flags &= ~DCACHE_INOTIFY_PARENT_WATCHED; return; } @@ -1110,6 +1377,20 @@ void d_delete(struct dentry * dentry) spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); + + fsnotify_nameremove(dentry, isdir); +} + +static void __d_rehash(struct dentry * entry, struct hlist_head *list) +{ + + entry->d_flags &= ~DCACHE_UNHASHED; + hlist_add_head_rcu(&entry->d_hash, list); +} + +static void _d_rehash(struct dentry * entry) +{ + __d_rehash(entry, d_hash(entry->d_parent, entry->d_name.hash)); } /** @@ -1121,14 +1402,10 @@ void d_delete(struct dentry * dentry) void d_rehash(struct dentry * entry) { - struct hlist_head *list = d_hash(entry->d_parent, entry->d_name.hash); - spin_lock(&dcache_lock); spin_lock(&entry->d_lock); - entry->d_flags &= ~DCACHE_UNHASHED; + _d_rehash(entry); spin_unlock(&entry->d_lock); - entry->d_bucket = list; - hlist_add_head_rcu(&entry->d_hash, list); spin_unlock(&dcache_lock); } @@ -1206,6 +1483,8 @@ static void switch_names(struct dentry *dentry, struct dentry *target) void d_move(struct dentry * dentry, struct dentry * target) { + struct hlist_head *list; + if (!dentry->d_inode) printk(KERN_WARNING "VFS: moving negative dcache entry\n"); @@ -1216,42 +1495,40 @@ void d_move(struct dentry * dentry, struct dentry * target) */ if (target < dentry) { spin_lock(&target->d_lock); - spin_lock(&dentry->d_lock); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); } else { spin_lock(&dentry->d_lock); - spin_lock(&target->d_lock); + spin_lock_nested(&target->d_lock, DENTRY_D_LOCK_NESTED); } /* Move the dentry to the target hash queue, if on different bucket */ if (dentry->d_flags & DCACHE_UNHASHED) goto already_unhashed; - if (dentry->d_bucket != target->d_bucket) { - hlist_del_rcu(&dentry->d_hash); + + hlist_del_rcu(&dentry->d_hash); + already_unhashed: - dentry->d_bucket = target->d_bucket; - hlist_add_head_rcu(&dentry->d_hash, target->d_bucket); - dentry->d_flags &= ~DCACHE_UNHASHED; - } + list = d_hash(target->d_parent, target->d_name.hash); + __d_rehash(dentry, list); /* Unhash the target: dput() will then get rid of it */ __d_drop(target); - /* flush any possible attributes */ - if (dentry->d_extra_attributes) { - kfree(dentry->d_extra_attributes); - dentry->d_extra_attributes = NULL; - } - if (target->d_extra_attributes) { - kfree(target->d_extra_attributes); - target->d_extra_attributes = NULL; - } + /* flush any possible attributes */ + if (dentry->d_extra_attributes) { + kfree(dentry->d_extra_attributes); + dentry->d_extra_attributes = NULL; + } + if (target->d_extra_attributes) { + kfree(target->d_extra_attributes); + target->d_extra_attributes = NULL; + } - list_del(&dentry->d_child); - list_del(&target->d_child); + list_del(&dentry->d_u.d_child); + list_del(&target->d_u.d_child); /* Switch the names.. */ switch_names(dentry, target); - smp_wmb(); do_switch(dentry->d_name.len, target->d_name.len); do_switch(dentry->d_name.hash, target->d_name.hash); @@ -1259,21 +1536,136 @@ already_unhashed: if (IS_ROOT(dentry)) { dentry->d_parent = target->d_parent; target->d_parent = target; - INIT_LIST_HEAD(&target->d_child); + INIT_LIST_HEAD(&target->d_u.d_child); } else { do_switch(dentry->d_parent, target->d_parent); /* And add them back to the (new) parent lists */ - list_add(&target->d_child, &target->d_parent->d_subdirs); + list_add(&target->d_u.d_child, &target->d_parent->d_subdirs); } - list_add(&dentry->d_child, &dentry->d_parent->d_subdirs); + list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); spin_unlock(&target->d_lock); + fsnotify_d_move(dentry); spin_unlock(&dentry->d_lock); write_sequnlock(&rename_lock); spin_unlock(&dcache_lock); } +/* + * Prepare an anonymous dentry for life in the superblock's dentry tree as a + * named dentry in place of the dentry to be replaced. + */ +static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) +{ + struct dentry *dparent, *aparent; + + switch_names(dentry, anon); + do_switch(dentry->d_name.len, anon->d_name.len); + do_switch(dentry->d_name.hash, anon->d_name.hash); + + dparent = dentry->d_parent; + aparent = anon->d_parent; + + dentry->d_parent = (aparent == anon) ? dentry : aparent; + list_del(&dentry->d_u.d_child); + if (!IS_ROOT(dentry)) + list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); + else + INIT_LIST_HEAD(&dentry->d_u.d_child); + + anon->d_parent = (dparent == dentry) ? anon : dparent; + list_del(&anon->d_u.d_child); + if (!IS_ROOT(anon)) + list_add(&anon->d_u.d_child, &anon->d_parent->d_subdirs); + else + INIT_LIST_HEAD(&anon->d_u.d_child); + + anon->d_flags &= ~DCACHE_DISCONNECTED; +} + +/** + * d_materialise_unique - introduce an inode into the tree + * @dentry: candidate dentry + * @inode: inode to bind to the dentry, to which aliases may be attached + * + * Introduces an dentry into the tree, substituting an extant disconnected + * root directory alias in its place if there is one + */ +struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) +{ + struct dentry *alias, *actual; + + BUG_ON(!d_unhashed(dentry)); + + spin_lock(&dcache_lock); + + if (!inode) { + actual = dentry; + dentry->d_inode = NULL; + goto found_lock; + } + + /* See if a disconnected directory already exists as an anonymous root + * that we should splice into the tree instead */ + if (S_ISDIR(inode->i_mode) && (alias = __d_find_alias(inode, 1))) { + spin_lock(&alias->d_lock); + + /* Is this a mountpoint that we could splice into our tree? */ + if (IS_ROOT(alias)) + goto connect_mountpoint; + + if (alias->d_name.len == dentry->d_name.len && + alias->d_parent == dentry->d_parent && + memcmp(alias->d_name.name, + dentry->d_name.name, + dentry->d_name.len) == 0) + goto replace_with_alias; + + spin_unlock(&alias->d_lock); + + /* Doh! Seem to be aliasing directories for some reason... */ + dput(alias); + } + + /* Add a unique reference */ + actual = __d_instantiate_unique(dentry, inode); + if (!actual) + actual = dentry; + else if (unlikely(!d_unhashed(actual))) + goto shouldnt_be_hashed; + +found_lock: + spin_lock(&actual->d_lock); +found: + _d_rehash(actual); + spin_unlock(&actual->d_lock); + spin_unlock(&dcache_lock); + + if (actual == dentry) { + security_d_instantiate(dentry, inode); + return NULL; + } + + iput(inode); + return actual; + + /* Convert the anonymous/root alias into an ordinary dentry */ +connect_mountpoint: + __d_materialise_dentry(dentry, alias); + + /* Replace the candidate dentry with the alias in the tree */ +replace_with_alias: + __d_drop(alias); + actual = alias; + goto found; + +shouldnt_be_hashed: + spin_unlock(&dcache_lock); + BUG(); + goto shouldnt_be_hashed; +} + /** * d_path - return the path of a dentry * @dentry: dentry to report @@ -1470,7 +1862,6 @@ int is_subdir(struct dentry * new_dentry, struct dentry * old_dentry) struct dentry * saved = new_dentry; unsigned long seq; - result = 0; /* need rcu_readlock to protect against the d_parent trashing due to * d_move */ @@ -1478,6 +1869,7 @@ int is_subdir(struct dentry * new_dentry, struct dentry * old_dentry) do { /* for restarting inner loop in case of seq retry */ new_dentry = saved; + result = 0; seq = read_seqbegin(&rename_lock); for (;;) { if (new_dentry != old_dentry) { @@ -1507,7 +1899,7 @@ repeat: resume: while (next != &this_parent->d_subdirs) { struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_child); + struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; if (d_unhashed(dentry)||!dentry->d_inode) continue; @@ -1518,7 +1910,7 @@ resume: atomic_dec(&dentry->d_count); } if (this_parent != root) { - next = this_parent->d_child.next; + next = this_parent->d_u.d_child.next; atomic_dec(&this_parent->d_count); this_parent = this_parent->d_parent; goto resume; @@ -1545,26 +1937,12 @@ ino_t find_inode_number(struct dentry *dir, struct qstr *name) struct dentry * dentry; ino_t ino = 0; - /* - * Check for a fs-specific hash function. Note that we must - * calculate the standard hash first, as the d_op->d_hash() - * routine may choose to leave the hash value unchanged. - */ - name->hash = full_name_hash(name->name, name->len); - if (dir->d_op && dir->d_op->d_hash) - { - if (dir->d_op->d_hash(dir, name) != 0) - goto out; - } - - dentry = d_lookup(dir, name); - if (dentry) - { + dentry = d_hash_and_lookup(dir, name); + if (dentry) { if (dentry->d_inode) ino = dentry->d_inode->i_ino; dput(dentry); } -out: return ino; } @@ -1578,6 +1956,30 @@ static int __init set_dhash_entries(char *str) } __setup("dhash_entries=", set_dhash_entries); +static void __init dcache_init_early(void) +{ + int loop; + + /* If hashes are distributed across NUMA nodes, defer + * hash allocation until vmalloc space is available. + */ + if (hashdist) + return; + + dentry_hashtable = + alloc_large_system_hash("Dentry cache", + sizeof(struct hlist_head), + dhash_entries, + 13, + HASH_EARLY, + &d_hash_shift, + &d_hash_mask, + 0); + + for (loop = 0; loop < (1 << d_hash_shift); loop++) + INIT_HLIST_HEAD(&dentry_hashtable[loop]); +} + void flush_dentry_attributes (void) { struct hlist_node *tmp; @@ -1597,10 +1999,7 @@ EXPORT_SYMBOL_GPL(flush_dentry_attributes); static void __init dcache_init(unsigned long mempages) { - struct hlist_head *d; - unsigned long order; - unsigned int nr_hash; - int i; + int loop; /* * A constructor could be added for stable state like the lists, @@ -1610,65 +2009,47 @@ static void __init dcache_init(unsigned long mempages) dentry_cache = kmem_cache_create("dentry_cache", sizeof(struct dentry), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_PANIC, + (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| + SLAB_MEM_SPREAD), NULL, NULL); set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory); - if (!dhash_entries) - dhash_entries = PAGE_SHIFT < 13 ? - mempages >> (13 - PAGE_SHIFT) : - mempages << (PAGE_SHIFT - 13); - - dhash_entries *= sizeof(struct hlist_head); - for (order = 0; ((1UL << order) << PAGE_SHIFT) < dhash_entries; order++) - ; - - if (order > 5) - order = 5; - - do { - unsigned long tmp; - - nr_hash = (1UL << order) * PAGE_SIZE / - sizeof(struct hlist_head); - d_hash_mask = (nr_hash - 1); - - tmp = nr_hash; - d_hash_shift = 0; - while ((tmp >>= 1UL) != 0UL) - d_hash_shift++; - - dentry_hashtable = (struct hlist_head *) - __get_free_pages(GFP_ATOMIC, order); - } while (dentry_hashtable == NULL && --order >= 0); - - printk(KERN_INFO "Dentry cache hash table entries: %d (order: %ld, %ld bytes)\n", - nr_hash, order, (PAGE_SIZE << order)); - - if (!dentry_hashtable) - panic("Failed to allocate dcache hash table\n"); - - d = dentry_hashtable; - i = nr_hash; - do { - INIT_HLIST_HEAD(d); - d++; - i--; - } while (i); + /* Hash may have been set up in dcache_init_early */ + if (!hashdist) + return; + + dentry_hashtable = + alloc_large_system_hash("Dentry cache", + sizeof(struct hlist_head), + dhash_entries, + 13, + 0, + &d_hash_shift, + &d_hash_mask, + 0); + + for (loop = 0; loop < (1 << d_hash_shift); loop++) + INIT_HLIST_HEAD(&dentry_hashtable[loop]); } /* SLAB cache for __getname() consumers */ -kmem_cache_t *names_cachep; +kmem_cache_t *names_cachep __read_mostly; /* SLAB cache for file structures */ -kmem_cache_t *filp_cachep; +kmem_cache_t *filp_cachep __read_mostly; EXPORT_SYMBOL(d_genocide); extern void bdev_cache_init(void); extern void chrdev_init(void); +void __init vfs_caches_init_early(void) +{ + dcache_init_early(); + inode_init_early(); +} + void __init vfs_caches_init(unsigned long mempages) { unsigned long reserve; @@ -1683,7 +2064,7 @@ void __init vfs_caches_init(unsigned long mempages) SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, filp_ctor, filp_dtor); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); dcache_init(mempages); inode_init(mempages); @@ -1702,6 +2083,7 @@ EXPORT_SYMBOL(d_instantiate); EXPORT_SYMBOL(d_invalidate); EXPORT_SYMBOL(d_lookup); EXPORT_SYMBOL(d_move); +EXPORT_SYMBOL_GPL(d_materialise_unique); EXPORT_SYMBOL(d_path); EXPORT_SYMBOL(d_prune_aliases); EXPORT_SYMBOL(d_rehash); @@ -1711,8 +2093,6 @@ EXPORT_SYMBOL(dget_locked); EXPORT_SYMBOL(dput); EXPORT_SYMBOL(find_inode_number); EXPORT_SYMBOL(have_submounts); -EXPORT_SYMBOL(is_subdir); EXPORT_SYMBOL(names_cachep); -EXPORT_SYMBOL(shrink_dcache_anon); EXPORT_SYMBOL(shrink_dcache_parent); EXPORT_SYMBOL(shrink_dcache_sb);