X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=fs%2Fdcache.c;h=5e279e6dfafc0e50bdb03bee048a66405e84e8c2;hb=43bc926fffd92024b46cafaf7350d669ba9ca884;hp=d4fa197bd613a651e8752542d5760dda571d413f;hpb=5273a3df6485dc2ad6aa7ddd441b9a21970f003b;p=linux-2.6.git diff --git a/fs/dcache.c b/fs/dcache.c index d4fa197bd..5e279e6df 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -15,12 +15,15 @@ */ #include +#include #include #include #include +#include #include #include #include +#include #include #include #include @@ -29,16 +32,20 @@ #include #include #include +#include -#define DCACHE_PARANOIA 1 -/* #define DCACHE_DEBUG 1 */ -spinlock_t dcache_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; -seqlock_t rename_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; +int sysctl_vfs_cache_pressure __read_mostly = 100; +EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); + + __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock); +static seqlock_t rename_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; EXPORT_SYMBOL(dcache_lock); -static kmem_cache_t *dentry_cache; +static kmem_cache_t *dentry_cache __read_mostly; + +#define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname)) /* * This is the single most critical data structure when it comes @@ -51,9 +58,9 @@ static kmem_cache_t *dentry_cache; #define D_HASHBITS d_hash_shift #define D_HASHMASK d_hash_mask -static unsigned int d_hash_mask; -static unsigned int d_hash_shift; -static struct hlist_head *dentry_hashtable; +static unsigned int d_hash_mask __read_mostly; +static unsigned int d_hash_shift __read_mostly; +static struct hlist_head *dentry_hashtable __read_mostly; static LIST_HEAD(dentry_unused); /* Statistics gathering. */ @@ -61,13 +68,12 @@ struct dentry_stat_t dentry_stat = { .age_limit = 45, }; -static void d_callback(void *arg) +static void d_callback(struct rcu_head *head) { - struct dentry * dentry = (struct dentry *)arg; + struct dentry * dentry = container_of(head, struct dentry, d_u.d_rcu); - if (dname_external(dentry)) { - kfree(dentry->d_qstr); - } + if (dname_external(dentry)) + kfree(dentry->d_name.name); kmem_cache_free(dentry_cache, dentry); } @@ -79,7 +85,11 @@ static void d_free(struct dentry *dentry) { if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); - call_rcu(&dentry->d_rcu, d_callback, dentry); + if (dentry->d_extra_attributes) { + kfree(dentry->d_extra_attributes); + dentry->d_extra_attributes = NULL; + } + call_rcu(&dentry->d_u.d_rcu, d_callback); } /* @@ -87,7 +97,7 @@ static void d_free(struct dentry *dentry) * d_iput() operation if defined. * Called with dcache_lock and per dentry lock held, drops both. */ -static inline void dentry_iput(struct dentry * dentry) +static void dentry_iput(struct dentry * dentry) { struct inode *inode = dentry->d_inode; if (inode) { @@ -95,6 +105,8 @@ static inline void dentry_iput(struct dentry * dentry) list_del_init(&dentry->d_alias); spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); + if (!inode->i_nlink) + fsnotify_inoderemove(inode); if (dentry->d_op && dentry->d_op->d_iput) dentry->d_op->d_iput(dentry, inode); else @@ -140,6 +152,8 @@ void dput(struct dentry *dentry) return; repeat: + if (atomic_read(&dentry->d_count) == 1) + might_sleep(); if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock)) return; @@ -149,7 +163,7 @@ repeat: spin_unlock(&dcache_lock); return; } - + /* * AV: ->d_delete() is _NOT_ allowed to block now. */ @@ -161,7 +175,7 @@ repeat: if (d_unhashed(dentry)) goto kill_it; if (list_empty(&dentry->d_lru)) { - dentry->d_vfs_flags |= DCACHE_REFERENCED; + dentry->d_flags |= DCACHE_REFERENCED; list_add(&dentry->d_lru, &dentry_unused); dentry_stat.nr_unused++; } @@ -182,7 +196,7 @@ kill_it: { list_del(&dentry->d_lru); dentry_stat.nr_unused--; } - list_del(&dentry->d_child); + list_del(&dentry->d_u.d_child); dentry_stat.nr_dentry--; /* For d_free, below */ /*drops the locks, at that point nobody can reach this dentry */ dentry_iput(dentry); @@ -257,7 +271,7 @@ int d_invalidate(struct dentry * dentry) static inline struct dentry * __dget_locked(struct dentry *dentry) { atomic_inc(&dentry->d_count); - if (atomic_read(&dentry->d_count) == 1) { + if (!list_empty(&dentry->d_lru)) { dentry_stat.nr_unused--; list_del_init(&dentry->d_lru); } @@ -272,22 +286,25 @@ struct dentry * dget_locked(struct dentry *dentry) /** * d_find_alias - grab a hashed alias of inode * @inode: inode in question + * @want_discon: flag, used by d_splice_alias, to request + * that only a DISCONNECTED alias be returned. * - * If inode has a hashed alias - acquire the reference to alias and - * return it. Otherwise return NULL. Notice that if inode is a directory - * there can be only one alias and it can be unhashed only if it has - * no children. + * If inode has a hashed alias, or is a directory and has any alias, + * acquire the reference to alias and return it. Otherwise return NULL. + * Notice that if inode is a directory there can be only one alias and + * it can be unhashed only if it has no children, or if it is the root + * of a filesystem. * * If the inode has a DCACHE_DISCONNECTED alias, then prefer - * any other hashed alias over that one. + * any other hashed alias over that one unless @want_discon is set, + * in which case only return a DCACHE_DISCONNECTED alias. */ -struct dentry * d_find_alias(struct inode *inode) +static struct dentry * __d_find_alias(struct inode *inode, int want_discon) { struct list_head *head, *next, *tmp; struct dentry *alias, *discon_alias=NULL; - spin_lock(&dcache_lock); head = &inode->i_dentry; next = inode->i_dentry.next; while (next != head) { @@ -295,41 +312,52 @@ struct dentry * d_find_alias(struct inode *inode) next = tmp->next; prefetch(next); alias = list_entry(tmp, struct dentry, d_alias); - if (!d_unhashed(alias)) { + if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { if (alias->d_flags & DCACHE_DISCONNECTED) discon_alias = alias; - else { + else if (!want_discon) { __dget_locked(alias); - spin_unlock(&dcache_lock); return alias; } } } if (discon_alias) __dget_locked(discon_alias); - spin_unlock(&dcache_lock); return discon_alias; } +struct dentry * d_find_alias(struct inode *inode) +{ + struct dentry *de = NULL; + + if (!list_empty(&inode->i_dentry)) { + spin_lock(&dcache_lock); + de = __d_find_alias(inode, 0); + spin_unlock(&dcache_lock); + } + return de; +} + /* * Try to kill dentries associated with this inode. * WARNING: you must own a reference to inode. */ void d_prune_aliases(struct inode *inode) { - struct list_head *tmp, *head = &inode->i_dentry; + struct dentry *dentry; restart: spin_lock(&dcache_lock); - tmp = head; - while ((tmp = tmp->next) != head) { - struct dentry *dentry = list_entry(tmp, struct dentry, d_alias); + list_for_each_entry(dentry, &inode->i_dentry, d_alias) { + spin_lock(&dentry->d_lock); if (!atomic_read(&dentry->d_count)) { __dget_locked(dentry); __d_drop(dentry); + spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); dput(dentry); goto restart; } + spin_unlock(&dentry->d_lock); } spin_unlock(&dcache_lock); } @@ -345,7 +373,7 @@ static inline void prune_one_dentry(struct dentry * dentry) struct dentry * parent; __d_drop(dentry); - list_del(&dentry->d_child); + list_del(&dentry->d_u.d_child); dentry_stat.nr_dentry--; /* For d_free, below */ dentry_iput(dentry); parent = dentry->d_parent; @@ -375,6 +403,8 @@ static void prune_dcache(int count) struct dentry *dentry; struct list_head *tmp; + cond_resched_lock(&dcache_lock); + tmp = dentry_unused.prev; if (tmp == &dentry_unused) break; @@ -394,8 +424,8 @@ static void prune_dcache(int count) continue; } /* If the dentry was recently referenced, don't free it. */ - if (dentry->d_vfs_flags & DCACHE_REFERENCED) { - dentry->d_vfs_flags &= ~DCACHE_REFERENCED; + if (dentry->d_flags & DCACHE_REFERENCED) { + dentry->d_flags &= ~DCACHE_REFERENCED; list_add(&dentry->d_lru, &dentry_unused); dentry_stat.nr_unused++; spin_unlock(&dentry->d_lock); @@ -438,10 +468,7 @@ void shrink_dcache_sb(struct super_block * sb) * superblock to the most recent end of the unused list. */ spin_lock(&dcache_lock); - next = dentry_unused.next; - while (next != &dentry_unused) { - tmp = next; - next = tmp->next; + list_for_each_safe(tmp, next, &dentry_unused) { dentry = list_entry(tmp, struct dentry, d_lru); if (dentry->d_sb != sb) continue; @@ -453,10 +480,7 @@ void shrink_dcache_sb(struct super_block * sb) * Pass two ... free the dentries for this superblock. */ repeat: - next = dentry_unused.next; - while (next != &dentry_unused) { - tmp = next; - next = tmp->next; + list_for_each_safe(tmp, next, &dentry_unused) { dentry = list_entry(tmp, struct dentry, d_lru); if (dentry->d_sb != sb) continue; @@ -468,6 +492,7 @@ repeat: continue; } prune_one_dentry(dentry); + cond_resched_lock(&dcache_lock); goto repeat; } spin_unlock(&dcache_lock); @@ -500,7 +525,7 @@ repeat: resume: while (next != &this_parent->d_subdirs) { struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_child); + struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; /* Have we found a mount point ? */ if (d_mountpoint(dentry)) @@ -514,7 +539,7 @@ resume: * All done at this level ... ascend and resume the search. */ if (this_parent != parent) { - next = this_parent->d_child.next; + next = this_parent->d_u.d_child.next; this_parent = this_parent->d_parent; goto resume; } @@ -531,6 +556,13 @@ positive: * list for prune_dcache(). We descend to the next level * whenever the d_subdirs list is non-empty and continue * searching. + * + * It returns zero iff there are no unused children, + * otherwise it returns the number of children moved to + * the end of the unused list. This may not be the total + * number of unused children, because select_parent can + * drop the lock and return early due to latency + * constraints. */ static int select_parent(struct dentry * parent) { @@ -544,7 +576,7 @@ repeat: resume: while (next != &this_parent->d_subdirs) { struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_child); + struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; if (!list_empty(&dentry->d_lru)) { @@ -560,15 +592,20 @@ resume: dentry_stat.nr_unused++; found++; } + + /* + * We can return to the caller if we have found some (this + * ensures forward progress). We'll be coming back to find + * the rest. + */ + if (found && need_resched()) + goto out; + /* * Descend a level if the d_subdirs list is non-empty. */ if (!list_empty(&dentry->d_subdirs)) { this_parent = dentry; -#ifdef DCACHE_DEBUG -printk(KERN_DEBUG "select_parent: descending to %s/%s, found=%d\n", -dentry->d_parent->d_name.name, dentry->d_name.name, found); -#endif goto repeat; } } @@ -576,14 +613,11 @@ dentry->d_parent->d_name.name, dentry->d_name.name, found); * All done at this level ... ascend and resume the search. */ if (this_parent != parent) { - next = this_parent->d_child.next; + next = this_parent->d_u.d_child.next; this_parent = this_parent->d_parent; -#ifdef DCACHE_DEBUG -printk(KERN_DEBUG "select_parent: ascending to %s/%s, found=%d\n", -this_parent->d_parent->d_name.name, this_parent->d_name.name, found); -#endif goto resume; } +out: spin_unlock(&dcache_lock); return found; } @@ -609,7 +643,7 @@ void shrink_dcache_parent(struct dentry * parent) * * Prune the dentries that are anonymous * - * parsing d_hash list does not read_barrier_depends() as it + * parsing d_hash list does not hlist_for_each_entry_rcu() as it * done under dcache_lock. * */ @@ -624,7 +658,7 @@ void shrink_dcache_anon(struct hlist_head *head) struct dentry *this = hlist_entry(lp, struct dentry, d_hash); if (!list_empty(&this->d_lru)) { dentry_stat.nr_unused--; - list_del(&this->d_lru); + list_del_init(&this->d_lru); } /* @@ -643,30 +677,27 @@ void shrink_dcache_anon(struct hlist_head *head) } /* - * This is called from kswapd when we think we need some more memory. + * Scan `nr' dentries and return the number which remain. + * + * We need to avoid reentering the filesystem if the caller is performing a + * GFP_NOFS allocation attempt. One example deadlock is: + * + * ext2_new_block->getblk->GFP->shrink_dcache_memory->prune_dcache-> + * prune_one_dentry->dput->dentry_iput->iput->inode->i_sb->s_op->put_inode-> + * ext2_discard_prealloc->ext2_free_blocks->lock_super->DEADLOCK. + * + * In this case we return -1 to tell the caller that we baled. */ -static int shrink_dcache_memory(int nr, unsigned int gfp_mask) +static int shrink_dcache_memory(int nr, gfp_t gfp_mask) { if (nr) { - /* - * Nasty deadlock avoidance. - * - * ext2_new_block->getblk->GFP->shrink_dcache_memory-> - * prune_dcache->prune_one_dentry->dput->dentry_iput->iput-> - * inode->i_sb->s_op->put_inode->ext2_discard_prealloc-> - * ext2_free_blocks->lock_super->DEADLOCK. - * - * We should make sure we don't hold the superblock lock over - * block allocations, but for now: - */ - if (gfp_mask & __GFP_FS) - prune_dcache(nr); + if (!(gfp_mask & __GFP_FS)) + return -1; + prune_dcache(nr); } - return dentry_stat.nr_unused; + return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; } -#define NAME_ALLOC_LEN(len) ((len+16) & ~15) - /** * d_alloc - allocate a dcache entry * @parent: parent of entry to allocate @@ -677,52 +708,44 @@ static int shrink_dcache_memory(int nr, unsigned int gfp_mask) * copied and the copy passed in may be reused after this call. */ -struct dentry * d_alloc(struct dentry * parent, const struct qstr *name) +struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) { - char * str; struct dentry *dentry; - struct qstr * qstr; + char *dname; dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); if (!dentry) return NULL; if (name->len > DNAME_INLINE_LEN-1) { - qstr = kmalloc(sizeof(*qstr) + NAME_ALLOC_LEN(name->len), - GFP_KERNEL); - if (!qstr) { + dname = kmalloc(name->len + 1, GFP_KERNEL); + if (!dname) { kmem_cache_free(dentry_cache, dentry); return NULL; } - qstr->name = qstr->name_str; - qstr->len = name->len; - qstr->hash = name->hash; - dentry->d_qstr = qstr; - str = qstr->name_str; } else { - dentry->d_qstr = &dentry->d_name; - str = dentry->d_iname; + dname = dentry->d_iname; } + dentry->d_name.name = dname; - memcpy(str, name->name, name->len); - str[name->len] = 0; + dentry->d_name.len = name->len; + dentry->d_name.hash = name->hash; + memcpy(dname, name->name, name->len); + dname[name->len] = 0; atomic_set(&dentry->d_count, 1); - dentry->d_vfs_flags = DCACHE_UNHASHED; - dentry->d_lock = SPIN_LOCK_UNLOCKED; - dentry->d_flags = 0; + dentry->d_flags = DCACHE_UNHASHED; + spin_lock_init(&dentry->d_lock); dentry->d_inode = NULL; dentry->d_parent = NULL; - dentry->d_move_count = 0; dentry->d_sb = NULL; - dentry->d_name.name = str; - dentry->d_name.len = name->len; - dentry->d_name.hash = name->hash; dentry->d_op = NULL; dentry->d_fsdata = NULL; + dentry->d_extra_attributes = NULL; dentry->d_mounted = 0; +#ifdef CONFIG_PROFILING dentry->d_cookie = NULL; - dentry->d_bucket = NULL; +#endif INIT_HLIST_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_LIST_HEAD(&dentry->d_subdirs); @@ -732,18 +755,28 @@ struct dentry * d_alloc(struct dentry * parent, const struct qstr *name) dentry->d_parent = dget(parent); dentry->d_sb = parent->d_sb; } else { - INIT_LIST_HEAD(&dentry->d_child); + INIT_LIST_HEAD(&dentry->d_u.d_child); } spin_lock(&dcache_lock); if (parent) - list_add(&dentry->d_child, &parent->d_subdirs); + list_add(&dentry->d_u.d_child, &parent->d_subdirs); dentry_stat.nr_dentry++; spin_unlock(&dcache_lock); return dentry; } +struct dentry *d_alloc_name(struct dentry *parent, const char *name) +{ + struct qstr q; + + q.name = name; + q.len = strlen(name); + q.hash = full_name_hash(q.name, q.len); + return d_alloc(parent, &q); +} + /** * d_instantiate - fill in inode information for a dentry * @entry: dentry to complete @@ -761,14 +794,69 @@ struct dentry * d_alloc(struct dentry * parent, const struct qstr *name) void d_instantiate(struct dentry *entry, struct inode * inode) { - if (!list_empty(&entry->d_alias)) BUG(); + BUG_ON(!list_empty(&entry->d_alias)); spin_lock(&dcache_lock); if (inode) list_add(&entry->d_alias, &inode->i_dentry); entry->d_inode = inode; + fsnotify_d_instantiate(entry, inode); + spin_unlock(&dcache_lock); + security_d_instantiate(entry, inode); +} + +/** + * d_instantiate_unique - instantiate a non-aliased dentry + * @entry: dentry to instantiate + * @inode: inode to attach to this dentry + * + * Fill in inode information in the entry. On success, it returns NULL. + * If an unhashed alias of "entry" already exists, then we return the + * aliased dentry instead and drop one reference to inode. + * + * Note that in order to avoid conflicts with rename() etc, the caller + * had better be holding the parent directory semaphore. + * + * This also assumes that the inode count has been incremented + * (or otherwise set) by the caller to indicate that it is now + * in use by the dcache. + */ +struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) +{ + struct dentry *alias; + int len = entry->d_name.len; + const char *name = entry->d_name.name; + unsigned int hash = entry->d_name.hash; + + BUG_ON(!list_empty(&entry->d_alias)); + spin_lock(&dcache_lock); + if (!inode) + goto do_negative; + list_for_each_entry(alias, &inode->i_dentry, d_alias) { + struct qstr *qstr = &alias->d_name; + + if (qstr->hash != hash) + continue; + if (alias->d_parent != entry->d_parent) + continue; + if (qstr->len != len) + continue; + if (memcmp(qstr->name, name, len)) + continue; + dget_locked(alias); + spin_unlock(&dcache_lock); + BUG_ON(!d_unhashed(alias)); + iput(inode); + return alias; + } + list_add(&entry->d_alias, &inode->i_dentry); +do_negative: + entry->d_inode = inode; + fsnotify_d_instantiate(entry, inode); spin_unlock(&dcache_lock); security_d_instantiate(entry, inode); + return NULL; } +EXPORT_SYMBOL(d_instantiate_unique); /** * d_alloc_root - allocate root dentry @@ -784,7 +872,8 @@ struct dentry * d_alloc_root(struct inode * root_inode) struct dentry *res = NULL; if (root_inode) { - static const struct qstr name = { .name = "/", .len = 1, .hash = 0 }; + static const struct qstr name = { .name = "/", .len = 1 }; + res = d_alloc(NULL, &name); if (res) { res->d_sb = root_inode->i_sb; @@ -795,10 +884,11 @@ struct dentry * d_alloc_root(struct inode * root_inode) return res; } -static inline struct hlist_head * d_hash(struct dentry * parent, unsigned long hash) +static inline struct hlist_head *d_hash(struct dentry *parent, + unsigned long hash) { - hash += (unsigned long) parent / L1_CACHE_BYTES; - hash = hash ^ (hash >> D_HASHBITS); + hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES; + hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS); return dentry_hashtable + (hash & D_HASHMASK); } @@ -824,7 +914,7 @@ static inline struct hlist_head * d_hash(struct dentry * parent, unsigned long h struct dentry * d_alloc_anon(struct inode *inode) { - static const struct qstr anonstring = { "", 0, 0}; + static const struct qstr anonstring = { .name = "" }; struct dentry *tmp; struct dentry *res; @@ -840,28 +930,21 @@ struct dentry * d_alloc_anon(struct inode *inode) tmp->d_parent = tmp; /* make sure dput doesn't croak */ spin_lock(&dcache_lock); - if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) { - /* A directory can only have one dentry. - * This (now) has one, so use it. - */ - res = list_entry(inode->i_dentry.next, struct dentry, d_alias); - __dget_locked(res); - } else { + res = __d_find_alias(inode, 0); + if (!res) { /* attach a disconnected dentry */ res = tmp; tmp = NULL; - if (res) { - spin_lock(&res->d_lock); - res->d_sb = inode->i_sb; - res->d_parent = res; - res->d_inode = inode; - res->d_bucket = d_hash(res, res->d_name.hash); - res->d_flags |= DCACHE_DISCONNECTED; - res->d_vfs_flags &= ~DCACHE_UNHASHED; - list_add(&res->d_alias, &inode->i_dentry); - hlist_add_head(&res->d_hash, &inode->i_sb->s_anon); - spin_unlock(&res->d_lock); - } + spin_lock(&res->d_lock); + res->d_sb = inode->i_sb; + res->d_parent = res; + res->d_inode = inode; + res->d_flags |= DCACHE_DISCONNECTED; + res->d_flags &= ~DCACHE_UNHASHED; + list_add(&res->d_alias, &inode->i_dentry); + hlist_add_head(&res->d_hash, &inode->i_sb->s_anon); + spin_unlock(&res->d_lock); + inode = NULL; /* don't drop reference */ } spin_unlock(&dcache_lock); @@ -883,7 +966,7 @@ struct dentry * d_alloc_anon(struct inode *inode) * DCACHE_DISCONNECTED), then d_move that in place of the given dentry * and return it, else simply d_add the inode to the dentry and return NULL. * - * This is (will be) needed in the lookup routine of any filesystem that is exportable + * This is needed in the lookup routine of any filesystem that is exportable * (via knfsd) so that we can build dcache paths to directories effectively. * * If a dentry was found and moved, then it is returned. Otherwise NULL @@ -894,11 +977,12 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) { struct dentry *new = NULL; - if (inode && S_ISDIR(inode->i_mode)) { + if (inode) { spin_lock(&dcache_lock); - if (!list_empty(&inode->i_dentry)) { - new = list_entry(inode->i_dentry.next, struct dentry, d_alias); - __dget_locked(new); + new = __d_find_alias(inode, 1); + if (new) { + BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); + fsnotify_d_instantiate(new, inode); spin_unlock(&dcache_lock); security_d_instantiate(new, inode); d_rehash(dentry); @@ -908,6 +992,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) /* d_instantiate takes dcache_lock, so we do it by hand */ list_add(&dentry->d_alias, &inode->i_dentry); dentry->d_inode = inode; + fsnotify_d_instantiate(dentry, inode); spin_unlock(&dcache_lock); security_d_instantiate(dentry, inode); d_rehash(dentry); @@ -930,8 +1015,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) * * __d_lookup is dcache_lock free. The hash list is protected using RCU. * Memory barriers are used while updating and doing lockless traversal. - * To avoid races with d_move while rename is happening, d_move_count is - * used. + * To avoid races with d_move while rename is happening, d_lock is used. * * Overflows in memcmp(), while d_move, are avoided by keeping the length * and name pointer in one structure pointed by d_qstr. @@ -940,8 +1024,9 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) * lookup is going on. * * dentry_unused list is not updated even if lookup finds the required dentry - * in there. It is updated in places such as prune_dcache, shrink_dcache_sb and - * select_parent. This laziness saves lookup from dcache_lock acquisition. + * in there. It is updated in places such as prune_dcache, shrink_dcache_sb, + * select_parent and __dget_locked. This laziness saves lookup from dcache_lock + * acquisition. * * d_lookup() is protected against the concurrent renames in some unrelated * directory using the seqlockt_t rename_lock. @@ -969,64 +1054,83 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) struct hlist_head *head = d_hash(parent,hash); struct dentry *found = NULL; struct hlist_node *node; + struct dentry *dentry; rcu_read_lock(); - hlist_for_each (node, head) { - struct dentry *dentry; - unsigned long move_count; - struct qstr * qstr; + hlist_for_each_entry_rcu(dentry, node, head, d_hash) { + struct qstr *qstr; - smp_read_barrier_depends(); - dentry = hlist_entry(node, struct dentry, d_hash); + if (dentry->d_name.hash != hash) + continue; + if (dentry->d_parent != parent) + continue; - /* if lookup ends up in a different bucket - * due to concurrent rename, fail it - */ - if (unlikely(dentry->d_bucket != head)) - break; + spin_lock(&dentry->d_lock); /* - * We must take a snapshot of d_move_count followed by - * read memory barrier before any search key comparison + * Recheck the dentry after taking the lock - d_move may have + * changed things. Don't bother checking the hash because we're + * about to compare the whole name anyway. */ - move_count = dentry->d_move_count; - smp_rmb(); - - if (dentry->d_name.hash != hash) - continue; if (dentry->d_parent != parent) - continue; + goto next; - qstr = dentry->d_qstr; - smp_read_barrier_depends(); + /* + * It is safe to compare names since d_move() cannot + * change the qstr (protected by d_lock). + */ + qstr = &dentry->d_name; if (parent->d_op && parent->d_op->d_compare) { if (parent->d_op->d_compare(parent, qstr, name)) - continue; + goto next; } else { if (qstr->len != len) - continue; + goto next; if (memcmp(qstr->name, str, len)) - continue; + goto next; } - spin_lock(&dentry->d_lock); - /* - * If dentry is moved, fail the lookup - */ - if (likely(move_count == dentry->d_move_count)) { - if (!d_unhashed(dentry)) { - atomic_inc(&dentry->d_count); - found = dentry; - } + + if (!d_unhashed(dentry)) { + atomic_inc(&dentry->d_count); + found = dentry; } spin_unlock(&dentry->d_lock); break; +next: + spin_unlock(&dentry->d_lock); } rcu_read_unlock(); return found; } +/** + * d_hash_and_lookup - hash the qstr then search for a dentry + * @dir: Directory to search in + * @name: qstr of name we wish to find + * + * On hash failure or on lookup failure NULL is returned. + */ +struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) +{ + struct dentry *dentry = NULL; + + /* + * Check for a fs-specific hash function. Note that we must + * calculate the standard hash first, as the d_op->d_hash() + * routine may choose to leave the hash value unchanged. + */ + name->hash = full_name_hash(name->name, name->len); + if (dir->d_op && dir->d_op->d_hash) { + if (dir->d_op->d_hash(dir, name) < 0) + goto out; + } + dentry = d_lookup(dir, name); +out: + return dentry; +} + /** * d_validate - verify dentry provided from insecure source * @dentry: The dentry alleged to be valid child of @dparent @@ -1054,7 +1158,7 @@ int d_validate(struct dentry *dentry, struct dentry *dparent) spin_lock(&dcache_lock); base = d_hash(dparent, dentry->d_name.hash); hlist_for_each(lhp,base) { - /* read_barrier_depends() not required for d_hash list + /* hlist_for_each_entry_rcu() not required for d_hash list * as it is parsed under dcache_lock */ if (dentry == hlist_entry(lhp, struct dentry, d_hash)) { @@ -1091,13 +1195,19 @@ out: void d_delete(struct dentry * dentry) { + int isdir = 0; /* * Are we the only user? */ spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); + isdir = S_ISDIR(dentry->d_inode->i_mode); if (atomic_read(&dentry->d_count) == 1) { dentry_iput(dentry); + fsnotify_nameremove(dentry, isdir); + + /* remove this and other inotify debug checks after 2.6.18 */ + dentry->d_flags &= ~DCACHE_INOTIFY_PARENT_WATCHED; return; } @@ -1106,6 +1216,15 @@ void d_delete(struct dentry * dentry) spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); + + fsnotify_nameremove(dentry, isdir); +} + +static void __d_rehash(struct dentry * entry, struct hlist_head *list) +{ + + entry->d_flags &= ~DCACHE_UNHASHED; + hlist_add_head_rcu(&entry->d_hash, list); } /** @@ -1118,10 +1237,11 @@ void d_delete(struct dentry * dentry) void d_rehash(struct dentry * entry) { struct hlist_head *list = d_hash(entry->d_parent, entry->d_name.hash); + spin_lock(&dcache_lock); - entry->d_vfs_flags &= ~DCACHE_UNHASHED; - entry->d_bucket = list; - hlist_add_head_rcu(&entry->d_hash, list); + spin_lock(&entry->d_lock); + __d_rehash(entry, list); + spin_unlock(&entry->d_lock); spin_unlock(&dcache_lock); } @@ -1140,28 +1260,40 @@ void d_rehash(struct dentry * entry) * then no longer matches the actual (corrupted) string of the target. * The hash value has to match the hash queue that the dentry is on.. */ -static inline void switch_names(struct dentry * dentry, struct dentry * target) +static void switch_names(struct dentry *dentry, struct dentry *target) { - const unsigned char *old_name, *new_name; - struct qstr *old_qstr, *new_qstr; - - memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN); - old_qstr = target->d_qstr; - old_name = target->d_name.name; - new_qstr = dentry->d_qstr; - new_name = dentry->d_name.name; - if (old_name == target->d_iname) { - old_name = dentry->d_iname; - old_qstr = &dentry->d_name; - } - if (new_name == dentry->d_iname) { - new_name = target->d_iname; - new_qstr = &target->d_name; + if (dname_external(target)) { + if (dname_external(dentry)) { + /* + * Both external: swap the pointers + */ + do_switch(target->d_name.name, dentry->d_name.name); + } else { + /* + * dentry:internal, target:external. Steal target's + * storage and make target internal. + */ + dentry->d_name.name = target->d_name.name; + target->d_name.name = target->d_iname; + } + } else { + if (dname_external(dentry)) { + /* + * dentry:external, target:internal. Give dentry's + * storage to target and make dentry internal + */ + memcpy(dentry->d_iname, target->d_name.name, + target->d_name.len + 1); + target->d_name.name = dentry->d_name.name; + dentry->d_name.name = dentry->d_iname; + } else { + /* + * Both are internal. Just copy target to dentry + */ + memcpy(dentry->d_iname, target->d_name.name, + target->d_name.len + 1); + } } - target->d_name.name = new_name; - dentry->d_name.name = old_name; - target->d_qstr = new_qstr; - dentry->d_qstr = old_qstr; } /* @@ -1187,6 +1319,8 @@ static inline void switch_names(struct dentry * dentry, struct dentry * target) void d_move(struct dentry * dentry, struct dentry * target) { + struct hlist_head *list; + if (!dentry->d_inode) printk(KERN_WARNING "VFS: moving negative dcache entry\n"); @@ -1204,25 +1338,33 @@ void d_move(struct dentry * dentry, struct dentry * target) } /* Move the dentry to the target hash queue, if on different bucket */ - if (dentry->d_vfs_flags & DCACHE_UNHASHED) + if (dentry->d_flags & DCACHE_UNHASHED) goto already_unhashed; - if (dentry->d_bucket != target->d_bucket) { - hlist_del_rcu(&dentry->d_hash); + + hlist_del_rcu(&dentry->d_hash); + already_unhashed: - dentry->d_bucket = target->d_bucket; - hlist_add_head_rcu(&dentry->d_hash, target->d_bucket); - dentry->d_vfs_flags &= ~DCACHE_UNHASHED; - } + list = d_hash(target->d_parent, target->d_name.hash); + __d_rehash(dentry, list); /* Unhash the target: dput() will then get rid of it */ __d_drop(target); - list_del(&dentry->d_child); - list_del(&target->d_child); + /* flush any possible attributes */ + if (dentry->d_extra_attributes) { + kfree(dentry->d_extra_attributes); + dentry->d_extra_attributes = NULL; + } + if (target->d_extra_attributes) { + kfree(target->d_extra_attributes); + target->d_extra_attributes = NULL; + } + + list_del(&dentry->d_u.d_child); + list_del(&target->d_u.d_child); /* Switch the names.. */ switch_names(dentry, target); - smp_wmb(); do_switch(dentry->d_name.len, target->d_name.len); do_switch(dentry->d_name.hash, target->d_name.hash); @@ -1230,17 +1372,17 @@ already_unhashed: if (IS_ROOT(dentry)) { dentry->d_parent = target->d_parent; target->d_parent = target; - INIT_LIST_HEAD(&target->d_child); + INIT_LIST_HEAD(&target->d_u.d_child); } else { do_switch(dentry->d_parent, target->d_parent); /* And add them back to the (new) parent lists */ - list_add(&target->d_child, &target->d_parent->d_subdirs); + list_add(&target->d_u.d_child, &target->d_parent->d_subdirs); } - list_add(&dentry->d_child, &dentry->d_parent->d_subdirs); - dentry->d_move_count++; + list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); spin_unlock(&target->d_lock); + fsnotify_d_move(dentry); spin_unlock(&dentry->d_lock); write_sequnlock(&rename_lock); spin_unlock(&dcache_lock); @@ -1262,7 +1404,7 @@ already_unhashed: * * "buflen" should be positive. Caller holds the dcache_lock. */ -static char * __d_path( struct dentry *dentry, struct vfsmount *vfsmnt, +char * __d_path( struct dentry *dentry, struct vfsmount *vfsmnt, struct dentry *root, struct vfsmount *rootmnt, char *buffer, int buflen) { @@ -1330,6 +1472,8 @@ Elong: return ERR_PTR(-ENAMETOOLONG); } +EXPORT_SYMBOL_GPL(__d_path); + /* write full pathname into buffer and return start of pathname */ char * d_path(struct dentry *dentry, struct vfsmount *vfsmnt, char *buf, int buflen) @@ -1337,6 +1481,7 @@ char * d_path(struct dentry *dentry, struct vfsmount *vfsmnt, char *res; struct vfsmount *rootmnt; struct dentry *root; + read_lock(¤t->fs->lock); rootmnt = mntget(current->fs->rootmnt); root = dget(current->fs->root); @@ -1439,7 +1584,6 @@ int is_subdir(struct dentry * new_dentry, struct dentry * old_dentry) struct dentry * saved = new_dentry; unsigned long seq; - result = 0; /* need rcu_readlock to protect against the d_parent trashing due to * d_move */ @@ -1447,6 +1591,7 @@ int is_subdir(struct dentry * new_dentry, struct dentry * old_dentry) do { /* for restarting inner loop in case of seq retry */ new_dentry = saved; + result = 0; seq = read_seqbegin(&rename_lock); for (;;) { if (new_dentry != old_dentry) { @@ -1476,7 +1621,7 @@ repeat: resume: while (next != &this_parent->d_subdirs) { struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_child); + struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; if (d_unhashed(dentry)||!dentry->d_inode) continue; @@ -1487,7 +1632,7 @@ resume: atomic_dec(&dentry->d_count); } if (this_parent != root) { - next = this_parent->d_child.next; + next = this_parent->d_u.d_child.next; atomic_dec(&this_parent->d_count); this_parent = this_parent->d_parent; goto resume; @@ -1514,26 +1659,12 @@ ino_t find_inode_number(struct dentry *dir, struct qstr *name) struct dentry * dentry; ino_t ino = 0; - /* - * Check for a fs-specific hash function. Note that we must - * calculate the standard hash first, as the d_op->d_hash() - * routine may choose to leave the hash value unchanged. - */ - name->hash = full_name_hash(name->name, name->len); - if (dir->d_op && dir->d_op->d_hash) - { - if (dir->d_op->d_hash(dir, name) != 0) - goto out; - } - - dentry = d_lookup(dir, name); - if (dentry) - { + dentry = d_hash_and_lookup(dir, name); + if (dentry) { if (dentry->d_inode) ino = dentry->d_inode->i_ino; dput(dentry); } -out: return ino; } @@ -1547,13 +1678,51 @@ static int __init set_dhash_entries(char *str) } __setup("dhash_entries=", set_dhash_entries); -static void __init dcache_init(unsigned long mempages) +static void __init dcache_init_early(void) +{ + int loop; + + /* If hashes are distributed across NUMA nodes, defer + * hash allocation until vmalloc space is available. + */ + if (hashdist) + return; + + dentry_hashtable = + alloc_large_system_hash("Dentry cache", + sizeof(struct hlist_head), + dhash_entries, + 13, + HASH_EARLY, + &d_hash_shift, + &d_hash_mask, + 0); + + for (loop = 0; loop < (1 << d_hash_shift); loop++) + INIT_HLIST_HEAD(&dentry_hashtable[loop]); +} + +void flush_dentry_attributes (void) { - struct hlist_head *d; - unsigned long order; - unsigned int nr_hash; + struct hlist_node *tmp; + struct dentry *dentry; int i; + spin_lock(&dcache_lock); + for (i = 0; i <= d_hash_mask; i++) + hlist_for_each_entry(dentry, tmp, dentry_hashtable+i, d_hash) { + kfree(dentry->d_extra_attributes); + dentry->d_extra_attributes = NULL; + } + spin_unlock(&dcache_lock); +} + +EXPORT_SYMBOL_GPL(flush_dentry_attributes); + +static void __init dcache_init(unsigned long mempages) +{ + int loop; + /* * A constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature @@ -1562,64 +1731,47 @@ static void __init dcache_init(unsigned long mempages) dentry_cache = kmem_cache_create("dentry_cache", sizeof(struct dentry), 0, - SLAB_RECLAIM_ACCOUNT, + (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| + SLAB_MEM_SPREAD), NULL, NULL); - if (!dentry_cache) - panic("Cannot create dentry cache"); set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory); - if (!dhash_entries) - dhash_entries = PAGE_SHIFT < 13 ? - mempages >> (13 - PAGE_SHIFT) : - mempages << (PAGE_SHIFT - 13); - - dhash_entries *= sizeof(struct hlist_head); - for (order = 0; ((1UL << order) << PAGE_SHIFT) < dhash_entries; order++) - ; - - do { - unsigned long tmp; - - nr_hash = (1UL << order) * PAGE_SIZE / - sizeof(struct hlist_head); - d_hash_mask = (nr_hash - 1); - - tmp = nr_hash; - d_hash_shift = 0; - while ((tmp >>= 1UL) != 0UL) - d_hash_shift++; - - dentry_hashtable = (struct hlist_head *) - __get_free_pages(GFP_ATOMIC, order); - } while (dentry_hashtable == NULL && --order >= 0); - - printk(KERN_INFO "Dentry cache hash table entries: %d (order: %ld, %ld bytes)\n", - nr_hash, order, (PAGE_SIZE << order)); - - if (!dentry_hashtable) - panic("Failed to allocate dcache hash table\n"); + /* Hash may have been set up in dcache_init_early */ + if (!hashdist) + return; - d = dentry_hashtable; - i = nr_hash; - do { - INIT_HLIST_HEAD(d); - d++; - i--; - } while (i); + dentry_hashtable = + alloc_large_system_hash("Dentry cache", + sizeof(struct hlist_head), + dhash_entries, + 13, + 0, + &d_hash_shift, + &d_hash_mask, + 0); + + for (loop = 0; loop < (1 << d_hash_shift); loop++) + INIT_HLIST_HEAD(&dentry_hashtable[loop]); } /* SLAB cache for __getname() consumers */ -kmem_cache_t *names_cachep; +kmem_cache_t *names_cachep __read_mostly; /* SLAB cache for file structures */ -kmem_cache_t *filp_cachep; +kmem_cache_t *filp_cachep __read_mostly; EXPORT_SYMBOL(d_genocide); extern void bdev_cache_init(void); extern void chrdev_init(void); +void __init vfs_caches_init_early(void) +{ + dcache_init_early(); + inode_init_early(); +} + void __init vfs_caches_init(unsigned long mempages) { unsigned long reserve; @@ -1627,20 +1779,14 @@ void __init vfs_caches_init(unsigned long mempages) /* Base hash sizes on available memory, with a reserve equal to 150% of current kernel size */ - reserve = (mempages - nr_free_pages()) * 3/2; + reserve = min((mempages - nr_free_pages()) * 3/2, mempages - 1); mempages -= reserve; - names_cachep = kmem_cache_create("names_cache", - PATH_MAX, 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); - if (!names_cachep) - panic("Cannot create names SLAB cache"); + names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); - filp_cachep = kmem_cache_create("filp", - sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN, filp_ctor, filp_dtor); - if(!filp_cachep) - panic("Cannot create filp SLAB cache"); + filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); dcache_init(mempages); inode_init(mempages); @@ -1668,8 +1814,6 @@ EXPORT_SYMBOL(dget_locked); EXPORT_SYMBOL(dput); EXPORT_SYMBOL(find_inode_number); EXPORT_SYMBOL(have_submounts); -EXPORT_SYMBOL(is_subdir); EXPORT_SYMBOL(names_cachep); -EXPORT_SYMBOL(shrink_dcache_anon); EXPORT_SYMBOL(shrink_dcache_parent); EXPORT_SYMBOL(shrink_dcache_sb);