X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=fs%2Fnamespace.c;h=5a8b800854435e135d07cd6f4e292f1dfb4368c6;hb=c7b5ebbddf7bcd3651947760f423e3783bbe6573;hp=9bd40b9ed65c3530519b3d1f15bdc027f5541130;hpb=9213980e6a70d8473e0ffd4b39ab5b6caaba9ff5;p=linux-2.6.git diff --git a/fs/namespace.c b/fs/namespace.c index 9bd40b9ed..5a8b80085 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -62,6 +63,7 @@ struct vfsmount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_child); INIT_LIST_HEAD(&mnt->mnt_mounts); INIT_LIST_HEAD(&mnt->mnt_list); + INIT_LIST_HEAD(&mnt->mnt_fslink); if (name) { int size = strlen(name)+1; char *newname = kmalloc(size, GFP_KERNEL); @@ -108,13 +110,9 @@ struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) EXPORT_SYMBOL(lookup_mnt); -static int check_mnt(struct vfsmount *mnt) +static inline int check_mnt(struct vfsmount *mnt) { - spin_lock(&vfsmount_lock); - while (mnt->mnt_parent != mnt) - mnt = mnt->mnt_parent; - spin_unlock(&vfsmount_lock); - return mnt == current->namespace->root; + return mnt->mnt_namespace == current->namespace; } static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd) @@ -166,6 +164,14 @@ clone_mnt(struct vfsmount *old, struct dentry *root) mnt->mnt_root = dget(root); mnt->mnt_mountpoint = mnt->mnt_root; mnt->mnt_parent = mnt; + mnt->mnt_namespace = old->mnt_namespace; + + /* stick the duplicate mount on the same expiry list + * as the original if that was on one */ + spin_lock(&vfsmount_lock); + if (!list_empty(&old->mnt_fslink)) + list_add(&mnt->mnt_fslink, &old->mnt_fslink); + spin_unlock(&vfsmount_lock); } return mnt; } @@ -226,6 +232,7 @@ static int show_vfsmnt(struct seq_file *m, void *v) { MS_MANDLOCK, ",mand" }, { MS_NOATIME, ",noatime" }, { MS_NODIRATIME, ",nodiratime" }, + { MS_TAGXID, ",tagxid" }, { 0, NULL } }; static struct proc_fs_info mnt_info[] = { @@ -238,6 +245,8 @@ static int show_vfsmnt(struct seq_file *m, void *v) if (vx_flags(VXF_HIDE_MOUNT, 0)) return 0; + if (!vx_check_vfsmount(current->vx_info, mnt)) + return 0; mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); seq_putc(m, ' '); @@ -343,6 +352,7 @@ static inline void __umount_tree(struct vfsmount *mnt, struct list_head *kill) while (!list_empty(kill)) { mnt = list_entry(kill->next, struct vfsmount, mnt_list); list_del_init(&mnt->mnt_list); + list_del_init(&mnt->mnt_fslink); if (mnt->mnt_parent == mnt) { spin_unlock(&vfsmount_lock); } else { @@ -391,6 +401,24 @@ static int do_umount(struct vfsmount *mnt, int flags) if (retval) return retval; + /* + * Allow userspace to request a mountpoint be expired rather than + * unmounting unconditionally. 
Unmount only happens if:
+	 *  (1) the mark is already set (the mark is cleared by mntput())
+	 *  (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
+	 */
+	if (flags & MNT_EXPIRE) {
+		if (mnt == current->fs->rootmnt ||
+		    flags & (MNT_FORCE | MNT_DETACH))
+			return -EINVAL;
+
+		if (atomic_read(&mnt->mnt_count) != 2)
+			return -EBUSY;
+
+		if (!xchg(&mnt->mnt_expiry_mark, 1))
+			return -EAGAIN;
+	}
+
 	/*
 	 * If we may have to abort operations to get out of this
 	 * mount, and they will themselves hold resources we must
@@ -423,7 +451,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
 		down_write(&sb->s_umount);
 		if (!(sb->s_flags & MS_RDONLY)) {
 			lock_kernel();
-			retval = do_remount_sb(sb, MS_RDONLY, 0, 0);
+			retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
 			unlock_kernel();
 		}
 		up_write(&sb->s_umount);
@@ -484,7 +512,7 @@ asmlinkage long sys_umount(char __user * name, int flags)
 
 	retval = do_umount(nd.mnt, flags);
 dput_and_out:
-	path_release(&nd);
+	path_release_on_umount(&nd);
 out:
 	return retval;
 }
@@ -643,6 +671,11 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
 	}
 
 	if (mnt) {
+		/* stop bind mounts from expiring */
+		spin_lock(&vfsmount_lock);
+		list_del_init(&mnt->mnt_fslink);
+		spin_unlock(&vfsmount_lock);
+
 		err = graft_tree(mnt, nd);
 		if (err) {
 			spin_lock(&vfsmount_lock);
@@ -663,12 +696,13 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
  * on it - tough luck.
  */
 
-static int do_remount(struct nameidata *nd,int flags,int mnt_flags,void *data)
+static int do_remount(struct nameidata *nd, int flags, int mnt_flags,
+		      void *data)
 {
 	int err;
 	struct super_block * sb = nd->mnt->mnt_sb;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_SYS_ADMIN) && !vx_ccaps(VXC_SECURE_REMOUNT))
 		return -EPERM;
 
 	if (!check_mnt(nd->mnt))
@@ -677,6 +711,8 @@ static int do_remount(struct nameidata *nd,int flags,int mnt_flags,void *data)
 	if (nd->dentry != nd->mnt->mnt_root)
 		return -EINVAL;
 
+	if (vx_ccaps(VXC_SECURE_REMOUNT))
+		mnt_flags |= MNT_NODEV;
 	down_write(&sb->s_umount);
 	err = do_remount_sb(sb, flags, data, 0);
 	if (!err)
@@ -692,7 +728,7 @@ static int do_move_mount(struct nameidata *nd, char *old_name)
 	struct nameidata old_nd, parent_nd;
 	struct vfsmount *p;
 	int err = 0;
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_SYS_ADMIN) && !vx_ccaps(VXC_SECURE_MOUNT))
 		return -EPERM;
 	if (!old_name || !*old_name)
 		return -EINVAL;
@@ -735,6 +771,10 @@ static int do_move_mount(struct nameidata *nd, char *old_name)
 
 	detach_mnt(old_nd.mnt, &parent_nd);
 	attach_mnt(old_nd.mnt, nd);
+
+	/* if the mount is moved, it should no longer expire
+	 * automatically */
+	list_del_init(&old_nd.mnt->mnt_fslink);
 out2:
 	spin_unlock(&vfsmount_lock);
 out1:
@@ -747,11 +787,14 @@ out:
 	return err;
 }
 
-static int do_add_mount(struct nameidata *nd, char *type, int flags,
+/*
+ * create a new mount for userspace and request it to be added into the
+ * namespace's tree
+ */
+static int do_new_mount(struct nameidata *nd, char *type, int flags,
 			int mnt_flags, char *name, void *data)
 {
 	struct vfsmount *mnt;
-	int err;
 
 	if (!type || !memchr(type, 0, PAGE_SIZE))
 		return -EINVAL;
@@ -761,9 +804,20 @@ static int do_add_mount(struct nameidata *nd, char *type, int flags,
 		return -EPERM;
 
 	mnt = do_kern_mount(type, flags, name, data);
-	err = PTR_ERR(mnt);
 	if (IS_ERR(mnt))
-		goto out;
+		return PTR_ERR(mnt);
+
+	return do_add_mount(mnt, nd, mnt_flags, NULL);
+}
+
+/*
+ * add a mount into a namespace's mount tree
+ * - provide the option of adding the new mount to an expiration list
+ */
+int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
+		 int mnt_flags, struct list_head *fslist)
+{
+	int err;
 
 	down_write(&current->namespace->sem);
 	/* Something was mounted here while we slept */
@@ -775,23 +829,167 @@ static int do_add_mount(struct nameidata *nd, char *type, int flags,
 
 	/* Refuse the same filesystem on the same mount point */
 	err = -EBUSY;
-	if (nd->mnt->mnt_sb == mnt->mnt_sb && nd->mnt->mnt_root == nd->dentry)
+	if (nd->mnt->mnt_sb == newmnt->mnt_sb &&
+	    nd->mnt->mnt_root == nd->dentry)
 		goto unlock;
 
 	err = -EINVAL;
-	if (S_ISLNK(mnt->mnt_root->d_inode->i_mode))
+	if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
 		goto unlock;
 
-	mnt->mnt_flags = mnt_flags;
-	err = graft_tree(mnt, nd);
+	newmnt->mnt_flags = mnt_flags;
+	err = graft_tree(newmnt, nd);
+
+	if (err == 0 && fslist) {
+		/* add to the specified expiration list */
+		spin_lock(&vfsmount_lock);
+		list_add_tail(&newmnt->mnt_fslink, fslist);
+		spin_unlock(&vfsmount_lock);
+	}
+
 unlock:
 	up_write(&current->namespace->sem);
-	mntput(mnt);
-out:
+	mntput(newmnt);
 	return err;
 }
 
-int copy_mount_options (const void __user *data, unsigned long *where)
+EXPORT_SYMBOL_GPL(do_add_mount);
+
+/*
+ * process a list of expirable mountpoints with the intent of discarding any
+ * mountpoints that aren't in use and haven't been touched since last we came
+ * here
+ */
+void mark_mounts_for_expiry(struct list_head *mounts)
+{
+	struct namespace *namespace;
+	struct vfsmount *mnt, *next;
+	LIST_HEAD(graveyard);
+
+	if (list_empty(mounts))
+		return;
+
+	spin_lock(&vfsmount_lock);
+
+	/* extract from the expiration list every vfsmount that matches the
+	 * following criteria:
+	 * - only referenced by its parent vfsmount
+	 * - still marked for expiry (marked on the last call here; marks are
+	 *   cleared by mntput())
+	 */
+	list_for_each_entry_safe(mnt, next, mounts, mnt_fslink) {
+		if (!xchg(&mnt->mnt_expiry_mark, 1) ||
+		    atomic_read(&mnt->mnt_count) != 1)
+			continue;
+
+		mntget(mnt);
+		list_move(&mnt->mnt_fslink, &graveyard);
+	}
+
+	/*
+	 * go through the vfsmounts we've just consigned to the graveyard to
+	 * - check that they're still dead
+	 * - delete the vfsmount from the appropriate namespace under lock
+	 * - dispose of the corpse
+	 */
+	while (!list_empty(&graveyard)) {
+		mnt = list_entry(graveyard.next, struct vfsmount, mnt_fslink);
+		list_del_init(&mnt->mnt_fslink);
+
+		/* don't do anything if the namespace is dead - all the
+		 * vfsmounts from it are going away anyway */
+		namespace = mnt->mnt_namespace;
+		if (!namespace || atomic_read(&namespace->count) <= 0)
+			continue;
+		get_namespace(namespace);
+
+		spin_unlock(&vfsmount_lock);
+		down_write(&namespace->sem);
+		spin_lock(&vfsmount_lock);
+
+		/* check that it is still dead: the count should now be 2 - as
+		 * contributed by the vfsmount parent and the mntget above */
+		if (atomic_read(&mnt->mnt_count) == 2) {
+			struct vfsmount *xdmnt;
+			struct dentry *xdentry;
+
+			/* delete from the namespace */
+			list_del_init(&mnt->mnt_list);
+			list_del_init(&mnt->mnt_child);
+			list_del_init(&mnt->mnt_hash);
+			mnt->mnt_mountpoint->d_mounted--;
+
+			xdentry = mnt->mnt_mountpoint;
+			mnt->mnt_mountpoint = mnt->mnt_root;
+			xdmnt = mnt->mnt_parent;
+			mnt->mnt_parent = mnt;
+
+			spin_unlock(&vfsmount_lock);
+
+			mntput(xdmnt);
+			dput(xdentry);
+
+			/* now lay it to rest if this was the last ref on the
+			 * superblock */
+			if (atomic_read(&mnt->mnt_sb->s_active) == 1) {
+				/* last instance - try to be smart */
+				lock_kernel();
+				DQUOT_OFF(mnt->mnt_sb);
+				acct_auto_close(mnt->mnt_sb);
+				unlock_kernel();
+			}
+
+			mntput(mnt);
+		} else {
+			/* 
someone brought it back to life whilst we didn't + * have any locks held so return it to the expiration + * list */ + list_add_tail(&mnt->mnt_fslink, mounts); + spin_unlock(&vfsmount_lock); + } + + up_write(&namespace->sem); + + mntput(mnt); + put_namespace(namespace); + + spin_lock(&vfsmount_lock); + } + + spin_unlock(&vfsmount_lock); +} + +EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); + +/* + * Some copy_from_user() implementations do not return the exact number of + * bytes remaining to copy on a fault. But copy_mount_options() requires that. + * Note that this function differs from copy_from_user() in that it will oops + * on bad values of `to', rather than returning a short copy. + */ +static long +exact_copy_from_user(void *to, const void __user *from, unsigned long n) +{ + char *t = to; + const char __user *f = from; + char c; + + if (!access_ok(VERIFY_READ, from, n)) + return n; + + while (n) { + if (__get_user(c, f)) { + memset(t, 0, n); + break; + } + *t++ = c; + f++; + n--; + } + return n; +} + +int copy_mount_options(const void __user *data, unsigned long *where) { int i; unsigned long page; @@ -813,7 +1011,7 @@ int copy_mount_options (const void __user *data, unsigned long *where) if (size > PAGE_SIZE) size = PAGE_SIZE; - i = size - copy_from_user((void *)page, data, size); + i = size - exact_copy_from_user((void *)page, data, size); if (!i) { free_page(page); return -EFAULT; @@ -888,7 +1086,7 @@ long do_mount(char * dev_name, char * dir_name, char *type_page, else if (flags & MS_MOVE) retval = do_move_mount(&nd, dev_name); else - retval = do_add_mount(&nd, type_page, flags, mnt_flags, + retval = do_new_mount(&nd, type_page, flags, mnt_flags, dev_name, data_page); dput_out: path_release(&nd); @@ -901,6 +1099,7 @@ int copy_namespace(int flags, struct task_struct *tsk) struct namespace *new_ns; struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL; struct fs_struct *fs = tsk->fs; + struct vfsmount *p, *q; if (!namespace) return 0; @@ -910,7 +1109,7 @@ int copy_namespace(int flags, struct task_struct *tsk) if (!(flags & CLONE_NEWNS)) return 0; - if (!capable(CAP_SYS_ADMIN)) { + if (!capable(CAP_SYS_ADMIN) && !vx_ccaps(VXC_SECURE_MOUNT)) { put_namespace(namespace); return -EPERM; } @@ -935,14 +1134,16 @@ int copy_namespace(int flags, struct task_struct *tsk) list_add_tail(&new_ns->list, &new_ns->root->mnt_list); spin_unlock(&vfsmount_lock); - /* Second pass: switch the tsk->fs->* elements */ - if (fs) { - struct vfsmount *p, *q; - write_lock(&fs->lock); - - p = namespace->root; - q = new_ns->root; - while (p) { + /* + * Second pass: switch the tsk->fs->* elements and mark new vfsmounts + * as belonging to new namespace. We have already acquired a private + * fs_struct, so tsk->fs->lock is not needed. 
+ */ + p = namespace->root; + q = new_ns->root; + while (p) { + q->mnt_namespace = new_ns; + if (fs) { if (p == fs->rootmnt) { rootmnt = p; fs->rootmnt = mntget(q); @@ -955,10 +1156,9 @@ int copy_namespace(int flags, struct task_struct *tsk) altrootmnt = p; fs->altrootmnt = mntget(q); } - p = next_mnt(p, namespace->root); - q = next_mnt(q, new_ns->root); } - write_unlock(&fs->lock); + p = next_mnt(p, namespace->root); + q = next_mnt(q, new_ns->root); } up_write(&tsk->namespace->sem); @@ -1213,6 +1413,7 @@ static void __init init_mount_tree(void) init_rwsem(&namespace->sem); list_add(&mnt->mnt_list, &namespace->list); namespace->root = mnt; + mnt->mnt_namespace = namespace; init_task.namespace = namespace; read_lock(&tasklist_lock); @@ -1280,8 +1481,15 @@ void __init mnt_init(unsigned long mempages) void __put_namespace(struct namespace *namespace) { + struct vfsmount *mnt; + down_write(&namespace->sem); spin_lock(&vfsmount_lock); + + list_for_each_entry(mnt, &namespace->list, mnt_list) { + mnt->mnt_namespace = NULL; + } + umount_tree(namespace->root); spin_unlock(&vfsmount_lock); up_write(&namespace->sem);
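Usage sketch (illustrative only, not part of the diff above): do_add_mount() and mark_mounts_for_expiry() are exported as GPL-only symbols so that an in-kernel filesystem can create automounts that are torn down again once they fall idle. The fragment below shows how such a caller might look; it assumes the two prototypes are made visible through <linux/mount.h>, and every identifier prefixed with example_ is hypothetical.

/* illustrative sketch only -- names prefixed "example_" are made up */
#include <linux/mount.h>	/* do_add_mount(), mark_mounts_for_expiry() */
#include <linux/namei.h>	/* struct nameidata */
#include <linux/list.h>

/* one expiration list shared by every automount this filesystem creates */
static LIST_HEAD(example_automount_list);

/*
 * Graft a freshly built vfsmount (e.g. obtained from do_kern_mount()) onto
 * the mountpoint described by *nd.  Passing the list head as the fslist
 * argument also puts the new mount on our expiration list.
 */
static int example_attach_automount(struct vfsmount *newmnt,
				    struct nameidata *nd)
{
	return do_add_mount(newmnt, nd, MNT_NODEV, &example_automount_list);
}

/*
 * Run periodically (from a timer or workqueue, say).  The first pass sets
 * mnt_expiry_mark on each unused mount in the list; a mount that is still
 * unused and still marked on the next pass gets unmounted, because any
 * intervening mntput() clears the mark again.
 */
static void example_reap_automounts(void)
{
	mark_mounts_for_expiry(&example_automount_list);
}

Note that do_add_mount() always finishes with mntput(newmnt), so the reference passed in is consumed whether or not grafting succeeds; a caller that wants to go on using the vfsmount must take its own reference first.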