fs/namei.c

   1 /*
   2  *  linux/fs/namei.c
   3  *
   4  *  Copyright (C) 1991, 1992  Linus Torvalds
   5  */
   6
   7 /*
   8  * Some corrections by tytso.
   9  */
  10
  11 /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
  12  * lookup logic.
  13  */
  14 /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
  15  */
  16
  17 #include <linux/init.h>
  18 #include <linux/module.h>
  19 #include <linux/slab.h>
  20 #include <linux/fs.h>
  21 #include <linux/namei.h>
  22 #include <linux/quotaops.h>
  23 #include <linux/pagemap.h>
  24 #include <linux/dnotify.h>
  25 #include <linux/smp_lock.h>
  26 #include <linux/personality.h>
  27 #include <linux/security.h>
  28 #include <linux/mount.h>
  29 #include <linux/audit.h>
  30 #include <linux/vs_base.h>
  31
  32 #include <asm/namei.h>
  33 #include <asm/uaccess.h>
  34
  35 #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
     /*
      * ACC_MODE(x) indexes a four-byte string literal with (x) & O_ACCMODE,
      * yielding 0, 4, 2 or 6 for index 0, 1, 2 or 3 respectively.
      * NOTE(review): presumably these values are MAY_READ/MAY_WRITE bit
      * combinations matching the open access mode - confirm against fs.h.
      */
  36
  37 /* [Feb-1997 T. Schoebel-Theuer]
  38  * Fundamental changes in the pathname lookup mechanisms (namei)
  39  * were necessary because of omirr.  The reason is that omirr needs
  40  * to know the _real_ pathname, not the user-supplied one, in case
  41  * of symlinks (and also when transname replacements occur).
  42  *
  43  * The new code replaces the old recursive symlink resolution with
  44  * an iterative one (in case of non-nested symlink chains).  It does
  45  * this with calls to <fs>_follow_link().
  46  * As a side effect, dir_namei(), _namei() and follow_link() are now
  47  * replaced with a single function lookup_dentry() that can handle all
  48  * the special cases of the former code.
  49  *
  50  * With the new dcache, the pathname is stored at each inode, at least as
  51  * long as the refcount of the inode is positive.  As a side effect, the
  52  * size of the dcache depends on the inode cache and thus is dynamic.
  53  *
  54  * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
  55  * resolution to correspond with current state of the code.
  56  *
  57  * Note that the symlink resolution is not *completely* iterative.
  58  * There is still a significant amount of tail- and mid- recursion in
  59  * the algorithm.  Also, note that <fs>_readlink() is not used in
  60  * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
  61  * may return different results than <fs>_follow_link().  Many virtual
  62  * filesystems (including /proc) exhibit this behavior.
  63  */
  64
  65 /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
  66  * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
  67  * and the name already exists in form of a symlink, try to create the new
  68  * name indicated by the symlink. The old code always complained that the
  69  * name already exists, due to not following the symlink even if its target
  70  * is nonexistent.  The new semantics affects also mknod() and link() when
  71  * the name is a symlink pointing to a non-existent name.
  72  *
  73  * I don't know which semantics is the right one, since I have no access
  74  * to standards. But I found by trial that HP-UX 9.0 has the full "new"
  75  * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
  76  * "old" one. Personally, I think the new semantics is much more logical.
  77  * Note that "ln old new" where "new" is a symlink pointing to a non-existing
  78  * file does succeed in both HP-UX and SunOS, but not in Solaris
  79  * and in the old Linux semantics.
  80  */
  81
  82 /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
  83  * semantics.  See the comments in "open_namei" and "do_link" below.
  84  *
  85  * [10-Sep-98 Alan Modra] Another symlink change.
  86  */
  87
  88 /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
  89  *      inside the path - always follow.
  90  *      in the last component in creation/removal/renaming - never follow.
  91  *      if LOOKUP_FOLLOW passed - follow.
  92  *      if the pathname has trailing slashes - follow.
  93  *      otherwise - don't follow.
  94  * (applied in that order).
  95  *
  96  * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
  97  * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
  98  * During the 2.4 we need to fix the userland stuff depending on it -
  99  * hopefully we will be able to get rid of that wart in 2.5. So far only
 100  * XEmacs seems to be relying on it...
 101  */
 102 /*
 103  * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 104  * implemented.  Let's see if raised priority of ->s_vfs_rename_sem gives
 105  * any extra contention...
 106  */
 107
 108 /* In order to reduce some races, while at the same time doing additional
 109  * checking and hopefully speeding things up, we copy filenames to the
 110  * kernel data space before using them..
 111  *
 112  * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 113  * PATH_MAX includes the nul terminator --RR.
 114  */
  115 static inline int do_getname(const char __user *filename, char *page)
  116 {
              /*
               * Copy the user-supplied pathname 'filename' into 'page' with
               * strncpy_from_user(), limited to PATH_MAX bytes (or fewer, see
               * below).
               *
               * Returns 0 on success, -EFAULT when 'filename' lies at or above
               * TASK_SIZE while the current segment is not KERNEL_DS,
               * -ENAMETOOLONG when the copied count reaches the length limit,
               * -ENOENT when the count is zero (empty pathname), and any
               * negative value from strncpy_from_user() unchanged.
               */
  117         int retval;
  118         unsigned long len = PATH_MAX;
  119
  120         if ((unsigned long) filename >= TASK_SIZE) {
  121                 if (!segment_eq(get_fs(), KERNEL_DS))
  122                         return -EFAULT;
  123         } else if (TASK_SIZE - (unsigned long) filename < PATH_MAX)
                      /*
                       * Fewer than PATH_MAX bytes remain between 'filename'
                       * and TASK_SIZE: clamp the copy length to that distance.
                       */
  124                 len = TASK_SIZE - (unsigned long) filename;
  125
  126         retval = strncpy_from_user((char *)page, filename, len);
  127         if (retval > 0) {
                      /*
                       * A count below the limit is accepted as a complete
                       * name; a count that reaches the limit is reported as
                       * too long.
                       */
  128                 if (retval < len)
  129                         return 0;
  130                 return -ENAMETOOLONG;
  131         } else if (!retval)
  132                 retval = -ENOENT;
  133         return retval;
  134 }
 135
  136 char * getname(const char __user * filename)
  137 {
              /*
               * Allocate a name buffer with __getname() and fill it from the
               * user pointer 'filename' via do_getname().
               *
               * Returns the buffer on success.  On failure returns an ERR_PTR():
               * -ENOMEM when __getname() returns NULL, otherwise the error from
               * do_getname() (the buffer is handed back with __putname() first).
               * NOTE(review): the successful buffer is not freed here; the
               * caller presumably releases it - confirm against callers.
               */
  138         char *tmp, *result;
  139
  140         result = ERR_PTR(-ENOMEM);
  141         tmp = __getname();
  142         if (tmp)  {
  143                 int retval = do_getname(filename, tmp);
  144
  145                 result = tmp;
  146                 if (retval < 0) {
  147                         __putname(tmp);
  148                         result = ERR_PTR(retval);
  149                 }
  150         }
              /*
               * Only a valid (non-error, non-NULL) name is passed to
               * audit_getname(), and only when the task has an audit context.
               */
  151         if (unlikely(current->audit_context) && !IS_ERR(result) && result)
  152                 audit_getname(result);
  153         return result;
  154 }
 155
 156 /*
 157  *      vfs_permission()
 158  *
 159  * is used to check for read/write/execute permissions on a file.
 160  * We use "fsuid" for this, letting us set arbitrary permissions
 161  * for filesystem access without changing the "normal" uids which
 162  * are used for other things..
 163  */
  164 int vfs_permission(struct inode * inode, int mask)
  165 {
              /*
               * Generic permission check of 'mask' (MAY_READ, MAY_WRITE and
               * MAY_EXEC bits) against 'inode'.
               *
               * Returns 0 when access is granted.  Returns -EROFS for a write
               * request on a regular file, directory or symlink whose inode is
               * IS_RDONLY().  Returns -EACCES for a barrier inode when
               * vx_check(0, VX_ADMIN) fails, for a write to an immutable inode,
               * and when neither the mode bits nor the capability checks below
               * grant the request.
               */
  166         umode_t                 mode = inode->i_mode;
  167
  168         /* Prevent vservers from escaping chroot() barriers */
  169         if (IS_BARRIER(inode) && !vx_check(0, VX_ADMIN))
  170                 return -EACCES;
  171
  172         if (mask & MAY_WRITE) {
  173                 /*
  174                  * Nobody gets write access to a read-only fs.
  175                  */
  176                 if (IS_RDONLY(inode) &&
  177                     (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
  178                         return -EROFS;
  179
  180                 /*
  181                  * Nobody gets write access to an immutable file.
  182                  */
  183                 if (IS_IMMUTABLE(inode))
  184                         return -EACCES;
  185         }
  186
              /*
               * Shift the owner or group permission triplet into the low three
               * bits; when neither matches, the "other" triplet already sits
               * there.
               */
  187         if (current->fsuid == inode->i_uid)
  188                 mode >>= 6;
  189         else if (in_group_p(inode->i_gid))
  190                 mode >>= 3;
  191
  192         /*
  193          * If the DACs are ok we don't need any capability check.
  194          */
  195         if (((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask))
  196                 return 0;
  197
  198         /*
  199          * Read/write DACs are always overridable.
  200          * Executable DACs are overridable if at least one exec bit is set.
  201          */
              /* Uses inode->i_mode here because 'mode' was shifted above. */
  202         if (!(mask & MAY_EXEC) ||
  203             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
  204                 if (capable(CAP_DAC_OVERRIDE))
  205                         return 0;
  206
  207         /*
  208          * Searching includes executable on directories, else just read.
  209          */
  210         if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
  211                 if (capable(CAP_DAC_READ_SEARCH))
  212                         return 0;
  213
  214         return -EACCES;
  215 }
 216
  217 int permission(struct inode * inode,int mask, struct nameidata *nd)
  218 {
              /*
               * Full permission check for 'mask' on 'inode'; 'nd' may be NULL.
               *
               * Order of checks: a read-only-mount test (only when 'nd' is
               * given), then the inode's own ->permission() method if it has
               * one or vfs_permission() otherwise, then
               * security_inode_permission().  Returns 0 when all pass, -EROFS
               * from the mount test, or the first non-zero result of the later
               * checks.
               */
  219         int retval;
  220         int submask;
  221         umode_t mode = inode->i_mode;
  222
  223         /* Ordinary permission routines do not understand MAY_APPEND. */
  224         submask = mask & ~MAY_APPEND;
  225
              /*
               * Writes to regular files, directories and symlinks are refused
               * when MNT_IS_RDONLY() holds for the mount in 'nd'.
               */
  226         if (nd && (mask & MAY_WRITE) && MNT_IS_RDONLY(nd->mnt) &&
  227                 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
  228                 return -EROFS;
  229
  230         if (inode->i_op && inode->i_op->permission)
  231                 retval = inode->i_op->permission(inode, submask, nd);
  232         else
  233                 retval = vfs_permission(inode, submask);
  234         if (retval)
  235                 return retval;
  236
              /* The security hook receives the original mask, MAY_APPEND included. */
  237         return security_inode_permission(inode, mask, nd);
  238 }
 239
 240 /*
 241  * get_write_access() gets write permission for a file.
 242  * put_write_access() releases this write permission.
 243  * This is used for regular files.
 244  * We cannot support write (and maybe mmap read-write shared) accesses and
 245  * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode
 246  * can have the following values:
 247  * 0: no writers, no VM_DENYWRITE mappings
 248  * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist
 249  * > 0: (i_writecount) users are writing to the file.
 250  *
 251  * Normally we operate on that counter with atomic_{inc,dec} and it's safe
 252  * except for the cases where we don't hold i_writecount yet. Then we need to
 253  * use {get,deny}_write_access() - these functions check the sign and refuse
 254  * to do the change if sign is wrong. Exclusion between them is provided by
 255  * the inode->i_lock spinlock.
 256  */
 257
  258 int get_write_access(struct inode * inode)
  259 {
              /*
               * Register one more writer on 'inode'.
               *
               * Under inode->i_lock: returns -ETXTBSY when i_writecount is
               * negative, otherwise increments i_writecount and returns 0.
               */
  260         spin_lock(&inode->i_lock);
  261         if (atomic_read(&inode->i_writecount) < 0) {
  262                 spin_unlock(&inode->i_lock);
  263                 return -ETXTBSY;
  264         }
  265         atomic_inc(&inode->i_writecount);
  266         spin_unlock(&inode->i_lock);
  267
  268         return 0;
  269 }
 270
  271 int deny_write_access(struct file * file)
  272 {
              /*
               * Counterpart of get_write_access() for the inode behind 'file'.
               *
               * Under inode->i_lock: returns -ETXTBSY when i_writecount is
               * positive, otherwise decrements i_writecount and returns 0.
               */
  273         struct inode *inode = file->f_dentry->d_inode;
  274
  275         spin_lock(&inode->i_lock);
  276         if (atomic_read(&inode->i_writecount) > 0) {
  277                 spin_unlock(&inode->i_lock);
  278                 return -ETXTBSY;
  279         }
  280         atomic_dec(&inode->i_writecount);
  281         spin_unlock(&inode->i_lock);
  282
  283         return 0;
  284 }
 285
  286 void path_release(struct nameidata *nd)
  287 {
              /*
               * Drop the dentry and vfsmount references held in 'nd' by calling
               * dput() and mntput().  The fields of 'nd' are not cleared.
               */
  288         dput(nd->dentry);
  289         mntput(nd->mnt);
  290 }
 291
 292 /*
 293  * umount() mustn't call path_release()/mntput() as that would clear
 294  * mnt_expiry_mark
 295  */
  296 void path_release_on_umount(struct nameidata *nd)
  297 {
              /*
               * Same as path_release() except that the mount reference is
               * dropped with _mntput() instead of mntput().
               */
  298         dput(nd->dentry);
  299         _mntput(nd->mnt);
  300 }
 301
 302 /*
 303  * Internal lookup() using the new generic dcache.
 304  * SMP-safe
 305  */
  306 static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
  307 {
              /*
               * Look 'name' up under 'parent' in the dcache only.
               *
               * Returns the dentry found, or NULL when there is none or when
               * the cached entry failed revalidation and could be invalidated.
               */
  308         struct dentry * dentry = __d_lookup(parent, name);
  309
  310         /* lockless __d_lookup may fail due to concurrent d_move()
  311          * in some unrelated directory, so try with d_lookup
  312          */
  313         if (!dentry)
  314                 dentry = d_lookup(parent, name);
  315
              /*
               * The dentry is dropped only when both d_revalidate() and
               * d_invalidate() return 0; if d_invalidate() returns non-zero the
               * dentry is still handed back to the caller.
               */
  316         if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
  317                 if (!dentry->d_op->d_revalidate(dentry, nd) && !d_invalidate(dentry)) {
  318                         dput(dentry);
  319                         dentry = NULL;
  320                 }
  321         }
  322         return dentry;
  323 }
 324
 325 /*
 326  * Short-cut version of permission(), for calling by
 327  * path_walk(), when dcache lock is held.  Combines parts
 328  * of permission() and vfs_permission(), and tests ONLY for
 329  * MAY_EXEC permission.
 330  *
 331  * If appropriate, check DAC only.  If not appropriate, or
 332  * short-cut DAC fails, then call permission() to do more
 333  * complete permission check.
 334  */
 335 static inline int exec_permission_lite(struct inode *inode,
 336                                        struct nameidata *nd)
 337 {
 338         umode_t mode = inode->i_mode;
 339
 340         if (inode->i_op && inode->i_op->permission)
 341                 return -EAGAIN;
 342
 343         if (current->fsuid == inode->i_uid)
 344                 mode >>= 6;
 345         else if (in_group_p(inode->i_gid))
 346                 mode >>= 3;
 347
 348         if (mode & MAY_EXEC)
 349                 goto ok;
 350
 351         if ((inode->i_mode & S_IXUGO) && capable(CAP_DAC_OVERRIDE))
 352                 goto ok;
 353
 354         if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_OVERRIDE))
 355                 goto ok;
 356
 357         if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_READ_SEARCH))
 358                 goto ok;
 359
 360         return -EACCES;
 361 ok:
 362         return security_inode_permission(inode, MAY_EXEC, nd);
 363 }
 364
 365 /*
 366  * This is called when everything else fails, and we actually have
 367  * to go to the low-level filesystem to find out what we should do..
 368  *
 369  * We get the directory semaphore, and after getting that we also
 370  * make sure that nobody added the entry to the dcache in the meantime..
 371  * SMP-safe
 372  */
  373 static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
  374 {
              /*
               * Look 'name' up under 'parent', asking the filesystem's
               * ->lookup() method when the dcache has no entry.
               *
               * Returns a dentry, or an ERR_PTR(): -ENOMEM when d_alloc()
               * fails, -ENOENT when an entry that appeared in the cache fails
               * revalidation and can be invalidated, or whatever non-NULL
               * value ->lookup() returned.
               */
  375         struct dentry * result;
  376         struct inode *dir = parent->d_inode;
  377
  378         down(&dir->i_sem);
  379         /*
  380          * First re-do the cached lookup just in case it was created
  381          * while we waited for the directory semaphore..
  382          *
  383          * FIXME! This could use version numbering or similar to
  384          * avoid unnecessary cache lookups.
  385          *
  386          * The "dcache_lock" is purely to protect the RCU list walker
  387          * from concurrent renames at this point (we mustn't get false
  388          * negatives from the RCU list walk here, unlike the optimistic
  389          * fast walk).
  390          *
  391          * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
  392          */
  393         result = d_lookup(parent, name);
  394         if (!result) {
  395                 struct dentry * dentry = d_alloc(parent, name);
  396                 result = ERR_PTR(-ENOMEM);
  397                 if (dentry) {
                              /*
                               * A non-NULL return from ->lookup() replaces the
                               * freshly allocated dentry, which is then
                               * dropped; a NULL return means the new dentry
                               * itself is the result.
                               */
  398                         result = dir->i_op->lookup(dir, dentry, nd);
  399                         if (result)
  400                                 dput(dentry);
  401                         else
  402                                 result = dentry;
  403                 }
  404                 up(&dir->i_sem);
  405                 return result;
  406         }
  407
  408         /*
  409          * Uhhuh! Nasty case: the cache was re-populated while
  410          * we waited on the semaphore. Need to revalidate.
  411          */
              /* Revalidation runs after the directory semaphore is released. */
  412         up(&dir->i_sem);
  413         if (result->d_op && result->d_op->d_revalidate) {
  414                 if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) {
  415                         dput(result);
  416                         result = ERR_PTR(-ENOENT);
  417                 }
  418         }
  419         return result;
  420 }
 421
 422 static int __emul_lookup_dentry(const char *, struct nameidata *);
 423
 424 /* SMP-safe */
  425 static inline int
  426 walk_init_root(const char *name, struct nameidata *nd)
  427 {
              /*
               * Point 'nd' at the starting directory for an absolute lookup of
               * 'name'.
               *
               * When the task has an alternate root and LOOKUP_NOALT is not
               * set, the alternate root is tried first through
               * __emul_lookup_dentry(); if that returns non-zero this function
               * returns 0 and the caller has nothing more to walk.  Otherwise
               * 'nd' receives references to the task's root mount and dentry
               * and 1 is returned.
               */
  428         read_lock(&current->fs->lock);
  429         if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) {
  430                 nd->mnt = mntget(current->fs->altrootmnt);
  431                 nd->dentry = dget(current->fs->altroot);
                      /*
                       * The lock is dropped around __emul_lookup_dentry() and
                       * re-taken afterwards for the ordinary root.
                       */
  432                 read_unlock(&current->fs->lock);
  433                 if (__emul_lookup_dentry(name,nd))
  434                         return 0;
  435                 read_lock(&current->fs->lock);
  436         }
  437         nd->mnt = mntget(current->fs->rootmnt);
  438         nd->dentry = dget(current->fs->root);
  439         read_unlock(&current->fs->lock);
  440         return 1;
  441 }
 442
  443 static inline int __vfs_follow_link(struct nameidata *nd, const char *link)
  444 {
              /*
               * Continue the walk in 'nd' along the symlink body 'link'.
               *
               * 'link' may be an ERR_PTR(): 'nd' is then released and the
               * encoded error returned.  Otherwise returns the result of
               * link_path_walk() (or 0 when walk_init_root() reports that the
               * alternate root already resolved the name), or -ENOMEM when the
               * copy of the last component cannot be allocated.
               */
  445         int res = 0;
  446         char *name;
  447         if (IS_ERR(link))
  448                 goto fail;
  449
              /* An absolute link restarts from the root: drop the current position. */
  450         if (*link == '/') {
  451                 path_release(nd);
  452                 if (!walk_init_root(link, nd))
  453                         /* weird __emul_prefix() stuff did it */
  454                         goto out;
  455         }
  456         res = link_path_walk(link, nd);
  457 out:
              /*
               * The copy below is made only for a successful, non-nested walk
               * (nd->depth == 0) whose last component is LAST_NORM.
               */
  458         if (nd->depth || res || nd->last_type!=LAST_NORM)
  459                 return res;
  460         /*
  461          * If it is an iterative symlinks resolution in open_namei() we
  462          * have to copy the last component. And all that crap because of
  463          * bloody create() on broken symlinks. Furrfu...
  464          */
  465         name = __getname();
  466         if (unlikely(!name)) {
  467                 path_release(nd);
  468                 return -ENOMEM;
  469         }
              /*
               * NOTE(review): the new buffer is stored in nd->last.name and is
               * not freed here; the caller presumably releases it - confirm.
               */
  470         strcpy(name, nd->last.name);
  471         nd->last.name = name;
  472         return 0;
  473 fail:
  474         path_release(nd);
  475         return PTR_ERR(link);
  476 }
 477
 478 /*
 479  * This limits recursive symlink follows to 8, while
 480  * limiting consecutive symlinks to 40.
 481  *
 482  * Without that kind of total limit, nasty chains of consecutive
 483  * symlinks can cause almost arbitrarily long lookups.
 484  */
  485 static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd)
  486 {
              /*
               * Follow the symlink 'dentry', continuing the walk held in 'nd'.
               *
               * Returns -ELOOP (after releasing 'nd') when the task's
               * link_count has reached MAX_NESTED_LINKS or its
               * total_link_count has reached 40, the error from
               * security_inode_follow_link() (also after releasing 'nd'), or
               * the result of the ->follow_link() / __vfs_follow_link() calls.
               */
  487         int err = -ELOOP;
  488         if (current->link_count >= MAX_NESTED_LINKS)
  489                 goto loop;
  490         if (current->total_link_count >= 40)
  491                 goto loop;
  492         BUG_ON(nd->depth >= MAX_NESTED_LINKS);
  493         cond_resched();
  494         err = security_inode_follow_link(dentry, nd);
  495         if (err)
  496                 goto loop;
              /*
               * link_count and nd->depth are restored after the link has been
               * followed; total_link_count is only ever incremented here.
               */
  497         current->link_count++;
  498         current->total_link_count++;
  499         nd->depth++;
  500         touch_atime(nd->mnt, dentry);
  501         nd_set_link(nd, NULL);
  502         err = dentry->d_inode->i_op->follow_link(dentry, nd);
  503         if (!err) {
                      /*
                       * If ->follow_link() left a link body in 'nd', walk it
                       * here; a NULL body skips the walk.  ->put_link(), when
                       * provided, is called in either case.
                       */
  504                 char *s = nd_get_link(nd);
  505                 if (s)
  506                         err = __vfs_follow_link(nd, s);
  507                 if (dentry->d_inode->i_op->put_link)
  508                         dentry->d_inode->i_op->put_link(dentry, nd);
  509         }
  510         current->link_count--;
  511         nd->depth--;
  512         return err;
  513 loop:
  514         path_release(nd);
  515         return err;
  516 }
 517
  518 int follow_up(struct vfsmount **mnt, struct dentry **dentry)
  519 {
              /*
               * Step from the mount '*mnt' to its parent mount, replacing
               * '*dentry' with the mountpoint dentry.
               *
               * Returns 0 (leaving both unchanged) when '*mnt' is its own
               * parent, 1 after a successful step.  The new references are
               * taken under vfsmount_lock; the old ones are dropped after the
               * lock is released.
               */
  520         struct vfsmount *parent;
  521         struct dentry *mountpoint;
  522         spin_lock(&vfsmount_lock);
  523         parent=(*mnt)->mnt_parent;
  524         if (parent == *mnt) {
  525                 spin_unlock(&vfsmount_lock);
  526                 return 0;
  527         }
  528         mntget(parent);
  529         mountpoint=dget((*mnt)->mnt_mountpoint);
  530         spin_unlock(&vfsmount_lock);
  531         dput(*dentry);
  532         *dentry = mountpoint;
  533         mntput(*mnt);
  534         *mnt = parent;
  535         return 1;
  536 }
 537
 538 /* no need for dcache_lock, as serialization is taken care in
 539  * namespace.c
 540  */
  541 static int follow_mount(struct vfsmount **mnt, struct dentry **dentry)
  542 {
              /*
               * While '*dentry' is a mountpoint and lookup_mnt() finds a mount
               * on it, move '*mnt' / '*dentry' to that mount and its root
               * dentry, dropping the previous references.
               *
               * Returns 1 when at least one mount was crossed, 0 otherwise.
               */
  543         int res = 0;
  544         while (d_mountpoint(*dentry)) {
  545                 struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
  546                 if (!mounted)
  547                         break;
  548                 mntput(*mnt);
  549                 *mnt = mounted;
  550                 dput(*dentry);
  551                 *dentry = dget(mounted->mnt_root);
  552                 res = 1;
  553         }
  554         return res;
  555 }
 556
 557 /* no need for dcache_lock, as serialization is taken care in
 558  * namespace.c
 559  */
  560 static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry)
  561 {
              /*
               * Single-step variant of follow_mount(): cross at most one mount
               * found by lookup_mnt() on '*mnt' / '*dentry'.
               *
               * Returns 1 when a mount was crossed, 0 when lookup_mnt() found
               * none (both arguments are then left unchanged).
               */
  562         struct vfsmount *mounted;
  563
  564         mounted = lookup_mnt(*mnt, *dentry);
  565         if (mounted) {
  566                 mntput(*mnt);
  567                 *mnt = mounted;
  568                 dput(*dentry);
  569                 *dentry = dget(mounted->mnt_root);
  570                 return 1;
  571         }
  572         return 0;
  573 }
 574
  575 int follow_down(struct vfsmount **mnt, struct dentry **dentry)
  576 {
              /* Non-static wrapper that forwards to __follow_down() unchanged. */
  577         return __follow_down(mnt,dentry);
  578 }
 579
  580 static inline void follow_dotdot(struct vfsmount **mnt, struct dentry **dentry)
  581 {
              /*
               * Resolve ".." from the position '*mnt' / '*dentry', updating
               * both in place.  The position is left unchanged at the task's
               * root and at a mount that is its own parent.  follow_mount() is
               * always applied to the final position.
               */
  582         while(1) {
  583                 struct vfsmount *parent;
  584                 struct dentry *old = *dentry;
  585
                      /* Never go above the task's root directory. */
  586                 read_lock(&current->fs->lock);
  587                 if (*dentry == current->fs->root &&
  588                     *mnt == current->fs->rootmnt) {
  589                         read_unlock(&current->fs->lock);
  590                         break;
  591                 }
  592                 read_unlock(&current->fs->lock);
                      /*
                       * Not at the root of this mount: take the parent dentry
                       * (under dcache_lock) and stop.
                       */
  593                 spin_lock(&dcache_lock);
  594                 if (*dentry != (*mnt)->mnt_root) {
  595                         *dentry = dget((*dentry)->d_parent);
  596                         spin_unlock(&dcache_lock);
  597                         dput(old);
  598                         break;
  599                 }
  600                 spin_unlock(&dcache_lock);
                      /*
                       * At the root of this mount: move to the mountpoint in
                       * the parent mount and loop again to take its "..".
                       */
  601                 spin_lock(&vfsmount_lock);
  602                 parent = (*mnt)->mnt_parent;
  603                 if (parent == *mnt) {
  604                         spin_unlock(&vfsmount_lock);
  605                         break;
  606                 }
  607                 mntget(parent);
  608                 *dentry = dget((*mnt)->mnt_mountpoint);
  609                 spin_unlock(&vfsmount_lock);
  610                 dput(old);
  611                 mntput(*mnt);
  612                 *mnt = parent;
  613         }
  614         follow_mount(mnt, dentry);
  615 }
 616
  617 struct path {
              /*
               * A (mount, dentry) pair; do_lookup() fills one in as its result
               * and link_path_walk() keeps the next component in one.
               */
  618         struct vfsmount *mnt;
  619         struct dentry *dentry;
  620 };
 621
 622 /*
 623  *  It's more convoluted than I'd like it to be, but... it's still fairly
 624  *  small and for now I'd prefer to have fast path as straight as possible.
 625  *  It _is_ time-critical.
 626  */
  627 static int do_lookup(struct nameidata *nd, struct qstr *name,
  628                      struct path *path)
  629 {
              /*
               * Look up the component 'name' under nd->dentry.
               *
               * On success returns 0 with path->mnt set to nd->mnt and
               * path->dentry set to the dentry found.  On failure returns the
               * error encoded in the ERR_PTR() from real_lookup(); 'path' is
               * then left untouched.
               */
  630         struct vfsmount *mnt = nd->mnt;
  631         struct dentry *dentry = __d_lookup(nd->dentry, name);
  632
  633         if (!dentry)
  634                 goto need_lookup;
  635         if (dentry->d_op && dentry->d_op->d_revalidate)
  636                 goto need_revalidate;
  637 done:
  638         path->mnt = mnt;
  639         path->dentry = dentry;
  640         return 0;
  641
  642 need_lookup:
  643         dentry = real_lookup(nd->dentry, name, nd);
  644         if (IS_ERR(dentry))
  645                 goto fail;
  646         goto done;
  647
  648 need_revalidate:
              /*
               * The cached dentry is used when d_revalidate() accepts it, or
               * when d_invalidate() returns non-zero; otherwise it is dropped
               * and the slow path is taken.
               */
  649         if (dentry->d_op->d_revalidate(dentry, nd))
  650                 goto done;
  651         if (d_invalidate(dentry))
  652                 goto done;
  653         dput(dentry);
  654         goto need_lookup;
  655
  656 fail:
  657         return PTR_ERR(dentry);
  658 }
 659
 660 /*
 661  * Name resolution.
 662  *
 663  * This is the basic name resolution function, turning a pathname
 664  * into the final dentry.
 665  *
 666  * We expect 'base' to be positive and a directory.
 667  */
  668 int fastcall link_path_walk(const char * name, struct nameidata *nd)
  669 {
              /*
               * Walk the pathname 'name' component by component, starting at
               * the position held in 'nd' and updating nd->mnt / nd->dentry.
               *
               * Returns 0 on success.  On failure returns a negative errno;
               * every failure that leaves the loop with 'break' releases 'nd'
               * through path_release(), while the 'return_err' exits taken
               * after a failed do_follow_link() do not.
               * NOTE(review): presumably do_follow_link() has already released
               * 'nd' on those paths - confirm for ->follow_link() errors.
               *
               * With LOOKUP_PARENT the walk stops before the last component,
               * which is recorded in nd->last / nd->last_type instead.
               */
  670         struct path next;
  671         struct inode *inode;
  672         int err, atomic;
  673         unsigned int lookup_flags = nd->flags;
  674
  675         atomic = (lookup_flags & LOOKUP_ATOMIC);
  676
  677         while (*name=='/')
  678                 name++;
  679         if (!*name)
  680                 goto return_reval;
  681
  682         inode = nd->dentry->d_inode;
              /*
               * A nested walk (nd->depth != 0) replaces the caller's flags with
               * LOOKUP_FOLLOW alone; 'atomic' was sampled before this.
               */
  683         if (nd->depth)
  684                 lookup_flags = LOOKUP_FOLLOW;
  685
  686         /* At this point we know we have a real path component. */
  687         for(;;) {
  688                 unsigned long hash;
  689                 struct qstr this;
  690                 unsigned int c;
  691
                      /*
                       * MAY_EXEC on the current directory: try the short-cut
                       * first and fall back to permission() on -EAGAIN.
                       */
  692                 err = exec_permission_lite(inode, nd);
  693                 if (err == -EAGAIN) {
  694                         err = permission(inode, MAY_EXEC, nd);
  695                 }
  696                 if (err)
  697                         break;
  698
  699                 this.name = name;
  700                 c = *(const unsigned char *)name;
  701
                      /* Hash the component up to the next '/' or the end of the string. */
  702                 hash = init_name_hash();
  703                 do {
  704                         name++;
  705                         hash = partial_name_hash(c, hash);
  706                         c = *(const unsigned char *)name;
  707                 } while (c && (c != '/'));
  708                 this.len = name - (const char *) this.name;
  709                 this.hash = end_name_hash(hash);
  710
  711                 /* remove trailing slashes? */
  712                 if (!c)
  713                         goto last_component;
  714                 while (*++name == '/');
  715                 if (!*name)
  716                         goto last_with_slashes;
  717
  718                 /*
  719                  * "." and ".." are special - ".." especially so because it has
  720                  * to be able to know about the current root directory and
  721                  * parent relationships.
  722                  */
  723                 if (this.name[0] == '.') switch (this.len) {
  724                         default:
  725                                 break;
  726                         case 2:
  727                                 if (this.name[1] != '.')
  728                                         break;
  729                                 follow_dotdot(&nd->mnt, &nd->dentry);
  730                                 inode = nd->dentry->d_inode;
  731                                 /* fallthrough */
  732                         case 1:
  733                                 continue;
  734                 }
  735                 /*
  736                  * See if the low-level filesystem might want
  737                  * to use its own hash..
  738                  */
  739                 if (nd->dentry->d_op && nd->dentry->d_op->d_hash) {
  740                         err = nd->dentry->d_op->d_hash(nd->dentry, &this);
  741                         if (err < 0)
  742                                 break;
  743                 }
                      /*
                       * NOTE(review): with LOOKUP_ATOMIC set the walk fails
                       * with -EWOULDBLOCKIO here before any lookup, cached or
                       * not, is attempted - confirm this is intended.
                       */
  744                 err = -EWOULDBLOCKIO;
  745                 if (atomic)
  746                         break;
  747                 nd->flags |= LOOKUP_CONTINUE;
  748                 /* This does the actual lookups.. */
  749                 err = do_lookup(nd, &this, &next);
  750                 if (err)
  751                         break;
  752                 /* Check mountpoints.. */
  753                 follow_mount(&next.mnt, &next.dentry);
  754
  755                 err = -ENOENT;
  756                 inode = next.dentry->d_inode;
  757                 if (!inode)
  758                         goto out_dput;
  759                 err = -ENOTDIR;
  760                 if (!inode->i_op)
  761                         goto out_dput;
  762
                      /*
                       * An intermediate component with ->follow_link() is
                       * always followed; 'nd' is then moved by
                       * do_follow_link().  Otherwise 'nd' simply advances to
                       * 'next'.
                       */
  763                 if (inode->i_op->follow_link) {
  764                         mntget(next.mnt);
  765                         err = do_follow_link(next.dentry, nd);
  766                         dput(next.dentry);
  767                         mntput(next.mnt);
  768                         if (err)
  769                                 goto return_err;
  770                         err = -ENOENT;
  771                         inode = nd->dentry->d_inode;
  772                         if (!inode)
  773                                 break;
  774                         err = -ENOTDIR;
  775                         if (!inode->i_op)
  776                                 break;
  777                 } else {
  778                         dput(nd->dentry);
  779                         nd->mnt = next.mnt;
  780                         nd->dentry = next.dentry;
  781                 }
                      /* More components follow, so this one must support ->lookup(). */
  782                 err = -ENOTDIR;
  783                 if (!inode->i_op->lookup)
  784                         break;
  785                 continue;
  786                 /* here ends the main loop */
  787
  788 last_with_slashes:
                      /* Trailing slashes force symlink following and a directory result. */
  789                 lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
  790 last_component:
  791                 nd->flags &= ~LOOKUP_CONTINUE;
  792                 if (lookup_flags & LOOKUP_PARENT)
  793                         goto lookup_parent;
  794                 if (this.name[0] == '.') switch (this.len) {
  795                         default:
  796                                 break;
  797                         case 2:
  798                                 if (this.name[1] != '.')
  799                                         break;
  800                                 follow_dotdot(&nd->mnt, &nd->dentry);
  801                                 inode = nd->dentry->d_inode;
  802                                 /* fallthrough */
  803                         case 1:
  804                                 goto return_reval;
  805                 }
  806                 if (nd->dentry->d_op && nd->dentry->d_op->d_hash) {
  807                         err = nd->dentry->d_op->d_hash(nd->dentry, &this);
  808                         if (err < 0)
  809                                 break;
  810                 }
                      /* Same LOOKUP_ATOMIC bail-out as in the main loop above. */
  811                 err = -EWOULDBLOCKIO;
  812                 if (atomic)
  813                         break;
  814                 err = do_lookup(nd, &this, &next);
  815                 if (err)
  816                         break;
  817                 follow_mount(&next.mnt, &next.dentry);
  818                 inode = next.dentry->d_inode;
                      /*
                       * The last component is followed only when LOOKUP_FOLLOW
                       * is set in lookup_flags.
                       */
  819                 if ((lookup_flags & LOOKUP_FOLLOW)
  820                     && inode && inode->i_op && inode->i_op->follow_link) {
  821                         mntget(next.mnt);
  822                         err = do_follow_link(next.dentry, nd);
  823                         dput(next.dentry);
  824                         mntput(next.mnt);
  825                         if (err)
  826                                 goto return_err;
  827                         inode = nd->dentry->d_inode;
  828                 } else {
  829                         dput(nd->dentry);
  830                         nd->mnt = next.mnt;
  831                         nd->dentry = next.dentry;
  832                 }
  833                 err = -ENOENT;
  834                 if (!inode)
  835                         break;
  836                 if (lookup_flags & LOOKUP_DIRECTORY) {
  837                         err = -ENOTDIR;
  838                         if (!inode->i_op || !inode->i_op->lookup)
  839                                 break;
  840                 }
  841                 goto return_base;
  842 lookup_parent:
                      /*
                       * LOOKUP_PARENT: record the last component and classify
                       * it as LAST_NORM, LAST_DOT or LAST_DOTDOT without
                       * looking it up.
                       */
  843                 nd->last = this;
  844                 nd->last_type = LAST_NORM;
  845                 if (this.name[0] != '.')
  846                         goto return_base;
  847                 if (this.len == 1)
  848                         nd->last_type = LAST_DOT;
  849                 else if (this.len == 2 && this.name[1] == '.')
  850                         nd->last_type = LAST_DOTDOT;
  851                 else
  852                         goto return_base;
  853 return_reval:
  854                 /*
  855                  * We bypassed the ordinary revalidation routines.
  856                  * We may need to check the cached dentry for staleness.
  857                  */
                      /*
                       * NOTE(review): d_op is dereferenced without a NULL
                       * check; presumably every FS_REVAL_DOT filesystem sets
                       * d_op->d_revalidate - confirm.
                       */
  858                 if (nd->dentry && nd->dentry->d_sb &&
  859                     (nd->dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
  860                         err = -ESTALE;
  861                         /* Note: we do not d_invalidate() */
  862                         if (!nd->dentry->d_op->d_revalidate(nd->dentry, nd))
  863                                 break;
  864                 }
  865 return_base:
  866                 return 0;
  867 out_dput:
  868                 dput(next.dentry);
  869                 break;
  870         }
  871         path_release(nd);
  872 return_err:
  873         return err;
  874 }
 875
 876 int fastcall path_walk(const char * name, struct nameidata *nd)
 877 {
 878         current->total_link_count = 0;
 879         return link_path_walk(name, nd);
 880 }
 881
 882 /* SMP-safe */
 883 /* returns 1 if everything is done */
     /*
      * Resolve NAME from the position the caller already stored in nd
      * (path_lookup() seeds it with the alternate root before calling here).
      * Returns 0 only when that first path_walk() fails; path_lookup() then
      * restarts from the normal root on its own.  Every other outcome returns 1
      * with nd describing the result the caller should use.
      */
 884 static int __emul_lookup_dentry(const char *name, struct nameidata *nd)
 885 {
 886         if (path_walk(name, nd))
 887                 return 0;               /* something went wrong... */
 888
 889         if (!nd->dentry->d_inode || S_ISDIR(nd->dentry->d_inode->i_mode)) {
                     /* Save the first result so it can be put back if the retry is no better. */
 890                 struct dentry *old_dentry = nd->dentry;
 891                 struct vfsmount *old_mnt = nd->mnt;
 892                 struct qstr last = nd->last;
 893                 int last_type = nd->last_type;
 894                 /*
 895                  * NAME was not found in alternate root or it's a directory.  Try to find
 896                  * it in the normal root:
 897                  */
 898                 nd->last_type = LAST_ROOT;
 899                 read_lock(&current->fs->lock);
 900                 nd->mnt = mntget(current->fs->rootmnt);
 901                 nd->dentry = dget(current->fs->root);
 902                 read_unlock(&current->fs->lock);
 903                 if (path_walk(name, nd) == 0) {
 904                         if (nd->dentry->d_inode) {
                                     /* The normal root has a real object: keep it, drop the saved result. */
 905                                 dput(old_dentry);
 906                                 mntput(old_mnt);
 907                                 return 1;
 908                         }
                             /* Walk succeeded but the dentry is negative: drop it before restoring. */
 909                         path_release(nd);
 910                 }
                     /*
                      * No release is done here when the second path_walk() failed;
                      * presumably the walk already dropped nd's references on
                      * failure -- confirm against link_path_walk().
                      */
 911                 nd->dentry = old_dentry;
 912                 nd->mnt = old_mnt;
 913                 nd->last = last;
 914                 nd->last_type = last_type;
 915         }
 916         return 1;
 917 }
 918
 919 void set_fs_altroot(void)
 920 {
 921         char *emul = __emul_prefix();
 922         struct nameidata nd;
 923         struct vfsmount *mnt = NULL, *oldmnt;
 924         struct dentry *dentry = NULL, *olddentry;
 925         int err;
 926
 927         if (!emul)
 928                 goto set_it;
 929         err = path_lookup(emul, LOOKUP_FOLLOW|LOOKUP_DIRECTORY|LOOKUP_NOALT, &nd);
 930         if (!err) {
 931                 mnt = nd.mnt;
 932                 dentry = nd.dentry;
 933         }
 934 set_it:
 935         write_lock(&current->fs->lock);
 936         oldmnt = current->fs->altrootmnt;
 937         olddentry = current->fs->altroot;
 938         current->fs->altrootmnt = mnt;
 939         current->fs->altroot = dentry;
 940         write_unlock(&current->fs->lock);
 941         if (olddentry) {
 942                 dput(olddentry);
 943                 mntput(oldmnt);
 944         }
 945 }
 946
 947 int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata *nd)
 948 {
 949         int retval;
 950
 951         nd->last_type = LAST_ROOT; /* if there are only slashes... */
 952         nd->flags = flags;
 953         nd->depth = 0;
 954
 955         read_lock(&current->fs->lock);
 956         if (*name=='/') {
 957                 if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) {
 958                         nd->mnt = mntget(current->fs->altrootmnt);
 959                         nd->dentry = dget(current->fs->altroot);
 960                         read_unlock(&current->fs->lock);
 961                         if (__emul_lookup_dentry(name,nd))
 962                                 return 0;
 963                         read_lock(&current->fs->lock);
 964                 }
 965                 nd->mnt = mntget(current->fs->rootmnt);
 966                 nd->dentry = dget(current->fs->root);
 967         } else {
 968                 nd->mnt = mntget(current->fs->pwdmnt);
 969                 nd->dentry = dget(current->fs->pwd);
 970         }
 971         read_unlock(&current->fs->lock);
 972         current->total_link_count = 0;
 973         retval = link_path_walk(name, nd);
 974         if (unlikely(current->audit_context
 975                      && nd && nd->dentry && nd->dentry->d_inode))
 976                 audit_inode(name,
 977                             nd->dentry->d_inode->i_ino,
 978                             nd->dentry->d_inode->i_rdev);
 979         return retval;
 980 }
 981
 982 /*
 983  * Restricted form of lookup. Doesn't follow links, single-component only,
 984  * needs parent already locked. Doesn't follow mounts.
 985  * SMP-safe.
 986  */
 987 static struct dentry * __lookup_hash(struct qstr *name, struct dentry * base, struct nameidata *nd)
 988 {
 989         struct dentry * dentry;
 990         struct inode *inode;
 991         int err;
 992
 993         inode = base->d_inode;
 994         err = permission(inode, MAY_EXEC, nd);
 995         dentry = ERR_PTR(err);
 996         if (err)
 997                 goto out;
 998
 999         /*
1000          * See if the low-level filesystem might want
1001          * to use its own hash..
1002          */
1003         if (base->d_op && base->d_op->d_hash) {
1004                 err = base->d_op->d_hash(base, name);
1005                 dentry = ERR_PTR(err);
1006                 if (err < 0)
1007                         goto out;
1008         }
1009
1010         dentry = cached_lookup(base, name, nd);
1011         if (!dentry) {
1012                 struct dentry *new = d_alloc(base, name);
1013                 dentry = ERR_PTR(-ENOMEM);
1014                 if (!new)
1015                         goto out;
1016                 dentry = inode->i_op->lookup(inode, new, nd);
1017                 if (!dentry)
1018                         dentry = new;
1019                 else
1020                         dput(new);
1021         }
1022 out:
1023         return dentry;
1024 }
1025
/*
 * lookup_hash - look up the single component NAME under BASE.
 * Forwards to __lookup_hash() with a NULL nameidata and returns whatever it
 * returns (a dentry or an ERR_PTR()).
 */
1026 struct dentry * lookup_hash(struct qstr *name, struct dentry * base)
1027 {
1028         return __lookup_hash(name, base, NULL);
1029 }
1030
1031 /* SMP-safe */
1032 struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
1033 {
1034         unsigned long hash;
1035         struct qstr this;
1036         unsigned int c;
1037
1038         this.name = name;
1039         this.len = len;
1040         if (!len)
1041                 goto access;
1042
1043         hash = init_name_hash();
1044         while (len--) {
1045                 c = *(const unsigned char *)name++;
1046                 if (c == '/' || c == '\0')
1047                         goto access;
1048                 hash = partial_name_hash(c, hash);
1049         }
1050         this.hash = end_name_hash(hash);
1051
1052         return lookup_hash(&this, base);
1053 access:
1054         return ERR_PTR(-EACCES);
1055 }
1056
1057 /*
1058  *      namei()
1059  *
1060  * is used by most simple commands to get the inode of a specified name.
1061  * Open, link etc use their own routines, but this is enough for things
1062  * like 'chmod' etc.
1063  *
1064  * namei exists in two versions: namei/lnamei. The only difference is
1065  * that namei follows links, while lnamei does not.
1066  * SMP-safe
1067  */
1068 int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
1069 {
1070         char *tmp = getname(name);
1071         int err = PTR_ERR(tmp);
1072
1073         if (!IS_ERR(tmp)) {
1074                 err = path_lookup(tmp, flags, nd);
1075                 putname(tmp);
1076         }
1077         return err;
1078 }
1079
1080 /*
1081  * It's inline, so penalty for filesystems that don't use sticky bit is
1082  * minimal.
1083  */
1084 static inline int check_sticky(struct inode *dir, struct inode *inode)
1085 {
1086         if (!(dir->i_mode & S_ISVTX))
1087                 return 0;
1088         if (inode->i_uid == current->fsuid)
1089                 return 0;
1090         if (dir->i_uid == current->fsuid)
1091                 return 0;
1092         return !capable(CAP_FOWNER);
1093 }
1094
1095 /*
1096  *      Check whether we can remove a link victim from directory dir, check
1097  *  whether the type of victim is right.
1098  *  1. We can't do it if dir is read-only (done in permission())
1099  *  2. We should have write and exec permissions on dir
1100  *  3. We can't remove anything from append-only dir
1101  *  4. We can't do anything with immutable dir (done in permission())
1102  *  5. If the sticky bit on dir is set we should either
1103  *      a. be owner of dir, or
1104  *      b. be owner of victim, or
1105  *      c. have CAP_FOWNER capability
1106  *  6. If the victim is append-only or immutable we can't do antyhing with
1107  *     links pointing to it.
1108  *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
1109  *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
1110  *  9. We can't remove a root or mountpoint.
1111  * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
1112  *     nfs_async_unlink().
1113  */
1114 static inline int may_delete(struct inode *dir,struct dentry *victim,int isdir)
1115 {
1116         int error;
1117         if (!victim->d_inode)
1118                 return -ENOENT;
1119         if (victim->d_parent->d_inode != dir)
1120                 BUG();
1121
1122         error = permission(dir,MAY_WRITE | MAY_EXEC, NULL);
1123         if (error)
1124                 return error;
1125         if (IS_APPEND(dir))
1126                 return -EPERM;
1127         if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
1128                 IS_IXORUNLINK(victim->d_inode))
1129                 return -EPERM;
1130         if (isdir) {
1131                 if (!S_ISDIR(victim->d_inode->i_mode))
1132                         return -ENOTDIR;
1133                 if (IS_ROOT(victim))
1134                         return -EBUSY;
1135         } else if (S_ISDIR(victim->d_inode->i_mode))
1136                 return -EISDIR;
1137         if (IS_DEADDIR(dir))
1138                 return -ENOENT;
1139         if (victim->d_flags & DCACHE_NFSFS_RENAMED)
1140                 return -EBUSY;
1141         return 0;
1142 }
1143
1144 /*      Check whether we can create an object with dentry child in directory
1145  *  dir.
1146  *  1. We can't do it if child already exists (open has special treatment for
1147  *     this case, but since we are inlined it's OK)
1148  *  2. We can't do it if dir is read-only (done in permission())
1149  *  3. We should have write and exec permissions on dir
1150  *  4. We can't do it if dir is immutable (done in permission())
1151  */
1152 static inline int may_create(struct inode *dir, struct dentry *child,
1153                              struct nameidata *nd)
1154 {
1155         if (child->d_inode)
1156                 return -EEXIST;
1157         if (IS_DEADDIR(dir))
1158                 return -ENOENT;
1159         return permission(dir,MAY_WRITE | MAY_EXEC, nd);
1160 }
1161
1162 static inline int mnt_may_create(struct vfsmount *mnt, struct inode *dir, struct dentry *child) {
1163        if (child->d_inode)
1164                return -EEXIST;
1165        if (IS_DEADDIR(dir))
1166                return -ENOENT;
1167        if (mnt->mnt_flags & MNT_RDONLY)
1168                return -EROFS;
1169        return 0;
1170 }
1171
1172 static inline int mnt_may_unlink(struct vfsmount *mnt, struct inode *dir, struct dentry *child) {
1173        if (!child->d_inode)
1174                return -ENOENT;
1175        if (mnt->mnt_flags & MNT_RDONLY)
1176                return -EROFS;
1177        return 0;
1178 }
1179
1180 /*
1181  * Special case: O_CREAT|O_EXCL implies O_NOFOLLOW for security
1182  * reasons.
1183  *
1184  * O_DIRECTORY translates into forcing a directory lookup.
1185  */
1186 static inline int lookup_flags(unsigned int f)
1187 {
1188         unsigned long retval = LOOKUP_FOLLOW;
1189
1190         if (f & O_NOFOLLOW)
1191                 retval &= ~LOOKUP_FOLLOW;
1192
1193         if ((f & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
1194                 retval &= ~LOOKUP_FOLLOW;
1195
1196         if (f & O_DIRECTORY)
1197                 retval |= LOOKUP_DIRECTORY;
1198         if (f & O_ATOMICLOOKUP)
1199                 retval |= LOOKUP_ATOMIC;
1200
1201         return retval;
1202 }
1203
1204 /*
1205  * p1 and p2 should be directories on the same fs.
      *
      * Takes the i_sem of both directories.  When p1 == p2 only that single
      * i_sem is taken; otherwise the superblock's s_vfs_rename_sem is taken
      * first.  If one directory turns up among the other's ancestors, the
      * ancestor's i_sem is taken before the descendant's and the dentry on
      * the walked chain whose parent is that ancestor is returned.  In every
      * other case NULL is returned.  unlock_rename() releases what was taken.
1206  */
1207 struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
1208 {
1209         struct dentry *p;
1210
1211         if (p1 == p2) {
1212                 down(&p1->d_inode->i_sem);
1213                 return NULL;
1214         }
1215
1216         down(&p1->d_inode->i_sb->s_vfs_rename_sem);
1217
             /* Walk up from p1 until the dentry that is its own parent: is p2 an ancestor of p1? */
1218         for (p = p1; p->d_parent != p; p = p->d_parent) {
1219                 if (p->d_parent == p2) {
1220                         down(&p2->d_inode->i_sem);
1221                         down(&p1->d_inode->i_sem);
1222                         return p;
1223                 }
1224         }
1225
             /* Same walk from p2: is p1 an ancestor of p2? */
1226         for (p = p2; p->d_parent != p; p = p->d_parent) {
1227                 if (p->d_parent == p1) {
1228                         down(&p1->d_inode->i_sem);
1229                         down(&p2->d_inode->i_sem);
1230                         return p;
1231                 }
1232         }
1233
             /* Neither directory is an ancestor of the other. */
1234         down(&p1->d_inode->i_sem);
1235         down(&p2->d_inode->i_sem);
1236         return NULL;
1237 }
1238
1239 void unlock_rename(struct dentry *p1, struct dentry *p2)
1240 {
1241         up(&p1->d_inode->i_sem);
1242         if (p1 != p2) {
1243                 up(&p2->d_inode->i_sem);
1244                 up(&p1->d_inode->i_sb->s_vfs_rename_sem);
1245         }
1246 }
1247
1248 int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1249                 struct nameidata *nd)
1250 {
1251         int error = may_create(dir, dentry, nd);
1252
1253         if (error)
1254                 return error;
1255
1256         if (!dir->i_op || !dir->i_op->create)
1257                 return -EACCES; /* shouldn't it be ENOSYS? */
1258         mode &= S_IALLUGO;
1259         mode |= S_IFREG;
1260         error = security_inode_create(dir, dentry, mode);
1261         if (error)
1262                 return error;
1263         DQUOT_INIT(dir);
1264         error = dir->i_op->create(dir, dentry, mode, nd);
1265         if (!error) {
1266                 inode_dir_notify(dir, DN_CREATE);
1267                 security_inode_post_create(dir, dentry, mode);
1268         }
1269         return error;
1270 }
1271
/*
 * may_open - final checks (and truncation) before the object in ND is opened.
 * @nd:       lookup result; nd->dentry is the object, nd->mnt its mount
 * @acc_mode: access mask handed to permission()
 * @flag:     open flags (FMODE_WRITE, O_TRUNC, O_APPEND, O_NOATIME are read)
 *
 * Returns 0 when the open may proceed, otherwise a negative errno:
 * -ENOENT for a negative dentry, -ELOOP for a symlink, -EISDIR for a
 * directory opened for writing, the permission() error, -EACCES for a
 * device node on a MNT_NODEV mount, -EROFS for a write on a read-only
 * inode or mount, -EPERM for append-only or O_NOATIME violations, or the
 * error from break_lease(), get_write_access(), locks_verify_locked() or
 * do_truncate().
 */
1272 int may_open(struct nameidata *nd, int acc_mode, int flag)
1273 {
1274         struct dentry *dentry = nd->dentry;
1275         struct inode *inode = dentry->d_inode;
1276         int error;
1277
1278         if (!inode)
1279                 return -ENOENT;
1280
1281         if (S_ISLNK(inode->i_mode))
1282                 return -ELOOP;
1283
1284         if (S_ISDIR(inode->i_mode) && (flag & FMODE_WRITE))
1285                 return -EISDIR;
1286
1287         error = permission(inode, acc_mode, nd);
1288         if (error)
1289                 return error;
1290
1291         /*
1292          * FIFO's, sockets and device files are special: they don't
1293          * actually live on the filesystem itself, and as such you
1294          * can write to them even if the filesystem is read-only.
              * O_TRUNC is dropped for all of them.
              *
              * NOTE(review): nd is dereferenced unconditionally at function
              * entry and in the MNT_NODEV test, so the "nd &&" in the
              * read-only test below cannot be what guards against NULL.
1295          */
1296         if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
1297                 flag &= ~O_TRUNC;
1298         } else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
1299                 if (nd->mnt->mnt_flags & MNT_NODEV)
1300                         return -EACCES;
1301
1302                 flag &= ~O_TRUNC;
1303         } else if ((IS_RDONLY(inode) || (nd && MNT_IS_RDONLY(nd->mnt)))
1304                 && (flag & FMODE_WRITE))
1305                 return -EROFS;
1306         /*
1307          * An append-only file must be opened in append mode for writing.
1308          */
1309         if (IS_APPEND(inode)) {
1310                 if  ((flag & FMODE_WRITE) && !(flag & O_APPEND))
1311                         return -EPERM;
1312                 if (flag & O_TRUNC)
1313                         return -EPERM;
1314         }
1315
1316         /* O_NOATIME can only be set by the owner or superuser */
1317         if (flag & O_NOATIME)
1318                 if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER))
1319                         return -EPERM;
1320
1321         /*
1322          * Ensure there are no outstanding leases on the file.
1323          */
1324         error = break_lease(inode, flag);
1325         if (error)
1326                 return error;
1327
1328         if (flag & O_TRUNC) {
                     /* Write access is held only for the duration of the truncate. */
1329                 error = get_write_access(inode);
1330                 if (error)
1331                         return error;
1332
1333                 /*
1334                  * Refuse to truncate files with mandatory locks held on them.
1335                  */
1336                 error = locks_verify_locked(inode);
1337                 if (!error) {
1338                         DQUOT_INIT(inode);
1339
1340                         error = do_truncate(dentry, 0);
1341                 }
1342                 put_write_access(inode);
1343                 if (error)
1344                         return error;
1345         } else
1346                 if (flag & FMODE_WRITE)
1347                         DQUOT_INIT(inode);
1348
1349         return 0;
1350 }
1351
1352 /*
1353  *      open_namei()
1354  *
1355  * namei for open - this is in fact almost the whole open-routine.
1356  *
1357  * Note that the low bits of "flag" aren't the same as in the open
1358  * system call - they are 00 - no permissions needed
1359  *                        01 - read permission needed
1360  *                        10 - write permission needed
1361  *                        11 - read/write permissions needed
1362  * which is a lot more logical, and also allows the "no perm" needed
1363  * for symlinks (where the permissions are checked later).
1364  * SMP-safe
      *
      * Returns 0 once may_open() has accepted the object left in nd, or a
      * negative errno.  The exit and exit_dput labels call path_release(nd);
      * the plain "return error" statements after path_lookup() and after a
      * failed ->follow_link do not, presumably because the callee has
      * already dropped nd's references on failure -- confirm before
      * changing them.
1365  */
1366 int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
1367 {
1368         int acc_mode, error = 0;
1369         struct dentry *dentry;
1370         struct dentry *dir;
             /* Number of trailing symlinks followed so far; see the -ELOOP check in do_link. */
1371         int count = 0;
1372
1373         acc_mode = ACC_MODE(flag);
1374
1375         /* Allow the LSM permission hook to distinguish append
1376            access from general write access. */
1377         if (flag & O_APPEND)
1378                 acc_mode |= MAY_APPEND;
1379
1380         /* Fill in the open() intent data */
1381         nd->intent.open.flags = flag;
1382         nd->intent.open.create_mode = mode;
1383
1384         /*
1385          * The simplest case - just a plain lookup.
1386          */
1387         if (!(flag & O_CREAT)) {
1388                 error = path_lookup(pathname, lookup_flags(flag)|LOOKUP_OPEN, nd);
1389                 if (error)
1390                         return error;
1391                 goto ok;
1392         }
1393
1394         /*
1395          * Create - we need to know the parent.
1396          */
1397         error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd);
1398         if (error)
1399                 return error;
1400
1401         /*
1402          * We have the parent and last component. First of all, check
1403          * that we are not asked to creat(2) an obvious directory - that
1404          * will not do.
1405          */
1406         error = -EISDIR;
1407         if (nd->last_type != LAST_NORM || nd->last.name[nd->last.len])
1408                 goto exit;
1409
1410         dir = nd->dentry;
1411         nd->flags &= ~LOOKUP_PARENT;
1412         down(&dir->d_inode->i_sem);
1413         dentry = __lookup_hash(&nd->last, nd->dentry, nd);
1414
             /* Entered with dir->d_inode->i_sem held; every branch below releases it. */
1415 do_last:
1416         error = PTR_ERR(dentry);
1417         if (IS_ERR(dentry)) {
1418                 up(&dir->d_inode->i_sem);
1419                 goto exit;
1420         }
1421
1422         /* Negative dentry, just create the file */
1423         if (!dentry->d_inode) {
1424                 if (!IS_POSIXACL(dir->d_inode))
1425                         mode &= ~current->fs->umask;
1426                 error = vfs_create(dir->d_inode, dentry, mode, nd);
1427                 up(&dir->d_inode->i_sem);
                     /* nd now refers to the child instead of the parent directory. */
1428                 dput(nd->dentry);
1429                 nd->dentry = dentry;
1430                 if (error)
1431                         goto exit;
1432                 /* Don't check for write permission, don't truncate */
1433                 acc_mode = 0;
1434                 flag &= ~O_TRUNC;
1435                 goto ok;
1436         }
1437
1438         /*
1439          * It already exists.
1440          */
1441         up(&dir->d_inode->i_sem);
1442
1443         error = -EEXIST;
1444         if (flag & O_EXCL)
1445                 goto exit_dput;
1446
1447         if (d_mountpoint(dentry)) {
1448                 error = -ELOOP;
1449                 if (flag & O_NOFOLLOW)
1450                         goto exit_dput;
                     /* Empty loop body: keep calling __follow_down() while the result is a mountpoint. */
1451                 while (__follow_down(&nd->mnt,&dentry) && d_mountpoint(dentry));
1452         }
1453         error = -ENOENT;
1454         if (!dentry->d_inode)
1455                 goto exit_dput;
1456         if (dentry->d_inode->i_op && dentry->d_inode->i_op->follow_link)
1457                 goto do_link;
1458
1459         dput(nd->dentry);
1460         nd->dentry = dentry;
1461         error = -EISDIR;
1462         if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode))
1463                 goto exit;
1464 ok:
1465         error = may_open(nd, acc_mode, flag);
1466         if (error)
1467                 goto exit;
1468         return 0;
1469
1470 exit_dput:
1471         dput(dentry);
1472 exit:
1473         path_release(nd);
1474         return error;
1475
1476 do_link:
1477         error = -ELOOP;
1478         if (flag & O_NOFOLLOW)
1479                 goto exit_dput;
1480         /*
1481          * This is subtle. Instead of calling do_follow_link() we do the
1482          * thing by hands. The reason is that this way we have zero link_count
1483          * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.
1484          * After that we have the parent and last component, i.e.
1485          * we are in the same situation as after the first path_walk().
1486          * Well, almost - if the last component is normal we get its copy
1487          * stored in nd->last.name and we will have to putname() it when we
1488          * are done. Procfs-like symlinks just set LAST_BIND.
1489          */
1490         nd->flags |= LOOKUP_PARENT;
1491         error = security_inode_follow_link(dentry, nd);
1492         if (error)
1493                 goto exit_dput;
1494         touch_atime(nd->mnt, dentry);
1495         nd_set_link(nd, NULL);
1496         error = dentry->d_inode->i_op->follow_link(dentry, nd);
1497         if (!error) {
1498                 char *s = nd_get_link(nd);
1499                 if (s)
1500                         error = __vfs_follow_link(nd, s);
1501                 if (dentry->d_inode->i_op->put_link)
1502                         dentry->d_inode->i_op->put_link(dentry, nd);
1503         }
1504         dput(dentry);
1505         if (error)
1506                 return error;
1507         nd->flags &= ~LOOKUP_PARENT;
1508         if (nd->last_type == LAST_BIND) {
1509                 dentry = nd->dentry;
1510                 goto ok;
1511         }
1512         error = -EISDIR;
1513         if (nd->last_type != LAST_NORM)
1514                 goto exit;
1515         if (nd->last.name[nd->last.len]) {
1516                 putname(nd->last.name);
1517                 goto exit;
1518         }
1519         error = -ELOOP;
             /* Give up with -ELOOP once the counter has reached 32. */
1520         if (count++==32) {
1521                 putname(nd->last.name);
1522                 goto exit;
1523         }
1524         dir = nd->dentry;
1525         down(&dir->d_inode->i_sem);
1526         dentry = __lookup_hash(&nd->last, nd->dentry, nd);
1527         putname(nd->last.name);
1528         goto do_last;
1529 }
1530
1531 /**
1532  * lookup_create - lookup a dentry, creating it if it doesn't exist
1533  * @nd: nameidata info
1534  * @is_dir: directory flag
1535  *
1536  * Simple function to lookup and return a dentry and create it
1537  * if it doesn't exist.  Is SMP-safe.
1538  */
1539 struct dentry *lookup_create(struct nameidata *nd, int is_dir)
1540 {
1541         struct dentry *dentry;
1542         int error;
1543
1544         down(&nd->dentry->d_inode->i_sem);
1545         error = -EEXIST;
1546         if (nd->last_type != LAST_NORM)
1547                 goto out;
1548         nd->flags &= ~LOOKUP_PARENT;
1549         dentry = lookup_hash(&nd->last, nd->dentry);
1550         if (IS_ERR(dentry))
1551                 goto ret;
1552         error = mnt_may_create(nd->mnt, nd->dentry->d_inode, dentry);
1553         if (error)
1554                 goto fail;
1555         error = -ENOENT;
1556         if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode)
1557                 goto fail;
1558 ret:
1559         return dentry;
1560 fail:
1561         dput(dentry);
1562 out:
1563         return ERR_PTR(error);
1564 }
1565
1566 int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1567 {
1568         int error = may_create(dir, dentry, NULL);
1569
1570         if (error)
1571                 return error;
1572
1573         if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
1574                 return -EPERM;
1575
1576         if (!dir->i_op || !dir->i_op->mknod)
1577                 return -EPERM;
1578
1579         error = security_inode_mknod(dir, dentry, mode, dev);
1580         if (error)
1581                 return error;
1582
1583         DQUOT_INIT(dir);
1584         error = dir->i_op->mknod(dir, dentry, mode, dev);
1585         if (!error) {
1586                 inode_dir_notify(dir, DN_CREATE);
1587                 security_inode_post_mknod(dir, dentry, mode, dev);
1588         }
1589         return error;
1590 }
1591
1592 asmlinkage long sys_mknod(const char __user * filename, int mode, unsigned dev)
1593 {
1594         int error = 0;
1595         char * tmp;
1596         struct dentry * dentry;
1597         struct nameidata nd;
1598
1599         if (S_ISDIR(mode))
1600                 return -EPERM;
1601         tmp = getname(filename);
1602         if (IS_ERR(tmp))
1603                 return PTR_ERR(tmp);
1604
1605         error = path_lookup(tmp, LOOKUP_PARENT, &nd);
1606         if (error)
1607                 goto out;
1608         dentry = lookup_create(&nd, 0);
1609         error = PTR_ERR(dentry);
1610
1611         if (!IS_POSIXACL(nd.dentry->d_inode))
1612                 mode &= ~current->fs->umask;
1613         if (!IS_ERR(dentry)) {
1614                 switch (mode & S_IFMT) {
1615                 case 0: case S_IFREG:
1616                         error = vfs_create(nd.dentry->d_inode,dentry,mode,&nd);
1617                         break;
1618                 case S_IFCHR: case S_IFBLK:
1619                         error = vfs_mknod(nd.dentry->d_inode,dentry,mode,
1620                                         new_decode_dev(dev));
1621                         break;
1622                 case S_IFIFO: case S_IFSOCK:
1623                         error = vfs_mknod(nd.dentry->d_inode,dentry,mode,0);
1624                         break;
1625                 case S_IFDIR:
1626                         error = -EPERM;
1627                         break;
1628                 default:
1629                         error = -EINVAL;
1630                 }
1631                 dput(dentry);
1632         }
1633         up(&nd.dentry->d_inode->i_sem);
1634         path_release(&nd);
1635 out:
1636         putname(tmp);
1637
1638         return error;
1639 }
1640
1641 int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1642 {
1643         int error = may_create(dir, dentry, NULL);
1644
1645         if (error)
1646                 return error;
1647
1648         if (!dir->i_op || !dir->i_op->mkdir)
1649                 return -EPERM;
1650
1651         mode &= (S_IRWXUGO|S_ISVTX);
1652         error = security_inode_mkdir(dir, dentry, mode);
1653         if (error)
1654                 return error;
1655
1656         DQUOT_INIT(dir);
1657         error = dir->i_op->mkdir(dir, dentry, mode);
1658         if (!error) {
1659                 inode_dir_notify(dir, DN_CREATE);
1660                 security_inode_post_mkdir(dir,dentry, mode);
1661         }
1662         return error;
1663 }
1664
1665 asmlinkage long sys_mkdir(const char __user * pathname, int mode)
1666 {
1667         int error = 0;
1668         char * tmp;
1669
1670         tmp = getname(pathname);
1671         error = PTR_ERR(tmp);
1672         if (!IS_ERR(tmp)) {
1673                 struct dentry *dentry;
1674                 struct nameidata nd;
1675
1676                 error = path_lookup(tmp, LOOKUP_PARENT, &nd);
1677                 if (error)
1678                         goto out;
1679                 dentry = lookup_create(&nd, 1);
1680                 error = PTR_ERR(dentry);
1681                 if (!IS_ERR(dentry)) {
1682                         if (!IS_POSIXACL(nd.dentry->d_inode))
1683                                 mode &= ~current->fs->umask;
1684                         error = vfs_mkdir(nd.dentry->d_inode, dentry, mode);
1685                         dput(dentry);
1686                 }
1687                 up(&nd.dentry->d_inode->i_sem);
1688                 path_release(&nd);
1689 out:
1690                 putname(tmp);
1691         }
1692
1693         return error;
1694 }
1695
1696 /*
1697  * We try to drop the dentry early: we should have
1698  * a usage count of 2 if we're the only user of this
1699  * dentry, and if that is true (possibly after pruning
1700  * the dcache), then we drop the dentry now.
1701  *
1702  * A low-level filesystem can, if it choses, legally
1703  * do a
1704  *
1705  *      if (!d_unhashed(dentry))
1706  *              return -EBUSY;
1707  *
1708  * if it cannot handle the case of removing a directory
1709  * that is still in use by something else..
1710  */
1711 static void d_unhash(struct dentry *dentry)
1712 {
1713         dget(dentry);
1714         spin_lock(&dcache_lock);
1715         switch (atomic_read(&dentry->d_count)) {
1716         default:
1717                 spin_unlock(&dcache_lock);
1718                 shrink_dcache_parent(dentry);
1719                 spin_lock(&dcache_lock);
1720                 if (atomic_read(&dentry->d_count) != 2)
1721                         break;
1722         case 2:
1723                 __d_drop(dentry);
1724         }
1725         spin_unlock(&dcache_lock);
1726 }
1727
1728 int vfs_rmdir(struct inode *dir, struct dentry *dentry)
1729 {
1730         int error = may_delete(dir, dentry, 1);
1731
1732         if (error)
1733                 return error;
1734
1735         if (!dir->i_op || !dir->i_op->rmdir)
1736                 return -EPERM;
1737
1738         DQUOT_INIT(dir);
1739
1740         down(&dentry->d_inode->i_sem);
1741         d_unhash(dentry);
1742         if (d_mountpoint(dentry))
1743                 error = -EBUSY;
1744         else {
1745                 error = security_inode_rmdir(dir, dentry);
1746                 if (!error) {
1747                         error = dir->i_op->rmdir(dir, dentry);
1748                         if (!error)
1749                                 dentry->d_inode->i_flags |= S_DEAD;
1750                 }
1751         }
1752         up(&dentry->d_inode->i_sem);
1753         if (!error) {
1754                 inode_dir_notify(dir, DN_DELETE);
1755                 d_delete(dentry);
1756         }
1757         dput(dentry);
1758
1759         return error;
1760 }
1761
1762 asmlinkage long sys_rmdir(const char __user * pathname)
1763 {
1764         int error = 0;
1765         char * name;
1766         struct dentry *dentry;
1767         struct nameidata nd;
1768
1769         name = getname(pathname);
1770         if(IS_ERR(name))
1771                 return PTR_ERR(name);
1772
1773         error = path_lookup(name, LOOKUP_PARENT, &nd);
1774         if (error)
1775                 goto exit;
1776
1777         switch(nd.last_type) {
1778                 case LAST_DOTDOT:
1779                         error = -ENOTEMPTY;
1780                         goto exit1;
1781                 case LAST_DOT:
1782                         error = -EINVAL;
1783                         goto exit1;
1784                 case LAST_ROOT:
1785                         error = -EBUSY;
1786                         goto exit1;
1787         }
1788         down(&nd.dentry->d_inode->i_sem);
1789         dentry = lookup_hash(&nd.last, nd.dentry);
1790         error = PTR_ERR(dentry);
1791         if (!IS_ERR(dentry)) {
1792                 error = mnt_may_unlink(nd.mnt, nd.dentry->d_inode, dentry);
1793                 if (error)
1794                         goto exit2;
1795                 error = vfs_rmdir(nd.dentry->d_inode, dentry);
1796         exit2:
1797                 dput(dentry);
1798         }
1799         up(&nd.dentry->d_inode->i_sem);
1800 exit1:
1801         path_release(&nd);
1802 exit:
1803         putname(name);
1804         return error;
1805 }
1806
1807 int vfs_unlink(struct inode *dir, struct dentry *dentry)
1808 {
1809         int error = may_delete(dir, dentry, 0);
1810
1811         if (error)
1812                 return error;
1813
1814         if (!dir->i_op || !dir->i_op->unlink)
1815                 return -EPERM;
1816
1817         DQUOT_INIT(dir);
1818
1819         down(&dentry->d_inode->i_sem);
1820         if (d_mountpoint(dentry))
1821                 error = -EBUSY;
1822         else {
1823                 error = security_inode_unlink(dir, dentry);
1824                 if (!error)
1825                         error = dir->i_op->unlink(dir, dentry);
1826         }
1827         up(&dentry->d_inode->i_sem);
1828
1829         /* We don't d_delete() NFS sillyrenamed files--they still exist. */
1830         if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
1831                 d_delete(dentry);
1832                 inode_dir_notify(dir, DN_DELETE);
1833         }
1834         return error;
1835 }
1836
1837 /*
1838  * Make sure that the actual truncation of the file will occur outside its
1839  * directory's i_sem.  Truncate can take a long time if there is a lot of
1840  * writeout happening, and we don't want to prevent access to the directory
1841  * while waiting on the I/O.
1842  */
1843 asmlinkage long sys_unlink(const char __user * pathname)
1844 {
1845         int error = 0;
1846         char * name;
1847         struct dentry *dentry;
1848         struct nameidata nd;
1849         struct inode *inode = NULL;
1850
1851         name = getname(pathname);
1852         if(IS_ERR(name))
1853                 return PTR_ERR(name);
1854
1855         error = path_lookup(name, LOOKUP_PARENT, &nd);
1856         if (error)
1857                 goto exit;
1858         error = -EISDIR;
1859         if (nd.last_type != LAST_NORM)
1860                 goto exit1;
1861         down(&nd.dentry->d_inode->i_sem);
1862         dentry = lookup_hash(&nd.last, nd.dentry);
1863         error = PTR_ERR(dentry);
1864         if (!IS_ERR(dentry)) {
1865                 /* Why not before? Because we want correct error value */
1866                 if (nd.last.name[nd.last.len])
1867                         goto slashes;
1868                 error = mnt_may_unlink(nd.mnt, nd.dentry->d_inode, dentry);
1869                 if (error)
1870                         goto exit2;
1871                 inode = dentry->d_inode;
1872                 if (inode)
1873                         atomic_inc(&inode->i_count);
1874                 error = vfs_unlink(nd.dentry->d_inode, dentry);
1875         exit2:
1876                 dput(dentry);
1877         }
1878         up(&nd.dentry->d_inode->i_sem);
1879 exit1:
1880         path_release(&nd);
1881 exit:
1882         putname(name);
1883
1884         if (inode)
1885                 iput(inode);    /* truncate the inode here */
1886         return error;
1887
1888 slashes:
1889         error = !dentry->d_inode ? -ENOENT :
1890                 S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR;
1891         goto exit2;
1892 }
1893
1894 int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, int mode)
1895 {
1896         int error = may_create(dir, dentry, NULL);
1897
1898         if (error)
1899                 return error;
1900
1901         if (!dir->i_op || !dir->i_op->symlink)
1902                 return -EPERM;
1903
1904         error = security_inode_symlink(dir, dentry, oldname);
1905         if (error)
1906                 return error;
1907
1908         DQUOT_INIT(dir);
1909         error = dir->i_op->symlink(dir, dentry, oldname);
1910         if (!error) {
1911                 inode_dir_notify(dir, DN_CREATE);
1912                 security_inode_post_symlink(dir, dentry, oldname);
1913         }
1914         return error;
1915 }
1916
1917 asmlinkage long sys_symlink(const char __user * oldname, const char __user * newname)
1918 {
1919         int error = 0;
1920         char * from;
1921         char * to;
1922
1923         from = getname(oldname);
1924         if(IS_ERR(from))
1925                 return PTR_ERR(from);
1926         to = getname(newname);
1927         error = PTR_ERR(to);
1928         if (!IS_ERR(to)) {
1929                 struct dentry *dentry;
1930                 struct nameidata nd;
1931
1932                 error = path_lookup(to, LOOKUP_PARENT, &nd);
1933                 if (error)
1934                         goto out;
1935                 dentry = lookup_create(&nd, 0);
1936                 error = PTR_ERR(dentry);
1937                 if (!IS_ERR(dentry)) {
1938                         error = vfs_symlink(nd.dentry->d_inode, dentry, from, S_IALLUGO);
1939                         dput(dentry);
1940                 }
1941                 up(&nd.dentry->d_inode->i_sem);
1942                 path_release(&nd);
1943 out:
1944                 putname(to);
1945         }
1946         putname(from);
1947         return error;
1948 }
1949
1950 int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
1951 {
1952         struct inode *inode = old_dentry->d_inode;
1953         int error;
1954
1955         if (!inode)
1956                 return -ENOENT;
1957
1958         error = may_create(dir, new_dentry, NULL);
1959         if (error)
1960                 return error;
1961
1962         if (dir->i_sb != inode->i_sb)
1963                 return -EXDEV;
1964
1965         /*
1966          * A link to an append-only or immutable file cannot be created.
1967          */
1968         if (IS_APPEND(inode) || IS_IXORUNLINK(inode))
1969                 return -EPERM;
1970         if (!dir->i_op || !dir->i_op->link)
1971                 return -EPERM;
1972         if (S_ISDIR(old_dentry->d_inode->i_mode))
1973                 return -EPERM;
1974
1975         error = security_inode_link(old_dentry, dir, new_dentry);
1976         if (error)
1977                 return error;
1978
1979         down(&old_dentry->d_inode->i_sem);
1980         DQUOT_INIT(dir);
1981         error = dir->i_op->link(old_dentry, dir, new_dentry);
1982         up(&old_dentry->d_inode->i_sem);
1983         if (!error) {
1984                 inode_dir_notify(dir, DN_CREATE);
1985                 security_inode_post_link(old_dentry, dir, new_dentry);
1986         }
1987         return error;
1988 }
1989
1990 /*
1991  * Hardlinks are often used in delicate situations.  We avoid
1992  * security-related surprises by not following symlinks on the
1993  * newname.  --KAB
1994  *
1995  * We don't follow them on the oldname either to be compatible
1996  * with linux 2.0, and to avoid hard-linking to directories
1997  * and other special files.  --ADM
1998  */
1999 asmlinkage long sys_link(const char __user * oldname, const char __user * newname)
2000 {
2001         struct dentry *new_dentry;
2002         struct nameidata nd, old_nd;
2003         int error;
2004         char * to;
2005
2006         to = getname(newname);
2007         if (IS_ERR(to))
2008                 return PTR_ERR(to);
2009
2010         error = __user_walk(oldname, 0, &old_nd);
2011         if (error)
2012                 goto exit;
2013         error = path_lookup(to, LOOKUP_PARENT, &nd);
2014         if (error)
2015                 goto out;
2016         /*
2017          * We allow hard-links to be created to a bind-mount as long
2018          * as the bind-mount is not read-only.  Checking for cross-dev
2019          * links is subsumed by the superblock check in vfs_link().
2020          */
2021         error = -EROFS;
2022         if (MNT_IS_RDONLY(old_nd.mnt))
2023                 goto out_release;
2024         new_dentry = lookup_create(&nd, 0);
2025         error = PTR_ERR(new_dentry);
2026         if (!IS_ERR(new_dentry)) {
2027                 error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
2028                 dput(new_dentry);
2029         }
2030         up(&nd.dentry->d_inode->i_sem);
2031 out_release:
2032         path_release(&nd);
2033 out:
2034         path_release(&old_nd);
2035 exit:
2036         putname(to);
2037
2038         return error;
2039 }
2040
2041 /*
2042  * The worst of all namespace operations - renaming directory. "Perverted"
2043  * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
2044  * Problems:
2045  *      a) we can get into loop creation. Check is done in is_subdir().
2046  *      b) race potential - two innocent renames can create a loop together.
2047  *         That's where 4.4 screws up. Current fix: serialization on
2048  *         sb->s_vfs_rename_sem. We might be more accurate, but that's another
2049  *         story.
2050  *      c) we have to lock _three_ objects - parents and victim (if it exists).
2051  *         And that - after we got ->i_sem on parents (until then we don't know
2052  *         whether the target exists).  Solution: try to be smart with locking
2053  *         order for inodes.  We rely on the fact that tree topology may change
2054  *         only under ->s_vfs_rename_sem _and_ that parent of the object we
2055  *         move will be locked.  Thus we can rank directories by the tree
2056  *         (ancestors first) and rank all non-directories after them.
2057  *         That works since everybody except rename does "lock parent, lookup,
2058  *         lock child" and rename is under ->s_vfs_rename_sem.
2059  *         HOWEVER, it relies on the assumption that any object with ->lookup()
2060  *         has no more than 1 dentry.  If "hybrid" objects will ever appear,
2061  *         we'd better make sure that there's no link(2) for them.
2062  *      d) some filesystems don't support opened-but-unlinked directories,
2063  *         either because of layout or because they are not ready to deal with
2064  *         all cases correctly. The latter will be fixed (taking this sort of
2065  *         stuff into VFS), but the former is not going away. Solution: the same
2066  *         trick as in rmdir().
2067  *      e) conversion from fhandle to dentry may come in the wrong moment - when
2068  *         we are removing the target. Solution: we will have to grab ->i_sem
2069  *         in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
2070  *         ->i_sem on parents, which works but leads to some truly excessive
2071  *         locking].
2072  */
2073 int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
2074                struct inode *new_dir, struct dentry *new_dentry)
2075 {
2076         int error = 0;
2077         struct inode *target;
2078
2079         /*
2080          * If we are going to change the parent - check write permissions,
2081          * we'll need to flip '..'.
2082          */
2083         if (new_dir != old_dir) {
2084                 error = permission(old_dentry->d_inode, MAY_WRITE, NULL);
2085                 if (error)
2086                         return error;
2087         }
2088
2089         error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
2090         if (error)
2091                 return error;
2092
2093         target = new_dentry->d_inode;
2094         if (target) {
2095                 down(&target->i_sem);
2096                 d_unhash(new_dentry);
2097         }
2098         if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
2099                 error = -EBUSY;
2100         else
2101                 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2102         if (target) {
2103                 if (!error)
2104                         target->i_flags |= S_DEAD;
2105                 up(&target->i_sem);
2106                 if (d_unhashed(new_dentry))
2107                         d_rehash(new_dentry);
2108                 dput(new_dentry);
2109         }
2110         if (!error) {
2111                 d_move(old_dentry,new_dentry);
2112                 security_inode_post_rename(old_dir, old_dentry,
2113                                            new_dir, new_dentry);
2114         }
2115         return error;
2116 }
2117
2118 int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
2119                struct inode *new_dir, struct dentry *new_dentry)
2120 {
2121         struct inode *target;
2122         int error;
2123
2124         error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
2125         if (error)
2126                 return error;
2127
2128         dget(new_dentry);
2129         target = new_dentry->d_inode;
2130         if (target)
2131                 down(&target->i_sem);
2132         if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
2133                 error = -EBUSY;
2134         else
2135                 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2136         if (!error) {
2137                 /* The following d_move() should become unconditional */
2138                 if (!(old_dir->i_sb->s_type->fs_flags & FS_ODD_RENAME))
2139                         d_move(old_dentry, new_dentry);
2140                 security_inode_post_rename(old_dir, old_dentry, new_dir, new_dentry);
2141         }
2142         if (target)
2143                 up(&target->i_sem);
2144         dput(new_dentry);
2145         return error;
2146 }
2147
2148 int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
2149                struct inode *new_dir, struct dentry *new_dentry)
2150 {
2151         int error;
2152         int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
2153
2154         if (old_dentry->d_inode == new_dentry->d_inode)
2155                 return 0;
2156
2157         error = may_delete(old_dir, old_dentry, is_dir);
2158         if (error)
2159                 return error;
2160
2161         if (!new_dentry->d_inode)
2162                 error = may_create(new_dir, new_dentry, NULL);
2163         else
2164                 error = may_delete(new_dir, new_dentry, is_dir);
2165         if (error)
2166                 return error;
2167
2168         if (!old_dir->i_op || !old_dir->i_op->rename)
2169                 return -EPERM;
2170
2171         DQUOT_INIT(old_dir);
2172         DQUOT_INIT(new_dir);
2173
2174         if (is_dir)
2175                 error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
2176         else
2177                 error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
2178         if (!error) {
2179                 if (old_dir == new_dir)
2180                         inode_dir_notify(old_dir, DN_RENAME);
2181                 else {
2182                         inode_dir_notify(old_dir, DN_DELETE);
2183                         inode_dir_notify(new_dir, DN_CREATE);
2184                 }
2185         }
2186         return error;
2187 }
2188
2189 static inline int do_rename(const char * oldname, const char * newname)
2190 {
2191         int error = 0;
2192         struct dentry * old_dir, * new_dir;
2193         struct dentry * old_dentry, *new_dentry;
2194         struct dentry * trap;
2195         struct nameidata oldnd, newnd;
2196
2197         error = path_lookup(oldname, LOOKUP_PARENT, &oldnd);
2198         if (error)
2199                 goto exit;
2200
2201         error = path_lookup(newname, LOOKUP_PARENT, &newnd);
2202         if (error)
2203                 goto exit1;
2204
2205         error = -EXDEV;
2206         if (oldnd.mnt != newnd.mnt)
2207                 goto exit2;
2208
2209         old_dir = oldnd.dentry;
2210         error = -EBUSY;
2211         if (oldnd.last_type != LAST_NORM)
2212                 goto exit2;
2213
2214         new_dir = newnd.dentry;
2215         if (newnd.last_type != LAST_NORM)
2216                 goto exit2;
2217
2218         trap = lock_rename(new_dir, old_dir);
2219
2220         old_dentry = lookup_hash(&oldnd.last, old_dir);
2221         error = PTR_ERR(old_dentry);
2222         if (IS_ERR(old_dentry))
2223                 goto exit3;
2224         /* source must exist */
2225         error = -ENOENT;
2226         if (!old_dentry->d_inode)
2227                 goto exit4;
2228         /* unless the source is a directory trailing slashes give -ENOTDIR */
2229         if (!S_ISDIR(old_dentry->d_inode->i_mode)) {
2230                 error = -ENOTDIR;
2231                 if (oldnd.last.name[oldnd.last.len])
2232                         goto exit4;
2233                 if (newnd.last.name[newnd.last.len])
2234                         goto exit4;
2235         }
2236         /* source should not be ancestor of target */
2237         error = -EINVAL;
2238         if (old_dentry == trap)
2239                 goto exit4;
2240         error = -EROFS;
2241         if (MNT_IS_RDONLY(newnd.mnt))
2242                 goto exit4;
2243         new_dentry = lookup_hash(&newnd.last, new_dir);
2244         error = PTR_ERR(new_dentry);
2245         if (IS_ERR(new_dentry))
2246                 goto exit4;
2247         /* target should not be an ancestor of source */
2248         error = -ENOTEMPTY;
2249         if (new_dentry == trap)
2250                 goto exit5;
2251
2252         error = vfs_rename(old_dir->d_inode, old_dentry,
2253                                    new_dir->d_inode, new_dentry);
2254 exit5:
2255         dput(new_dentry);
2256 exit4:
2257         dput(old_dentry);
2258 exit3:
2259         unlock_rename(new_dir, old_dir);
2260 exit2:
2261         path_release(&newnd);
2262 exit1:
2263         path_release(&oldnd);
2264 exit:
2265         return error;
2266 }
2267
2268 asmlinkage long sys_rename(const char __user * oldname, const char __user * newname)
2269 {
2270         int error;
2271         char * from;
2272         char * to;
2273
2274         from = getname(oldname);
2275         if(IS_ERR(from))
2276                 return PTR_ERR(from);
2277         to = getname(newname);
2278         error = PTR_ERR(to);
2279         if (!IS_ERR(to)) {
2280                 error = do_rename(from,to);
2281                 putname(to);
2282         }
2283         putname(from);
2284         return error;
2285 }
2286
2287 int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
2288 {
2289         int len;
2290
2291         len = PTR_ERR(link);
2292         if (IS_ERR(link))
2293                 goto out;
2294
2295         len = strlen(link);
2296         if (len > (unsigned) buflen)
2297                 len = buflen;
2298         if (copy_to_user(buffer, link, len))
2299                 len = -EFAULT;
2300 out:
2301         return len;
2302 }
2303
2304 /*
2305  * A helper for ->readlink().  This should be used *ONLY* for symlinks that
2306  * have ->follow_link() touching nd only in nd_set_link().  Using (or not
2307  * using) it for any given inode is up to filesystem.
2308  */
2309 int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
2310 {
2311         struct nameidata nd;
2312         int res;
2313         nd.depth = 0;
2314         res = dentry->d_inode->i_op->follow_link(dentry, &nd);
2315         if (!res) {
2316                 res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
2317                 if (dentry->d_inode->i_op->put_link)
2318                         dentry->d_inode->i_op->put_link(dentry, &nd);
2319         }
2320         return res;
2321 }
2322
/*
 * Exported wrapper around __vfs_follow_link(); passes both arguments
 * through and returns its result unchanged.
 */
int vfs_follow_link(struct nameidata *nd, const char *link)
{
        int res;

        res = __vfs_follow_link(nd, link);
        return res;
}
2327
2328 /* get the link contents into pagecache */
2329 static char *page_getlink(struct dentry * dentry, struct page **ppage)
2330 {
2331         struct page * page;
2332         struct address_space *mapping = dentry->d_inode->i_mapping;
2333         page = read_cache_page(mapping, 0, (filler_t *)mapping->a_ops->readpage,
2334                                 NULL);
2335         if (IS_ERR(page))
2336                 goto sync_fail;
2337         wait_on_page_locked(page);
2338         if (!PageUptodate(page))
2339                 goto async_fail;
2340         *ppage = page;
2341         return kmap(page);
2342
2343 async_fail:
2344         page_cache_release(page);
2345         return ERR_PTR(-EIO);
2346
2347 sync_fail:
2348         return (char*)page;
2349 }
2350
2351 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
2352 {
2353         struct page *page = NULL;
2354         char *s = page_getlink(dentry, &page);
2355         int res = vfs_readlink(dentry,buffer,buflen,s);
2356         if (page) {
2357                 kunmap(page);
2358                 page_cache_release(page);
2359         }
2360         return res;
2361 }
2362
/*
 * ->follow_link helper: store the result of page_getlink() (the mapped
 * link body, or an ERR_PTR) in @nd via nd_set_link().  The page pointer
 * itself is not kept here; page_put_link() looks the page up again.
 *
 * Always returns 0.
 */
int page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
{
        struct page *pg;
        char *body;

        body = page_getlink(dentry, &pg);
        nd_set_link(nd, body);
        return 0;
}
2369
2370 void page_put_link(struct dentry *dentry, struct nameidata *nd)
2371 {
2372         if (!IS_ERR(nd_get_link(nd))) {
2373                 struct page *page;
2374                 page = find_get_page(dentry->d_inode->i_mapping, 0);
2375                 if (!page)
2376                         BUG();
2377                 kunmap(page);
2378                 page_cache_release(page);
2379                 page_cache_release(page);
2380         }
2381 }
2382
2383 int page_follow_link(struct dentry *dentry, struct nameidata *nd)
2384 {
2385         struct page *page = NULL;
2386         char *s = page_getlink(dentry, &page);
2387         int res = __vfs_follow_link(nd, s);
2388         if (page) {
2389                 kunmap(page);
2390                 page_cache_release(page);
2391         }
2392         return res;
2393 }
2394
2395 int page_symlink(struct inode *inode, const char *symname, int len)
2396 {
2397         struct address_space *mapping = inode->i_mapping;
2398         struct page *page = grab_cache_page(mapping, 0);
2399         int err = -ENOMEM;
2400         char *kaddr;
2401
2402         if (!page)
2403                 goto fail;
2404         err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2405         if (err)
2406                 goto fail_map;
2407         kaddr = kmap_atomic(page, KM_USER0);
2408         memcpy(kaddr, symname, len-1);
2409         kunmap_atomic(kaddr, KM_USER0);
2410         mapping->a_ops->commit_write(NULL, page, 0, len-1);
2411         /*
2412          * Notice that we are _not_ going to block here - end of page is
2413          * unmapped, so this will only try to map the rest of page, see
2414          * that it is unmapped (typically even will not look into inode -
2415          * ->i_size will be enough for everything) and zero it out.
2416          * OTOH it's obviously correct and should make the page up-to-date.
2417          */
2418         if (!PageUptodate(page)) {
2419                 err = mapping->a_ops->readpage(NULL, page);
2420                 wait_on_page_locked(page);
2421         } else {
2422                 unlock_page(page);
2423         }
2424         page_cache_release(page);
2425         if (err < 0)
2426                 goto fail;
2427         mark_inode_dirty(inode);
2428         return 0;
2429 fail_map:
2430         unlock_page(page);
2431         page_cache_release(page);
2432 fail:
2433         return err;
2434 }
2435
2436 struct inode_operations page_symlink_inode_operations = {
2437         .readlink       = generic_readlink,
2438         .follow_link    = page_follow_link_light,
2439         .put_link       = page_put_link,
2440 };
2441
/*
 * Symbols exported from this file.  The grouping below is by name and
 * apparent role, for readability only.
 */

/* Name handling, lookup and path walking */
EXPORT_SYMBOL(__user_walk);
EXPORT_SYMBOL(getname);
EXPORT_SYMBOL(lookup_create);
EXPORT_SYMBOL(lookup_hash);
EXPORT_SYMBOL(lookup_one_len);
EXPORT_SYMBOL(path_lookup);
EXPORT_SYMBOL(path_release);
EXPORT_SYMBOL(path_walk);
EXPORT_SYMBOL(follow_down);
EXPORT_SYMBOL(follow_up);

/* Access checks */
EXPORT_SYMBOL(permission);
EXPORT_SYMBOL(vfs_permission);
EXPORT_SYMBOL(get_write_access); /* binfmt_aout */

/* Rename locking */
EXPORT_SYMBOL(lock_rename);
EXPORT_SYMBOL(unlock_rename);

/* Directory-entry operations */
EXPORT_SYMBOL(vfs_create);
EXPORT_SYMBOL(vfs_link);
EXPORT_SYMBOL(vfs_mkdir);
EXPORT_SYMBOL(vfs_mknod);
EXPORT_SYMBOL(vfs_rename);
EXPORT_SYMBOL(vfs_rmdir);
EXPORT_SYMBOL(vfs_symlink);
EXPORT_SYMBOL(vfs_unlink);

/* Symlink helpers */
EXPORT_SYMBOL(generic_readlink);
EXPORT_SYMBOL(vfs_readlink);
EXPORT_SYMBOL(vfs_follow_link);
EXPORT_SYMBOL(page_follow_link);
EXPORT_SYMBOL(page_follow_link_light);
EXPORT_SYMBOL(page_put_link);
EXPORT_SYMBOL(page_readlink);
EXPORT_SYMBOL(page_symlink);
EXPORT_SYMBOL(page_symlink_inode_operations);