fs/namei.c

   1 /*
   2  *  linux/fs/namei.c
   3  *
   4  *  Copyright (C) 1991, 1992  Linus Torvalds
   5  */
   6
   7 /*
   8  * Some corrections by tytso.
   9  */
  10
  11 /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
  12  * lookup logic.
  13  */
  14 /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
  15  */
  16
  17 #include <linux/init.h>
  18 #include <linux/module.h>
  19 #include <linux/slab.h>
  20 #include <linux/fs.h>
  21 #include <linux/namei.h>
  22 #include <linux/quotaops.h>
  23 #include <linux/pagemap.h>
  24 #include <linux/fsnotify.h>
  25 #include <linux/smp_lock.h>
  26 #include <linux/personality.h>
  27 #include <linux/security.h>
  28 #include <linux/syscalls.h>
  29 #include <linux/mount.h>
  30 #include <linux/audit.h>
  31 #include <linux/capability.h>
  32 #include <linux/file.h>
  33 #include <linux/fcntl.h>
  34 #include <linux/namei.h>
  35 #include <linux/proc_fs.h>
  36 #include <linux/vserver/inode.h>
  37 #include <linux/vserver/debug.h>
  38 #include <asm/namei.h>
  39 #include <asm/uaccess.h>
  40
  41 #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
  42
  43 /* [Feb-1997 T. Schoebel-Theuer]
  44  * Fundamental changes in the pathname lookup mechanisms (namei)
  45  * were necessary because of omirr.  The reason is that omirr needs
  46  * to know the _real_ pathname, not the user-supplied one, in case
  47  * of symlinks (and also when transname replacements occur).
  48  *
  49  * The new code replaces the old recursive symlink resolution with
  50  * an iterative one (in case of non-nested symlink chains).  It does
  51  * this with calls to <fs>_follow_link().
  52  * As a side effect, dir_namei(), _namei() and follow_link() are now
  53  * replaced with a single function lookup_dentry() that can handle all
  54  * the special cases of the former code.
  55  *
  56  * With the new dcache, the pathname is stored at each inode, at least as
  57  * long as the refcount of the inode is positive.  As a side effect, the
  58  * size of the dcache depends on the inode cache and thus is dynamic.
  59  *
  60  * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
  61  * resolution to correspond with current state of the code.
  62  *
  63  * Note that the symlink resolution is not *completely* iterative.
  64  * There is still a significant amount of tail- and mid- recursion in
  65  * the algorithm.  Also, note that <fs>_readlink() is not used in
  66  * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
  67  * may return different results than <fs>_follow_link().  Many virtual
  68  * filesystems (including /proc) exhibit this behavior.
  69  */
  70
  71 /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
  72  * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
  73  * and the name already exists in form of a symlink, try to create the new
  74  * name indicated by the symlink. The old code always complained that the
  75  * name already exists, due to not following the symlink even if its target
  76  * is nonexistent.  The new semantics affects also mknod() and link() when
  77  * the name is a symlink pointing to a non-existent name.
  78  *
  79  * I don't know which semantics is the right one, since I have no access
  80  * to standards. But I found by trial that HP-UX 9.0 has the full "new"
  81  * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
  82  * "old" one. Personally, I think the new semantics is much more logical.
  83  * Note that "ln old new" where "new" is a symlink pointing to a non-existing
  84  * file does succeed in both HP-UX and SunOS, but not in Solaris
  85  * and in the old Linux semantics.
  86  */
  87
  88 /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
  89  * semantics.  See the comments in "open_namei" and "do_link" below.
  90  *
  91  * [10-Sep-98 Alan Modra] Another symlink change.
  92  */
  93
  94 /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
  95  *      inside the path - always follow.
  96  *      in the last component in creation/removal/renaming - never follow.
  97  *      if LOOKUP_FOLLOW passed - follow.
  98  *      if the pathname has trailing slashes - follow.
  99  *      otherwise - don't follow.
 100  * (applied in that order).
 101  *
 102  * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 103  * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 104  * During the 2.4 we need to fix the userland stuff depending on it -
 105  * hopefully we will be able to get rid of that wart in 2.5. So far only
 106  * XEmacs seems to be relying on it...
 107  */
 108 /*
 109  * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 110  * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 111  * any extra contention...
 112  */
 113
 114 /* In order to reduce some races, while at the same time doing additional
 115  * checking and hopefully speeding things up, we copy filenames to the
 116  * kernel data space before using them..
 117  *
 118  * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 119  * PATH_MAX includes the nul terminator --RR.
 120  */
 121 static int do_getname(const char __user *filename, char *page)
 122 {
 123         int retval;
 124         unsigned long len = PATH_MAX;
 125
 126         if (!segment_eq(get_fs(), KERNEL_DS)) {
 127                 if ((unsigned long) filename >= TASK_SIZE)
 128                         return -EFAULT;
 129                 if (TASK_SIZE - (unsigned long) filename < PATH_MAX)
 130                         len = TASK_SIZE - (unsigned long) filename;
 131         }
 132
 133         retval = strncpy_from_user(page, filename, len);
 134         if (retval > 0) {
 135                 if (retval < len)
 136                         return 0;
 137                 return -ENAMETOOLONG;
 138         } else if (!retval)
 139                 retval = -ENOENT;
 140         return retval;
 141 }
 142
 143 char * getname(const char __user * filename)
 144 {
 145         char *tmp, *result;
 146
 147         result = ERR_PTR(-ENOMEM);
 148         tmp = __getname();
 149         if (tmp)  {
 150                 int retval = do_getname(filename, tmp);
 151
 152                 result = tmp;
 153                 if (retval < 0) {
 154                         __putname(tmp);
 155                         result = ERR_PTR(retval);
 156                 }
 157         }
 158         audit_getname(result);
 159         return result;
 160 }
 161
 162 #ifdef CONFIG_AUDITSYSCALL
 163 void putname(const char *name)
 164 {
 165         if (unlikely(!audit_dummy_context()))
 166                 audit_putname(name);
 167         else
 168                 __putname(name);
 169 }
 170 EXPORT_SYMBOL(putname);
 171 #endif
 172
 173
 174 /**
 175  * generic_permission  -  check for access rights on a Posix-like filesystem
 176  * @inode:      inode to check access rights for
 177  * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 178  * @check_acl:  optional callback to check for Posix ACLs
 179  *
 180  * Used to check for read/write/execute permissions on a file.
 181  * We use "fsuid" for this, letting us set arbitrary permissions
 182  * for filesystem access without changing the "normal" uids which
 183  * are used for other things..
 184  */
 185 int generic_permission(struct inode *inode, int mask,
 186                 int (*check_acl)(struct inode *inode, int mask))
 187 {
 188         umode_t                 mode = inode->i_mode;
 189
 190         if (current->fsuid == inode->i_uid)
 191                 mode >>= 6;
 192         else {
 193                 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
 194                         int error = check_acl(inode, mask);
 195                         if (error == -EACCES)
 196                                 goto check_capabilities;
 197                         else if (error != -EAGAIN)
 198                                 return error;
 199                 }
 200
 201                 if (in_group_p(inode->i_gid))
 202                         mode >>= 3;
 203         }
 204
 205         /*
 206          * If the DACs are ok we don't need any capability check.
 207          */
 208         if (((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask))
 209                 return 0;
 210
 211  check_capabilities:
 212         /*
 213          * Read/write DACs are always overridable.
 214          * Executable DACs are overridable if at least one exec bit is set.
 215          */
 216         if (!(mask & MAY_EXEC) ||
 217             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
 218                 if (capable(CAP_DAC_OVERRIDE))
 219                         return 0;
 220
 221         /*
 222          * Searching includes executable on directories, else just read.
 223          */
 224         if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
 225                 if (capable(CAP_DAC_READ_SEARCH))
 226                         return 0;
 227
 228         return -EACCES;
 229 }
 230
 231 static inline int vx_barrier(struct inode *inode)
 232 {
 233         if (IS_BARRIER(inode) && !vx_check(0, VX_ADMIN)) {
 234                 vxwprintk(1, "xid=%d did hit the barrier.",
 235                         vx_current_xid());
 236                 return 1;
 237         }
 238         return 0;
 239 }
 240
 241 static inline int xid_permission(struct inode *inode, int mask, struct nameidata *nd)
 242 {
 243         if (vx_barrier(inode))
 244                 return -EACCES;
 245         if (inode->i_xid == 0)
 246                 return 0;
 247 #ifdef CONFIG_VSERVER_FILESHARING
 248         /* MEF: PlanetLab FS module assumes that any file that can be
 249          * named (e.g., via a cross mount) is not hidden from another
 250          * context or the admin context.
 251          */
 252         if (vx_check(inode->i_xid,VX_STATIC|VX_DYNAMIC))
 253                 return 0;
 254 #endif
 255         if (vx_check(inode->i_xid, VX_ADMIN|VX_WATCH|VX_IDENT))
 256                 return 0;
 257
 258         vxwprintk(1, "xid=%d denied access to %p[#%d,%lu] »%s«.",
 259                 vx_current_xid(), inode, inode->i_xid, inode->i_ino,
 260                 vxd_cond_path(nd));
 261         return -EACCES;
 262 }
 263
 264 int permission(struct inode *inode, int mask, struct nameidata *nd)
 265 {
 266         umode_t mode = inode->i_mode;
 267         int retval, submask;
 268
 269         if (mask & MAY_WRITE) {
 270
 271                 /*
 272                  * Nobody gets write access to a read-only fs.
 273                  */
 274                 if ((IS_RDONLY(inode) || (nd && MNT_IS_RDONLY(nd->mnt))) &&
 275                     (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
 276                         return -EROFS;
 277
 278                 /*
 279                  * Nobody gets write access to an immutable file.
 280                  */
 281                 if (IS_IMMUTABLE(inode))
 282                         return -EACCES;
 283         }
 284
 285
 286         /*
 287          * MAY_EXEC on regular files requires special handling: We override
 288          * filesystem execute permissions if the mode bits aren't set.
 289          */
 290         if ((mask & MAY_EXEC) && S_ISREG(mode) && !(mode & S_IXUGO))
 291                 return -EACCES;
 292
 293         /* Ordinary permission routines do not understand MAY_APPEND. */
 294         submask = mask & ~MAY_APPEND;
 295         if ((retval = xid_permission(inode, mask, nd)))
 296                 return retval;
 297         if (inode->i_op && inode->i_op->permission)
 298                 retval = inode->i_op->permission(inode, submask, nd);
 299         else
 300                 retval = generic_permission(inode, submask, NULL);
 301         if (retval)
 302                 return retval;
 303
 304         return security_inode_permission(inode, mask, nd);
 305 }
 306
 307 /**
 308  * vfs_permission  -  check for access rights to a given path
 309  * @nd:         lookup result that describes the path
 310  * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 311  *
 312  * Used to check for read/write/execute permissions on a path.
 313  * We use "fsuid" for this, letting us set arbitrary permissions
 314  * for filesystem access without changing the "normal" uids which
 315  * are used for other things.
 316  */
 317 int vfs_permission(struct nameidata *nd, int mask)
 318 {
 319         return permission(nd->dentry->d_inode, mask, nd);
 320 }
 321
 322 /**
 323  * file_permission  -  check for additional access rights to a given file
 324  * @file:       file to check access rights for
 325  * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 326  *
 327  * Used to check for read/write/execute permissions on an already opened
 328  * file.
 329  *
 330  * Note:
 331  *      Do not use this function in new code.  All access checks should
 332  *      be done using vfs_permission().
 333  */
 334 int file_permission(struct file *file, int mask)
 335 {
 336         return permission(file->f_dentry->d_inode, mask, NULL);
 337 }
 338
 339 /*
 340  * get_write_access() gets write permission for a file.
 341  * put_write_access() releases this write permission.
 342  * This is used for regular files.
 343  * We cannot support write (and maybe mmap read-write shared) accesses and
 344  * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode
 345  * can have the following values:
 346  * 0: no writers, no VM_DENYWRITE mappings
 347  * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist
 348  * > 0: (i_writecount) users are writing to the file.
 349  *
 350  * Normally we operate on that counter with atomic_{inc,dec} and it's safe
 351  * except for the cases where we don't hold i_writecount yet. Then we need to
 352  * use {get,deny}_write_access() - these functions check the sign and refuse
 353  * to do the change if sign is wrong. Exclusion between them is provided by
 354  * the inode->i_lock spinlock.
 355  */
 356
 357 int get_write_access(struct inode * inode)
 358 {
 359         spin_lock(&inode->i_lock);
 360         if (atomic_read(&inode->i_writecount) < 0) {
 361                 spin_unlock(&inode->i_lock);
 362                 return -ETXTBSY;
 363         }
 364         atomic_inc(&inode->i_writecount);
 365         spin_unlock(&inode->i_lock);
 366
 367         return 0;
 368 }
 369
 370 int deny_write_access(struct file * file)
 371 {
 372         struct inode *inode = file->f_dentry->d_inode;
 373
 374         spin_lock(&inode->i_lock);
 375         if (atomic_read(&inode->i_writecount) > 0) {
 376                 spin_unlock(&inode->i_lock);
 377                 return -ETXTBSY;
 378         }
 379         atomic_dec(&inode->i_writecount);
 380         spin_unlock(&inode->i_lock);
 381
 382         return 0;
 383 }
 384
 385 void path_release(struct nameidata *nd)
 386 {
 387         dput(nd->dentry);
 388         mntput(nd->mnt);
 389 }
 390
 391 /*
 392  * umount() mustn't call path_release()/mntput() as that would clear
 393  * mnt_expiry_mark
 394  */
 395 void path_release_on_umount(struct nameidata *nd)
 396 {
 397         dput(nd->dentry);
 398         mntput_no_expire(nd->mnt);
 399 }
 400
 401 /**
 402  * release_open_intent - free up open intent resources
 403  * @nd: pointer to nameidata
 404  */
 405 void release_open_intent(struct nameidata *nd)
 406 {
 407         if (nd->intent.open.file->f_dentry == NULL)
 408                 put_filp(nd->intent.open.file);
 409         else
 410                 fput(nd->intent.open.file);
 411 }
 412
 413 static inline struct dentry *do_revalidate(struct dentry *dentry, struct nameidata *nd)
 414 {
 415         int status = dentry->d_op->d_revalidate(dentry, nd);
 416         if (unlikely(status <= 0)) {
 417                 /*
 418                  * The dentry failed validation.
 419                  * If d_revalidate returned 0 attempt to invalidate
 420                  * the dentry otherwise d_revalidate is asking us
 421                  * to return a fail status.
 422                  */
 423                 if (!status) {
 424                         if (!d_invalidate(dentry)) {
 425                                 dput(dentry);
 426                                 dentry = NULL;
 427                         }
 428                 } else {
 429                         dput(dentry);
 430                         dentry = ERR_PTR(status);
 431                 }
 432         }
 433         return dentry;
 434 }
 435
 436 /*
 437  * Internal lookup() using the new generic dcache.
 438  * SMP-safe
 439  */
 440 static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
 441 {
 442         struct dentry * dentry = __d_lookup(parent, name);
 443
 444         /* lockess __d_lookup may fail due to concurrent d_move()
 445          * in some unrelated directory, so try with d_lookup
 446          */
 447         if (!dentry)
 448                 dentry = d_lookup(parent, name);
 449
 450         if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
 451                 dentry = do_revalidate(dentry, nd);
 452
 453         return dentry;
 454 }
 455
 456 /*
 457  * Short-cut version of permission(), for calling by
 458  * path_walk(), when dcache lock is held.  Combines parts
 459  * of permission() and generic_permission(), and tests ONLY for
 460  * MAY_EXEC permission.
 461  *
 462  * If appropriate, check DAC only.  If not appropriate, or
 463  * short-cut DAC fails, then call permission() to do more
 464  * complete permission check.
 465  */
 466 static int exec_permission_lite(struct inode *inode,
 467                                        struct nameidata *nd)
 468 {
 469         umode_t mode = inode->i_mode;
 470
 471         if (vx_barrier(inode))
 472                 return -EACCES;
 473         if (inode->i_op && inode->i_op->permission)
 474                 return -EAGAIN;
 475
 476         if (current->fsuid == inode->i_uid)
 477                 mode >>= 6;
 478         else if (in_group_p(inode->i_gid))
 479                 mode >>= 3;
 480
 481         if (mode & MAY_EXEC)
 482                 goto ok;
 483
 484         if ((inode->i_mode & S_IXUGO) && capable(CAP_DAC_OVERRIDE))
 485                 goto ok;
 486
 487         if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_OVERRIDE))
 488                 goto ok;
 489
 490         if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_READ_SEARCH))
 491                 goto ok;
 492
 493         return -EACCES;
 494 ok:
 495         return security_inode_permission(inode, MAY_EXEC, nd);
 496 }
 497
 498 /*
 499  * This is called when everything else fails, and we actually have
 500  * to go to the low-level filesystem to find out what we should do..
 501  *
 502  * We get the directory semaphore, and after getting that we also
 503  * make sure that nobody added the entry to the dcache in the meantime..
 504  * SMP-safe
 505  */
 506 static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
 507 {
 508         struct dentry * result;
 509         struct inode *dir = parent->d_inode;
 510
 511         mutex_lock(&dir->i_mutex);
 512         /*
 513          * First re-do the cached lookup just in case it was created
 514          * while we waited for the directory semaphore..
 515          *
 516          * FIXME! This could use version numbering or similar to
 517          * avoid unnecessary cache lookups.
 518          *
 519          * The "dcache_lock" is purely to protect the RCU list walker
 520          * from concurrent renames at this point (we mustn't get false
 521          * negatives from the RCU list walk here, unlike the optimistic
 522          * fast walk).
 523          *
 524          * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
 525          */
 526         result = d_lookup(parent, name);
 527         if (!result) {
 528                 struct dentry * dentry = d_alloc(parent, name);
 529                 result = ERR_PTR(-ENOMEM);
 530                 if (dentry) {
 531                         result = dir->i_op->lookup(dir, dentry, nd);
 532                         if (result)
 533                                 dput(dentry);
 534                         else
 535                                 result = dentry;
 536                 }
 537                 mutex_unlock(&dir->i_mutex);
 538                 return result;
 539         }
 540
 541         /*
 542          * Uhhuh! Nasty case: the cache was re-populated while
 543          * we waited on the semaphore. Need to revalidate.
 544          */
 545         mutex_unlock(&dir->i_mutex);
 546         if (result->d_op && result->d_op->d_revalidate) {
 547                 result = do_revalidate(result, nd);
 548                 if (!result)
 549                         result = ERR_PTR(-ENOENT);
 550         }
 551         return result;
 552 }
 553
 554 static int __emul_lookup_dentry(const char *, struct nameidata *);
 555
 556 /* SMP-safe */
 557 static __always_inline int
 558 walk_init_root(const char *name, struct nameidata *nd)
 559 {
 560         read_lock(&current->fs->lock);
 561         if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) {
 562                 nd->mnt = mntget(current->fs->altrootmnt);
 563                 nd->dentry = dget(current->fs->altroot);
 564                 read_unlock(&current->fs->lock);
 565                 if (__emul_lookup_dentry(name,nd))
 566                         return 0;
 567                 read_lock(&current->fs->lock);
 568         }
 569         nd->mnt = mntget(current->fs->rootmnt);
 570         nd->dentry = dget(current->fs->root);
 571         read_unlock(&current->fs->lock);
 572         return 1;
 573 }
 574
 575 static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
 576 {
 577         int res = 0;
 578         char *name;
 579         if (IS_ERR(link))
 580                 goto fail;
 581
 582         if (*link == '/') {
 583                 path_release(nd);
 584                 if (!walk_init_root(link, nd))
 585                         /* weird __emul_prefix() stuff did it */
 586                         goto out;
 587         }
 588         res = link_path_walk(link, nd);
 589 out:
 590         if (nd->depth || res || nd->last_type!=LAST_NORM)
 591                 return res;
 592         /*
 593          * If it is an iterative symlinks resolution in open_namei() we
 594          * have to copy the last component. And all that crap because of
 595          * bloody create() on broken symlinks. Furrfu...
 596          */
 597         name = __getname();
 598         if (unlikely(!name)) {
 599                 path_release(nd);
 600                 return -ENOMEM;
 601         }
 602         strcpy(name, nd->last.name);
 603         nd->last.name = name;
 604         return 0;
 605 fail:
 606         path_release(nd);
 607         return PTR_ERR(link);
 608 }
 609
/*
 * A vfsmount/dentry pair.  do_lookup() fills one of these with the mount
 * and dentry it found, separately from the nameidata being walked.
 */
struct path {
        struct vfsmount *mnt;           /* mount half of the pair */
        struct dentry *dentry;          /* dentry half of the pair */
};
 614
 615 static inline void dput_path(struct path *path, struct nameidata *nd)
 616 {
 617         dput(path->dentry);
 618         if (path->mnt != nd->mnt)
 619                 mntput(path->mnt);
 620 }
 621
 622 static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
 623 {
 624         dput(nd->dentry);
 625         if (nd->mnt != path->mnt)
 626                 mntput(nd->mnt);
 627         nd->mnt = path->mnt;
 628         nd->dentry = path->dentry;
 629 }
 630
 631 static __always_inline int __do_follow_link(struct path *path, struct nameidata *nd)
 632 {
 633         int error;
 634         void *cookie;
 635         struct dentry *dentry = path->dentry;
 636
 637         touch_atime(path->mnt, dentry);
 638         nd_set_link(nd, NULL);
 639
 640         if (path->mnt != nd->mnt) {
 641                 path_to_nameidata(path, nd);
 642                 dget(dentry);
 643         }
 644         mntget(path->mnt);
 645         cookie = dentry->d_inode->i_op->follow_link(dentry, nd);
 646         error = PTR_ERR(cookie);
 647         if (!IS_ERR(cookie)) {
 648                 char *s = nd_get_link(nd);
 649                 error = 0;
 650                 if (s)
 651                         error = __vfs_follow_link(nd, s);
 652                 if (dentry->d_inode->i_op->put_link)
 653                         dentry->d_inode->i_op->put_link(dentry, nd, cookie);
 654         }
 655         dput(dentry);
 656         mntput(path->mnt);
 657
 658         return error;
 659 }
 660
 661 /*
 662  * This limits recursive symlink follows to 8, while
 663  * limiting consecutive symlinks to 40.
 664  *
 665  * Without that kind of total limit, nasty chains of consecutive
 666  * symlinks can cause almost arbitrarily long lookups.
 667  */
 668 static inline int do_follow_link(struct path *path, struct nameidata *nd)
 669 {
 670         int err = -ELOOP;
 671         if (current->link_count >= MAX_NESTED_LINKS)
 672                 goto loop;
 673         if (current->total_link_count >= 40)
 674                 goto loop;
 675         BUG_ON(nd->depth >= MAX_NESTED_LINKS);
 676         cond_resched();
 677         err = security_inode_follow_link(path->dentry, nd);
 678         if (err)
 679                 goto loop;
 680         current->link_count++;
 681         current->total_link_count++;
 682         nd->depth++;
 683         err = __do_follow_link(path, nd);
 684         current->link_count--;
 685         nd->depth--;
 686         return err;
 687 loop:
 688         dput_path(path, nd);
 689         path_release(nd);
 690         return err;
 691 }
 692
 693 int follow_up(struct vfsmount **mnt, struct dentry **dentry)
 694 {
 695         struct vfsmount *parent;
 696         struct dentry *mountpoint;
 697         spin_lock(&vfsmount_lock);
 698         parent=(*mnt)->mnt_parent;
 699         if (parent == *mnt) {
 700                 spin_unlock(&vfsmount_lock);
 701                 return 0;
 702         }
 703         mntget(parent);
 704         mountpoint=dget((*mnt)->mnt_mountpoint);
 705         spin_unlock(&vfsmount_lock);
 706         dput(*dentry);
 707         *dentry = mountpoint;
 708         mntput(*mnt);
 709         *mnt = parent;
 710         return 1;
 711 }
 712
 713 /* no need for dcache_lock, as serialization is taken care in
 714  * namespace.c
 715  */
 716 static int __follow_mount(struct path *path)
 717 {
 718         int res = 0;
 719         while (d_mountpoint(path->dentry)) {
 720                 struct vfsmount *mounted = lookup_mnt(path->mnt, path->dentry);
 721                 if (!mounted)
 722                         break;
 723                 dput(path->dentry);
 724                 if (res)
 725                         mntput(path->mnt);
 726                 path->mnt = mounted;
 727                 path->dentry = dget(mounted->mnt_root);
 728                 res = 1;
 729         }
 730         return res;
 731 }
 732
 733 static void follow_mount(struct vfsmount **mnt, struct dentry **dentry)
 734 {
 735         while (d_mountpoint(*dentry)) {
 736                 struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
 737                 if (!mounted)
 738                         break;
 739                 dput(*dentry);
 740                 mntput(*mnt);
 741                 *mnt = mounted;
 742                 *dentry = dget(mounted->mnt_root);
 743         }
 744 }
 745
 746 /* no need for dcache_lock, as serialization is taken care in
 747  * namespace.c
 748  */
 749 int follow_down(struct vfsmount **mnt, struct dentry **dentry)
 750 {
 751         struct vfsmount *mounted;
 752
 753         mounted = lookup_mnt(*mnt, *dentry);
 754         if (mounted) {
 755                 dput(*dentry);
 756                 mntput(*mnt);
 757                 *mnt = mounted;
 758                 *dentry = dget(mounted->mnt_root);
 759                 return 1;
 760         }
 761         return 0;
 762 }
 763
 764 static __always_inline void follow_dotdot(struct nameidata *nd)
 765 {
 766         while(1) {
 767                 struct vfsmount *parent;
 768                 struct dentry *old = nd->dentry;
 769
 770                 read_lock(&current->fs->lock);
 771                 if (nd->dentry == current->fs->root &&
 772                     nd->mnt == current->fs->rootmnt) {
 773                         read_unlock(&current->fs->lock);
 774                         /* for sane '/' avoid follow_mount() */
 775                         return;
 776                 }
 777                 read_unlock(&current->fs->lock);
 778                 spin_lock(&dcache_lock);
 779                 if (nd->dentry != nd->mnt->mnt_root) {
 780                         nd->dentry = dget(nd->dentry->d_parent);
 781                         spin_unlock(&dcache_lock);
 782                         dput(old);
 783                         break;
 784                 }
 785                 spin_unlock(&dcache_lock);
 786                 spin_lock(&vfsmount_lock);
 787                 parent = nd->mnt->mnt_parent;
 788                 if (parent == nd->mnt) {
 789                         spin_unlock(&vfsmount_lock);
 790                         break;
 791                 }
 792                 mntget(parent);
 793                 nd->dentry = dget(nd->mnt->mnt_mountpoint);
 794                 spin_unlock(&vfsmount_lock);
 795                 dput(old);
 796                 mntput(nd->mnt);
 797                 nd->mnt = parent;
 798         }
 799         follow_mount(&nd->mnt, &nd->dentry);
 800 }
 801
 802 /*
 803  *  It's more convoluted than I'd like it to be, but... it's still fairly
 804  *  small and for now I'd prefer to have fast path as straight as possible.
 805  *  It _is_ time-critical.
 806  */
 807 static int do_lookup(struct nameidata *nd, struct qstr *name,
 808                      struct path *path, int atomic)
 809 {
 810         struct vfsmount *mnt = nd->mnt;
 811         struct dentry *dentry = __d_lookup(nd->dentry, name);
 812         struct inode *inode;
 813
 814         if (!dentry)
 815                 goto need_lookup;
 816         if (dentry->d_op && dentry->d_op->d_revalidate)
 817                 goto need_revalidate;
 818         inode = dentry->d_inode;
 819         if (!inode)
 820                 goto done;
 821 #ifdef CONFIG_VSERVER_FILESHARING
 822         /* MEF: PlanetLab FS module assumes that any file that can be
 823          * named (e.g., via a cross mount) is not hidden from another
 824          * context or the admin context.
 825          */
 826         if (vx_check(inode->i_xid,VX_STATIC|VX_DYNAMIC|VX_ADMIN)) {
 827                 /* do nothing */
 828         }
 829         else /* do the following check */
 830 #endif
 831         if (!vx_check(inode->i_xid, VX_WATCH|VX_ADMIN|VX_HOSTID|VX_IDENT))
 832                 goto hidden;
 833         if (inode->i_sb->s_magic == PROC_SUPER_MAGIC) {
 834                 struct proc_dir_entry *de = PDE(inode);
 835
 836                 if (de && !vx_hide_check(0, de->vx_flags))
 837                         goto hidden;
 838         }
 839 done:
 840         path->mnt = mnt;
 841         path->dentry = dentry;
 842         __follow_mount(path);
 843         return 0;
 844 hidden:
 845         vxwprintk(1, "xid=%d did lookup hidden %p[#%d,%lu] »%s«.",
 846                 vx_current_xid(), inode, inode->i_xid, inode->i_ino,
 847                 vxd_path(dentry, mnt));
 848         dput(dentry);
 849         return -ENOENT;
 850
 851 need_lookup:
 852         if (atomic)
 853                 return -EWOULDBLOCKIO;
 854         dentry = real_lookup(nd->dentry, name, nd);
 855         if (IS_ERR(dentry))
 856                 goto fail;
 857         goto done;
 858
 859 need_revalidate:
 860         if (atomic)
 861                 return -EWOULDBLOCKIO;
 862         dentry = do_revalidate(dentry, nd);
 863         if (!dentry)
 864                 goto need_lookup;
 865         if (IS_ERR(dentry))
 866                 goto fail;
 867         goto done;
 868
 869 fail:
 870         return PTR_ERR(dentry);
 871 }
 872
/*
 * Name resolution.
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
 *
 * The walk starts at nd->dentry/nd->mnt and consumes @name one
 * '/'-separated component at a time.  Intermediate components are looked
 * up with do_lookup() and, when the inode has a follow_link operation,
 * passed to do_follow_link().  The last component is handled separately
 * so that LOOKUP_PARENT, LOOKUP_FOLLOW and LOOKUP_DIRECTORY can be
 * honoured.
 *
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
 */
static fastcall int __link_path_walk(const char * name, struct nameidata *nd)
{
        struct path next;
        struct inode *inode;
        int err, atomic;
        unsigned int lookup_flags = nd->flags;

        /* Passed down to do_lookup(), which then refuses to block. */
        atomic = (lookup_flags & LOOKUP_ATOMIC);

        /* Skip leading slashes; an empty remainder resolves to nd itself. */
        while (*name=='/')
                name++;
        if (!*name)
                goto return_reval;

        inode = nd->dentry->d_inode;
        /*
         * NOTE(review): nd->depth is presumably non-zero while a symlink
         * body is being resolved; in that case only LOOKUP_FOLLOW and the
         * caller's LOOKUP_CONTINUE bit are kept - confirm.
         */
        if (nd->depth)
                lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);

        /* At this point we know we have a real path component. */
        for(;;) {
                unsigned long hash;
                struct qstr this;
                unsigned int c;

                /* Need search permission on the current directory. */
                nd->flags |= LOOKUP_CONTINUE;
                err = exec_permission_lite(inode, nd);
                if (err == -EAGAIN)
                        err = vfs_permission(nd, MAY_EXEC);
                if (err)
                        break;

                /* Isolate the next component and hash it as we scan. */
                this.name = name;
                c = *(const unsigned char *)name;

                hash = init_name_hash();
                do {
                        name++;
                        hash = partial_name_hash(c, hash);
                        c = *(const unsigned char *)name;
                } while (c && (c != '/'));
                this.len = name - (const char *) this.name;
                this.hash = end_name_hash(hash);

                /* remove trailing slashes? */
                if (!c)
                        goto last_component;
                while (*++name == '/');
                if (!*name)
                        goto last_with_slashes;

                /*
                 * "." and ".." are special - ".." especially so because it has
                 * to be able to know about the current root directory and
                 * parent relationships.
                 */
                if (this.name[0] == '.') switch (this.len) {
                        default:
                                break;
                        case 2:
                                if (this.name[1] != '.')
                                        break;
                                follow_dotdot(nd);
                                inode = nd->dentry->d_inode;
                                /* fallthrough */
                        case 1:
                                continue;
                }
                /*
                 * See if the low-level filesystem might want
                 * to use its own hash..
                 */
                if (nd->dentry->d_op && nd->dentry->d_op->d_hash) {
                        err = nd->dentry->d_op->d_hash(nd->dentry, &this);
                        if (err < 0)
                                break;
                }
                /* This does the actual lookups.. */
                err = do_lookup(nd, &this, &next, atomic);
                if (err)
                        break;

                /* An intermediate component must exist and have inode ops. */
                err = -ENOENT;
                inode = next.dentry->d_inode;
                if (!inode)
                        goto out_dput;
                err = -ENOTDIR;
                if (!inode->i_op)
                        goto out_dput;

                if (inode->i_op->follow_link) {
                        /*
                         * NOTE(review): on failure we return without
                         * path_release(nd); presumably do_follow_link()
                         * has already released nd - confirm.
                         */
                        err = do_follow_link(&next, nd);
                        if (err)
                                goto return_err;
                        err = -ENOENT;
                        inode = nd->dentry->d_inode;
                        if (!inode)
                                break;
                        err = -ENOTDIR;
                        if (!inode->i_op)
                                break;
                } else
                        path_to_nameidata(&next, nd);
                /* More components follow, so this one must support lookup. */
                err = -ENOTDIR;
                if (!inode->i_op->lookup)
                        break;
                continue;
                /* here ends the main loop */

last_with_slashes:
                /* A trailing slash forces link following and a directory. */
                lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
last_component:
                /* Clear LOOKUP_CONTINUE iff it was previously unset */
                nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
                if (lookup_flags & LOOKUP_PARENT)
                        goto lookup_parent;
                if (this.name[0] == '.') switch (this.len) {
                        default:
                                break;
                        case 2:
                                if (this.name[1] != '.')
                                        break;
                                follow_dotdot(nd);
                                inode = nd->dentry->d_inode;
                                /* fallthrough */
                        case 1:
                                goto return_reval;
                }
                if (nd->dentry->d_op && nd->dentry->d_op->d_hash) {
                        err = nd->dentry->d_op->d_hash(nd->dentry, &this);
                        if (err < 0)
                                break;
                }
                err = do_lookup(nd, &this, &next, atomic);
                if (err)
                        break;
                inode = next.dentry->d_inode;
                /* The final link is followed only when LOOKUP_FOLLOW is set. */
                if ((lookup_flags & LOOKUP_FOLLOW)
                    && inode && inode->i_op && inode->i_op->follow_link) {
                        err = do_follow_link(&next, nd);
                        if (err)
                                goto return_err;
                        inode = nd->dentry->d_inode;
                } else
                        path_to_nameidata(&next, nd);
                err = -ENOENT;
                if (!inode)
                        break;
                if (lookup_flags & LOOKUP_DIRECTORY) {
                        err = -ENOTDIR;
                        if (!inode->i_op || !inode->i_op->lookup)
                                break;
                }
                goto return_base;
lookup_parent:
                /*
                 * LOOKUP_PARENT: do not look the last component up; just
                 * record it and its type in nd for the caller.
                 */
                nd->last = this;
                nd->last_type = LAST_NORM;
                if (this.name[0] != '.')
                        goto return_base;
                if (this.len == 1)
                        nd->last_type = LAST_DOT;
                else if (this.len == 2 && this.name[1] == '.')
                        nd->last_type = LAST_DOTDOT;
                else
                        goto return_base;
return_reval:
                /*
                 * We bypassed the ordinary revalidation routines.
                 * We may need to check the cached dentry for staleness.
                 */
                if (nd->dentry && nd->dentry->d_sb &&
                    (nd->dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
                        err = -ESTALE;
                        /* Note: we do not d_invalidate() */
                        if (!nd->dentry->d_op->d_revalidate(nd->dentry, nd))
                                break;
                }
return_base:
                return 0;
out_dput:
                /* Drop the looked-up path in 'next' before failing. */
                dput_path(&next, nd);
                break;
        }
        /* Every 'break' above lands here: release nd and report err. */
        path_release(nd);
return_err:
        return err;
}
1067
1068 /*
1069  * Wrapper to retry pathname resolution whenever the underlying
1070  * file system returns an ESTALE.
1071  *
1072  * Retry the whole path once, forcing real lookup requests
1073  * instead of relying on the dcache.
1074  */
1075 int fastcall link_path_walk(const char *name, struct nameidata *nd)
1076 {
1077         struct nameidata save = *nd;
1078         int result;
1079
1080         /* make sure the stuff we saved doesn't go away */
1081         dget(save.dentry);
1082         mntget(save.mnt);
1083
1084         result = __link_path_walk(name, nd);
1085         if (result == -ESTALE) {
1086                 *nd = save;
1087                 dget(nd->dentry);
1088                 mntget(nd->mnt);
1089                 nd->flags |= LOOKUP_REVAL;
1090                 result = __link_path_walk(name, nd);
1091         }
1092
1093         dput(save.dentry);
1094         mntput(save.mnt);
1095
1096         return result;
1097 }
1098
1099 int fastcall path_walk(const char * name, struct nameidata *nd)
1100 {
1101         current->total_link_count = 0;
1102         return link_path_walk(name, nd);
1103 }
1104
1105 /*
1106  * SMP-safe: Returns 1 and nd will have valid dentry and mnt, if
1107  * everything is done. Returns 0 and drops input nd, if lookup failed;
1108  */
1109 static int __emul_lookup_dentry(const char *name, struct nameidata *nd)
1110 {
1111         if (path_walk(name, nd))
1112                 return 0;               /* something went wrong... */
1113
1114         if (!nd->dentry->d_inode || S_ISDIR(nd->dentry->d_inode->i_mode)) {
1115                 struct dentry *old_dentry = nd->dentry;
1116                 struct vfsmount *old_mnt = nd->mnt;
1117                 struct qstr last = nd->last;
1118                 int last_type = nd->last_type;
1119                 /*
1120                  * NAME was not found in alternate root or it's a directory.  Try to find
1121                  * it in the normal root:
1122                  */
1123                 nd->last_type = LAST_ROOT;
1124                 read_lock(&current->fs->lock);
1125                 nd->mnt = mntget(current->fs->rootmnt);
1126                 nd->dentry = dget(current->fs->root);
1127                 read_unlock(&current->fs->lock);
1128                 if (path_walk(name, nd) == 0) {
1129                         if (nd->dentry->d_inode) {
1130                                 dput(old_dentry);
1131                                 mntput(old_mnt);
1132                                 return 1;
1133                         }
1134                         path_release(nd);
1135                 }
1136                 nd->dentry = old_dentry;
1137                 nd->mnt = old_mnt;
1138                 nd->last = last;
1139                 nd->last_type = last_type;
1140         }
1141         return 1;
1142 }
1143
1144 void set_fs_altroot(void)
1145 {
1146         char *emul = __emul_prefix();
1147         struct nameidata nd;
1148         struct vfsmount *mnt = NULL, *oldmnt;
1149         struct dentry *dentry = NULL, *olddentry;
1150         int err;
1151
1152         if (!emul)
1153                 goto set_it;
1154         err = path_lookup(emul, LOOKUP_FOLLOW|LOOKUP_DIRECTORY|LOOKUP_NOALT, &nd);
1155         if (!err) {
1156                 mnt = nd.mnt;
1157                 dentry = nd.dentry;
1158         }
1159 set_it:
1160         write_lock(&current->fs->lock);
1161         oldmnt = current->fs->altrootmnt;
1162         olddentry = current->fs->altroot;
1163         current->fs->altrootmnt = mnt;
1164         current->fs->altroot = dentry;
1165         write_unlock(&current->fs->lock);
1166         if (olddentry) {
1167                 dput(olddentry);
1168                 mntput(oldmnt);
1169         }
1170 }
1171
1172 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
1173 static int fastcall do_path_lookup(int dfd, const char *name,
1174                                 unsigned int flags, struct nameidata *nd)
1175 {
1176         int retval = 0;
1177         int fput_needed;
1178         struct file *file;
1179
1180         nd->last_type = LAST_ROOT; /* if there are only slashes... */
1181         nd->flags = flags;
1182         nd->depth = 0;
1183
1184         if (*name=='/') {
1185                 read_lock(&current->fs->lock);
1186                 if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) {
1187                         nd->mnt = mntget(current->fs->altrootmnt);
1188                         nd->dentry = dget(current->fs->altroot);
1189                         read_unlock(&current->fs->lock);
1190                         if (__emul_lookup_dentry(name,nd))
1191                                 goto out; /* found in altroot */
1192                         read_lock(&current->fs->lock);
1193                 }
1194                 nd->mnt = mntget(current->fs->rootmnt);
1195                 nd->dentry = dget(current->fs->root);
1196                 read_unlock(&current->fs->lock);
1197         } else if (dfd == AT_FDCWD) {
1198                 read_lock(&current->fs->lock);
1199                 nd->mnt = mntget(current->fs->pwdmnt);
1200                 nd->dentry = dget(current->fs->pwd);
1201                 read_unlock(&current->fs->lock);
1202         } else {
1203                 struct dentry *dentry;
1204
1205                 file = fget_light(dfd, &fput_needed);
1206                 retval = -EBADF;
1207                 if (!file)
1208                         goto out_fail;
1209
1210                 dentry = file->f_dentry;
1211
1212                 retval = -ENOTDIR;
1213                 if (!S_ISDIR(dentry->d_inode->i_mode))
1214                         goto fput_fail;
1215
1216                 retval = file_permission(file, MAY_EXEC);
1217                 if (retval)
1218                         goto fput_fail;
1219
1220                 nd->mnt = mntget(file->f_vfsmnt);
1221                 nd->dentry = dget(dentry);
1222
1223                 fput_light(file, fput_needed);
1224         }
1225         current->total_link_count = 0;
1226         retval = link_path_walk(name, nd);
1227 out:
1228         if (likely(retval == 0)) {
1229                 if (unlikely(!audit_dummy_context() && nd && nd->dentry &&
1230                                 nd->dentry->d_inode))
1231                 audit_inode(name, nd->dentry->d_inode);
1232         }
1233 out_fail:
1234         return retval;
1235
1236 fput_fail:
1237         fput_light(file, fput_needed);
1238         goto out_fail;
1239 }
1240
1241 int fastcall path_lookup(const char *name, unsigned int flags,
1242                         struct nameidata *nd)
1243 {
1244         return do_path_lookup(AT_FDCWD, name, flags, nd);
1245 }
1246
1247 static int __path_lookup_intent_open(int dfd, const char *name,
1248                 unsigned int lookup_flags, struct nameidata *nd,
1249                 int open_flags, int create_mode)
1250 {
1251         struct file *filp = get_empty_filp();
1252         int err;
1253
1254         if (filp == NULL)
1255                 return -ENFILE;
1256         nd->intent.open.file = filp;
1257         nd->intent.open.flags = open_flags;
1258         nd->intent.open.create_mode = create_mode;
1259         err = do_path_lookup(dfd, name, lookup_flags|LOOKUP_OPEN, nd);
1260         if (IS_ERR(nd->intent.open.file)) {
1261                 if (err == 0) {
1262                         err = PTR_ERR(nd->intent.open.file);
1263                         path_release(nd);
1264                 }
1265         } else if (err != 0)
1266                 release_open_intent(nd);
1267         return err;
1268 }
1269
/**
 * path_lookup_open - lookup a file path with open intent
 * @dfd: the directory to use as base, or AT_FDCWD
 * @name: pointer to file name
 * @lookup_flags: lookup intent flags
 * @nd: pointer to nameidata
 * @open_flags: open intent flags
 *
 * Calls __path_lookup_intent_open() with a create mode of 0 and returns
 * its result.
 */
int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags,
                struct nameidata *nd, int open_flags)
{
        const int no_create_mode = 0;

        return __path_lookup_intent_open(dfd, name, lookup_flags, nd,
                        open_flags, no_create_mode);
}
1284
1285 /**
1286  * path_lookup_create - lookup a file path with open + create intent
1287  * @dfd: the directory to use as base, or AT_FDCWD
1288  * @name: pointer to file name
1289  * @lookup_flags: lookup intent flags
1290  * @nd: pointer to nameidata
1291  * @open_flags: open intent flags
1292  * @create_mode: create intent flags
1293  */
1294 static int path_lookup_create(int dfd, const char *name,
1295                               unsigned int lookup_flags, struct nameidata *nd,
1296                               int open_flags, int create_mode)
1297 {
1298         return __path_lookup_intent_open(dfd, name, lookup_flags|LOOKUP_CREATE,
1299                         nd, open_flags, create_mode);
1300 }
1301
1302 int __user_path_lookup_open(const char __user *name, unsigned int lookup_flags,
1303                 struct nameidata *nd, int open_flags)
1304 {
1305         char *tmp = getname(name);
1306         int err = PTR_ERR(tmp);
1307
1308         if (!IS_ERR(tmp)) {
1309                 err = __path_lookup_intent_open(AT_FDCWD, tmp, lookup_flags, nd, open_flags, 0);
1310                 putname(tmp);
1311         }
1312         return err;
1313 }
1314
1315 /*
1316  * Restricted form of lookup. Doesn't follow links, single-component only,
1317  * needs parent already locked. Doesn't follow mounts.
1318  * SMP-safe.
1319  */
1320 static struct dentry * __lookup_hash(struct qstr *name, struct dentry * base, struct nameidata *nd)
1321 {
1322         struct dentry * dentry;
1323         struct inode *inode;
1324         int err;
1325
1326         inode = base->d_inode;
1327         err = permission(inode, MAY_EXEC, nd);
1328         dentry = ERR_PTR(err);
1329         if (err)
1330                 goto out;
1331
1332         /*
1333          * See if the low-level filesystem might want
1334          * to use its own hash..
1335          */
1336         if (base->d_op && base->d_op->d_hash) {
1337                 err = base->d_op->d_hash(base, name);
1338                 dentry = ERR_PTR(err);
1339                 if (err < 0)
1340                         goto out;
1341         }
1342
1343         dentry = cached_lookup(base, name, nd);
1344         if (!dentry) {
1345                 struct dentry *new = d_alloc(base, name);
1346                 dentry = ERR_PTR(-ENOMEM);
1347                 if (!new)
1348                         goto out;
1349                 dentry = inode->i_op->lookup(inode, new, nd);
1350                 if (!dentry)
1351                         dentry = new;
1352                 else
1353                         dput(new);
1354         }
1355 out:
1356         return dentry;
1357 }
1358
1359 static struct dentry *lookup_hash(struct nameidata *nd)
1360 {
1361         return __lookup_hash(&nd->last, nd->dentry, nd);
1362 }
1363
1364 /* SMP-safe */
1365 struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
1366 {
1367         unsigned long hash;
1368         struct qstr this;
1369         unsigned int c;
1370
1371         this.name = name;
1372         this.len = len;
1373         if (!len)
1374                 goto access;
1375
1376         hash = init_name_hash();
1377         while (len--) {
1378                 c = *(const unsigned char *)name++;
1379                 if (c == '/' || c == '\0')
1380                         goto access;
1381                 hash = partial_name_hash(c, hash);
1382         }
1383         this.hash = end_name_hash(hash);
1384
1385         return __lookup_hash(&this, base, NULL);
1386 access:
1387         return ERR_PTR(-EACCES);
1388 }
1389
1390 /*
1391  *      namei()
1392  *
1393  * is used by most simple commands to get the inode of a specified name.
1394  * Open, link etc use their own routines, but this is enough for things
1395  * like 'chmod' etc.
1396  *
1397  * namei exists in two versions: namei/lnamei. The only difference is
1398  * that namei follows links, while lnamei does not.
1399  * SMP-safe
1400  */
1401 int fastcall __user_walk_fd(int dfd, const char __user *name, unsigned flags,
1402                             struct nameidata *nd)
1403 {
1404         char *tmp = getname(name);
1405         int err = PTR_ERR(tmp);
1406
1407         if (!IS_ERR(tmp)) {
1408                 err = do_path_lookup(dfd, tmp, flags, nd);
1409                 putname(tmp);
1410         }
1411         return err;
1412 }
1413
1414 int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
1415 {
1416         return __user_walk_fd(AT_FDCWD, name, flags, nd);
1417 }
1418
1419 /*
1420  * It's inline, so penalty for filesystems that don't use sticky bit is
1421  * minimal.
1422  */
1423 static inline int check_sticky(struct inode *dir, struct inode *inode)
1424 {
1425         if (!(dir->i_mode & S_ISVTX))
1426                 return 0;
1427         if (inode->i_uid == current->fsuid)
1428                 return 0;
1429         if (dir->i_uid == current->fsuid)
1430                 return 0;
1431         return !capable(CAP_FOWNER);
1432 }
1433
1434 /*
1435  *      Check whether we can remove a link victim from directory dir, check
1436  *  whether the type of victim is right.
1437  *  1. We can't do it if dir is read-only (done in permission())
1438  *  2. We should have write and exec permissions on dir
1439  *  3. We can't remove anything from append-only dir
1440  *  4. We can't do anything with immutable dir (done in permission())
1441  *  5. If the sticky bit on dir is set we should either
1442  *      a. be owner of dir, or
1443  *      b. be owner of victim, or
1444  *      c. have CAP_FOWNER capability
1445  *  6. If the victim is append-only or immutable we can't do antyhing with
1446  *     links pointing to it.
1447  *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
1448  *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
1449  *  9. We can't remove a root or mountpoint.
1450  * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
1451  *     nfs_async_unlink().
1452  */
1453 static int may_delete(struct inode *dir, struct dentry *victim,
1454         int isdir, struct nameidata *nd)
1455 {
1456         int error;
1457
1458         if (!victim->d_inode)
1459                 return -ENOENT;
1460
1461         BUG_ON(victim->d_parent->d_inode != dir);
1462         audit_inode_child(victim->d_name.name, victim->d_inode, dir);
1463
1464         error = permission(dir,MAY_WRITE | MAY_EXEC, nd);
1465         if (error)
1466                 return error;
1467         if (IS_APPEND(dir))
1468                 return -EPERM;
1469         if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
1470                 IS_IXORUNLINK(victim->d_inode))
1471                 return -EPERM;
1472         if (isdir) {
1473                 if (!S_ISDIR(victim->d_inode->i_mode))
1474                         return -ENOTDIR;
1475                 if (IS_ROOT(victim))
1476                         return -EBUSY;
1477         } else if (S_ISDIR(victim->d_inode->i_mode))
1478                 return -EISDIR;
1479         if (IS_DEADDIR(dir))
1480                 return -ENOENT;
1481         if (victim->d_flags & DCACHE_NFSFS_RENAMED)
1482                 return -EBUSY;
1483         return 0;
1484 }
1485
1486 /*      Check whether we can create an object with dentry child in directory
1487  *  dir.
1488  *  1. We can't do it if child already exists (open has special treatment for
1489  *     this case, but since we are inlined it's OK)
1490  *  2. We can't do it if dir is read-only (done in permission())
1491  *  3. We should have write and exec permissions on dir
1492  *  4. We can't do it if dir is immutable (done in permission())
1493  */
1494 static inline int may_create(struct inode *dir, struct dentry *child,
1495                              struct nameidata *nd)
1496 {
1497         if (child->d_inode)
1498                 return -EEXIST;
1499         if (IS_DEADDIR(dir))
1500                 return -ENOENT;
1501         return permission(dir,MAY_WRITE | MAY_EXEC, nd);
1502 }
1503
1504 /*
1505  * O_DIRECTORY translates into forcing a directory lookup.
1506  */
1507 static inline int lookup_flags(unsigned int f)
1508 {
1509         unsigned long retval = LOOKUP_FOLLOW;
1510
1511         if (f & O_NOFOLLOW)
1512                 retval &= ~LOOKUP_FOLLOW;
1513
1514         if (f & O_DIRECTORY)
1515                 retval |= LOOKUP_DIRECTORY;
1516         if (f & O_ATOMICLOOKUP)
1517                 retval |= LOOKUP_ATOMIC;
1518
1519         return retval;
1520 }
1521
1522 /*
1523  * p1 and p2 should be directories on the same fs.
1524  */
1525 struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
1526 {
1527         struct dentry *p;
1528
1529         if (p1 == p2) {
1530                 mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
1531                 return NULL;
1532         }
1533
1534         mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
1535
1536         for (p = p1; p->d_parent != p; p = p->d_parent) {
1537                 if (p->d_parent == p2) {
1538                         mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
1539                         mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
1540                         return p;
1541                 }
1542         }
1543
1544         for (p = p2; p->d_parent != p; p = p->d_parent) {
1545                 if (p->d_parent == p1) {
1546                         mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
1547                         mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
1548                         return p;
1549                 }
1550         }
1551
1552         mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
1553         mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
1554         return NULL;
1555 }
1556
1557 void unlock_rename(struct dentry *p1, struct dentry *p2)
1558 {
1559         mutex_unlock(&p1->d_inode->i_mutex);
1560         if (p1 != p2) {
1561                 mutex_unlock(&p2->d_inode->i_mutex);
1562                 mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
1563         }
1564 }
1565
1566 int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1567                 struct nameidata *nd)
1568 {
1569         int error = may_create(dir, dentry, nd);
1570
1571         if (error)
1572                 return error;
1573
1574         if (!dir->i_op || !dir->i_op->create)
1575                 return -EACCES; /* shouldn't it be ENOSYS? */
1576         mode &= S_IALLUGO;
1577         mode |= S_IFREG;
1578         error = security_inode_create(dir, dentry, mode);
1579         if (error)
1580                 return error;
1581         DQUOT_INIT(dir);
1582         error = dir->i_op->create(dir, dentry, mode, nd);
1583         if (!error)
1584                 fsnotify_create(dir, dentry);
1585         return error;
1586 }
1587
1588 int may_open(struct nameidata *nd, int acc_mode, int flag)
1589 {
1590         struct dentry *dentry = nd->dentry;
1591         struct inode *inode = dentry->d_inode;
1592         int error;
1593
1594         if (!inode)
1595                 return -ENOENT;
1596
1597         if (S_ISLNK(inode->i_mode))
1598                 return -ELOOP;
1599
1600         if (S_ISDIR(inode->i_mode) && (flag & FMODE_WRITE))
1601                 return -EISDIR;
1602
1603         error = vfs_permission(nd, acc_mode);
1604         if (error)
1605                 return error;
1606
1607         /*
1608          * FIFO's, sockets and device files are special: they don't
1609          * actually live on the filesystem itself, and as such you
1610          * can write to them even if the filesystem is read-only.
1611          */
1612         if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
1613                 flag &= ~O_TRUNC;
1614         } else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
1615                 if (nd->mnt->mnt_flags & MNT_NODEV)
1616                         return -EACCES;
1617
1618                 flag &= ~O_TRUNC;
1619         } else if ((IS_RDONLY(inode) || MNT_IS_RDONLY(nd->mnt))
1620                 && (flag & FMODE_WRITE))
1621                 return -EROFS;
1622         /*
1623          * An append-only file must be opened in append mode for writing.
1624          */
1625         if (IS_APPEND(inode)) {
1626                 if  ((flag & FMODE_WRITE) && !(flag & O_APPEND))
1627                         return -EPERM;
1628                 if (flag & O_TRUNC)
1629                         return -EPERM;
1630         }
1631
1632         /* O_NOATIME can only be set by the owner or superuser */
1633         if (flag & O_NOATIME)
1634                 if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER))
1635                         return -EPERM;
1636
1637         /*
1638          * Ensure there are no outstanding leases on the file.
1639          */
1640         error = break_lease(inode, flag);
1641         if (error)
1642                 return error;
1643
1644         if (flag & O_TRUNC) {
1645                 error = get_write_access(inode);
1646                 if (error)
1647                         return error;
1648
1649                 /*
1650                  * Refuse to truncate files with mandatory locks held on them.
1651                  */
1652                 error = locks_verify_locked(inode);
1653                 if (!error) {
1654                         DQUOT_INIT(inode);
1655
1656                         error = do_truncate(dentry, 0, ATTR_MTIME|ATTR_CTIME, NULL);
1657                 }
1658                 put_write_access(inode);
1659                 if (error)
1660                         return error;
1661         } else
1662                 if (flag & FMODE_WRITE)
1663                         DQUOT_INIT(inode);
1664
1665         return 0;
1666 }
1667
/*
 *      open_namei()
 *
 * namei for open - this is in fact almost the whole open-routine.
 *
 * Note that the low bits of "flag" aren't the same as in the open
 * system call - they are 00 - no permissions needed
 *                        01 - read permission needed
 *                        10 - write permission needed
 *                        11 - read/write permissions needed
 * which is a lot more logical, and also allows the "no perm" needed
 * for symlinks (where the permissions are checked later).
 * SMP-safe
 *
 * dfd and pathname are handed to path_lookup_open() (no O_CREAT) or
 * path_lookup_create() (O_CREAT); mode is passed to path_lookup_create()
 * and, after umask masking on non-POSIX-ACL directories, to vfs_create().
 *
 * Returns 0 once may_open() has accepted the result, or a negative errno.
 * Error paths that leave through the "exit" label release the open intent
 * (unless it already holds an error pointer) and the path held in nd.
 * On success nothing in nd is released here.
 */
int open_namei(int dfd, const char *pathname, int flag,
                int mode, struct nameidata *nd)
{
        int acc_mode, error;
        struct path path;
        struct dentry *dir;
        int count = 0;          /* number of trailing symlinks followed so far */

        acc_mode = ACC_MODE(flag);

        /* O_TRUNC implies we need access checks for write permissions */
        if (flag & O_TRUNC)
                acc_mode |= MAY_WRITE;

        /* Allow the LSM permission hook to distinguish append
           access from general write access. */
        if (flag & O_APPEND)
                acc_mode |= MAY_APPEND;

        /*
         * The simplest case - just a plain lookup.
         */
        if (!(flag & O_CREAT)) {
                error = path_lookup_open(dfd, pathname, lookup_flags(flag),
                                         nd, flag);
                if (error)
                        return error;
                goto ok;
        }

        /*
         * Create - we need to know the parent.
         */
        error = path_lookup_create(dfd,pathname,LOOKUP_PARENT,nd,flag,mode);
        if (error)
                return error;

        /*
         * We have the parent and last component. First of all, check
         * that we are not asked to creat(2) an obvious directory - that
         * will not do.
         */
        error = -EISDIR;
        if (nd->last_type != LAST_NORM || nd->last.name[nd->last.len])
                goto exit;

        dir = nd->dentry;
        nd->flags &= ~LOOKUP_PARENT;
        mutex_lock(&dir->d_inode->i_mutex);
        path.dentry = lookup_hash(nd);
        path.mnt = nd->mnt;

        /*
         * Entered here after the initial lookup and again after every
         * symlink hop in do_link; in both cases dir->d_inode->i_mutex is
         * held and path describes the looked-up last component.
         */
do_last:
        error = PTR_ERR(path.dentry);
        if (IS_ERR(path.dentry)) {
                mutex_unlock(&dir->d_inode->i_mutex);
                goto exit;
        }

        /* The open intent already carries an error pointer: report it. */
        if (IS_ERR(nd->intent.open.file)) {
                mutex_unlock(&dir->d_inode->i_mutex);
                error = PTR_ERR(nd->intent.open.file);
                goto exit_dput;
        }

        /* Negative dentry, just create the file */
        if (!path.dentry->d_inode) {
                /* umask is applied here unless the directory uses POSIX ACLs */
                if (!IS_POSIXACL(dir->d_inode))
                        mode &= ~current->fs->umask;
                error = vfs_create(dir->d_inode, path.dentry, mode, nd);
                mutex_unlock(&dir->d_inode->i_mutex);
                /*
                 * nd switches from the parent to the child dentry: the
                 * parent reference is dropped and nd takes over the one
                 * held in path.dentry (also on failure, so that "exit"
                 * releases it via path_release()).
                 */
                dput(nd->dentry);
                nd->dentry = path.dentry;
                if (error)
                        goto exit;
                /* Don't check for write permission, don't truncate */
                acc_mode = 0;
                flag &= ~O_TRUNC;
                goto ok;
        }

        /*
         * It already exists.
         */
        mutex_unlock(&dir->d_inode->i_mutex);
        audit_inode_update(path.dentry->d_inode);

        error = -EEXIST;
        if (flag & O_EXCL)
                goto exit_dput;

        /*
         * A non-zero return from __follow_mount() combined with
         * O_NOFOLLOW is rejected with -ELOOP.
         */
        if (__follow_mount(&path)) {
                error = -ELOOP;
                if (flag & O_NOFOLLOW)
                        goto exit_dput;
        }

        /* path may have changed above, so re-check for a negative dentry */
        error = -ENOENT;
        if (!path.dentry->d_inode)
                goto exit_dput;
        /* anything providing ->follow_link is handled as a symlink */
        if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link)
                goto do_link;

        path_to_nameidata(&path, nd);
        error = -EISDIR;
        if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
                goto exit;
ok:
        error = may_open(nd, acc_mode, flag);
        if (error)
                goto exit;
        return 0;

        /* exit_dput: additionally drop what path still holds, then fall into exit */
exit_dput:
        dput_path(&path, nd);
exit:
        if (!IS_ERR(nd->intent.open.file))
                release_open_intent(nd);
        path_release(nd);
        return error;

do_link:
        error = -ELOOP;
        if (flag & O_NOFOLLOW)
                goto exit_dput;
        /*
         * This is subtle. Instead of calling do_follow_link() we do the
         * thing by hands. The reason is that this way we have zero link_count
         * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.
         * After that we have the parent and last component, i.e.
         * we are in the same situation as after the first path_walk().
         * Well, almost - if the last component is normal we get its copy
         * stored in nd->last.name and we will have to putname() it when we
         * are done. Procfs-like symlinks just set LAST_BIND.
         */
        nd->flags |= LOOKUP_PARENT;
        error = security_inode_follow_link(path.dentry, nd);
        if (error)
                goto exit_dput;
        error = __do_follow_link(&path, nd);
        if (error) {
                /* Does someone understand code flow here? Or it is only
                 * me so stupid? Anathema to whoever designed this non-sense
                 * with "intent.open".
                 */
                /*
                 * NOTE(review): unlike "exit", this path does not call
                 * path_release(nd); presumably __do_follow_link() has
                 * already released nd's path on failure - confirm.
                 */
                release_open_intent(nd);
                return error;
        }
        nd->flags &= ~LOOKUP_PARENT;
        if (nd->last_type == LAST_BIND)
                goto ok;
        error = -EISDIR;
        if (nd->last_type != LAST_NORM)
                goto exit;
        /* trailing characters after the last component: still -EISDIR */
        if (nd->last.name[nd->last.len]) {
                __putname(nd->last.name);
                goto exit;
        }
        /* give up once 32 links have already been followed */
        error = -ELOOP;
        if (count++==32) {
                __putname(nd->last.name);
                goto exit;
        }
        /* same setup as before the first do_last: lock parent, look up child */
        dir = nd->dentry;
        mutex_lock(&dir->d_inode->i_mutex);
        path.dentry = lookup_hash(nd);
        path.mnt = nd->mnt;
        /* the copied name is no longer needed once the lookup is done */
        __putname(nd->last.name);
        goto do_last;
}
1852
1853 /**
1854  * lookup_create - lookup a dentry, creating it if it doesn't exist
1855  * @nd: nameidata info
1856  * @is_dir: directory flag
1857  *
1858  * Simple function to lookup and return a dentry and create it
1859  * if it doesn't exist.  Is SMP-safe.
1860  *
1861  * Returns with nd->dentry->d_inode->i_mutex locked.
1862  */
1863 struct dentry *lookup_create(struct nameidata *nd, int is_dir)
1864 {
1865         struct dentry *dentry = ERR_PTR(-EEXIST);
1866
1867         mutex_lock_nested(&nd->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
1868         /*
1869          * Yucky last component or no last component at all?
1870          * (foo/., foo/.., /////)
1871          */
1872         if (nd->last_type != LAST_NORM)
1873                 goto fail;
1874         nd->flags &= ~LOOKUP_PARENT;
1875         nd->flags |= LOOKUP_CREATE;
1876         nd->intent.open.flags = O_EXCL;
1877
1878         /*
1879          * Do the final lookup.
1880          */
1881         dentry = lookup_hash(nd);
1882         if (IS_ERR(dentry))
1883                 goto fail;
1884
1885         /*
1886          * Special case - lookup gave negative, but... we had foo/bar/
1887          * From the vfs_mknod() POV we just have a negative dentry -
1888          * all is fine. Let's be bastards - you had / on the end, you've
1889          * been asking for (non-existent) directory. -ENOENT for you.
1890          */
1891         if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode)
1892                 goto enoent;
1893         return dentry;
1894 enoent:
1895         dput(dentry);
1896         dentry = ERR_PTR(-ENOENT);
1897 fail:
1898         return dentry;
1899 }
1900 EXPORT_SYMBOL_GPL(lookup_create);
1901
1902 int vfs_mknod(struct inode *dir, struct dentry *dentry,
1903         int mode, dev_t dev, struct nameidata *nd)
1904 {
1905         int error = may_create(dir, dentry, nd);
1906
1907         if (error)
1908                 return error;
1909
1910         if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
1911                 return -EPERM;
1912
1913         if (!dir->i_op || !dir->i_op->mknod)
1914                 return -EPERM;
1915
1916         error = security_inode_mknod(dir, dentry, mode, dev);
1917         if (error)
1918                 return error;
1919
1920         DQUOT_INIT(dir);
1921         error = dir->i_op->mknod(dir, dentry, mode, dev);
1922         if (!error)
1923                 fsnotify_create(dir, dentry);
1924         return error;
1925 }
1926
1927 asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
1928                                 unsigned dev)
1929 {
1930         int error = 0;
1931         char * tmp;
1932         struct dentry * dentry;
1933         struct nameidata nd;
1934
1935         if (S_ISDIR(mode))
1936                 return -EPERM;
1937         tmp = getname(filename);
1938         if (IS_ERR(tmp))
1939                 return PTR_ERR(tmp);
1940
1941         error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd);
1942         if (error)
1943                 goto out;
1944         dentry = lookup_create(&nd, 0);
1945         error = PTR_ERR(dentry);
1946
1947         if (!IS_POSIXACL(nd.dentry->d_inode))
1948                 mode &= ~current->fs->umask;
1949         if (!IS_ERR(dentry)) {
1950                 switch (mode & S_IFMT) {
1951                 case 0: case S_IFREG:
1952                         error = vfs_create(nd.dentry->d_inode,dentry,mode,&nd);
1953                         break;
1954                 case S_IFCHR: case S_IFBLK:
1955                         error = vfs_mknod(nd.dentry->d_inode, dentry, mode,
1956                                         new_decode_dev(dev), &nd);
1957                         break;
1958                 case S_IFIFO: case S_IFSOCK:
1959                         error = vfs_mknod(nd.dentry->d_inode, dentry, mode,
1960                                         0, &nd);
1961                         break;
1962                 case S_IFDIR:
1963                         error = -EPERM;
1964                         break;
1965                 default:
1966                         error = -EINVAL;
1967                 }
1968                 dput(dentry);
1969         }
1970         mutex_unlock(&nd.dentry->d_inode->i_mutex);
1971         path_release(&nd);
1972 out:
1973         putname(tmp);
1974
1975         return error;
1976 }
1977
1978 asmlinkage long sys_mknod(const char __user *filename, int mode, unsigned dev)
1979 {
1980         return sys_mknodat(AT_FDCWD, filename, mode, dev);
1981 }
1982
1983 int vfs_mkdir(struct inode *dir, struct dentry *dentry,
1984         int mode, struct nameidata *nd)
1985 {
1986         int error = may_create(dir, dentry, nd);
1987
1988         if (error)
1989                 return error;
1990
1991         if (!dir->i_op || !dir->i_op->mkdir)
1992                 return -EPERM;
1993
1994         mode &= (S_IRWXUGO|S_ISVTX);
1995         error = security_inode_mkdir(dir, dentry, mode);
1996         if (error)
1997                 return error;
1998
1999         DQUOT_INIT(dir);
2000         error = dir->i_op->mkdir(dir, dentry, mode);
2001         if (!error)
2002                 fsnotify_mkdir(dir, dentry);
2003         return error;
2004 }
2005
2006 asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
2007 {
2008         int error = 0;
2009         char * tmp;
2010
2011         tmp = getname(pathname);
2012         error = PTR_ERR(tmp);
2013         if (!IS_ERR(tmp)) {
2014                 struct dentry *dentry;
2015                 struct nameidata nd;
2016
2017                 error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd);
2018                 if (error)
2019                         goto out;
2020                 dentry = lookup_create(&nd, 1);
2021                 error = PTR_ERR(dentry);
2022                 if (!IS_ERR(dentry)) {
2023                         if (!IS_POSIXACL(nd.dentry->d_inode))
2024                                 mode &= ~current->fs->umask;
2025                         error = vfs_mkdir(nd.dentry->d_inode, dentry,
2026                                 mode, &nd);
2027                         dput(dentry);
2028                 }
2029                 mutex_unlock(&nd.dentry->d_inode->i_mutex);
2030                 path_release(&nd);
2031 out:
2032                 putname(tmp);
2033         }
2034
2035         return error;
2036 }
2037
2038 asmlinkage long sys_mkdir(const char __user *pathname, int mode)
2039 {
2040         return sys_mkdirat(AT_FDCWD, pathname, mode);
2041 }
2042
/*
 * We try to drop the dentry early: we should have
 * a usage count of 2 if we're the only user of this
 * dentry, and if that is true (possibly after pruning
 * the dcache), then we drop the dentry now.
 *
 * A low-level filesystem can, if it chooses, legally
 * do a
 *
 *      if (!d_unhashed(dentry))
 *              return -EBUSY;
 *
 * if it cannot handle the case of removing a directory
 * that is still in use by something else..
 *
 * The reference taken by dget() below is NOT dropped here; the callers
 * in this file (vfs_rmdir(), vfs_rename_dir()) dput() the dentry once
 * they are done with it.
 */
void dentry_unhash(struct dentry *dentry)
{
        dget(dentry);
        /*
         * NOTE(review): d_count was just raised by dget(), so this test
         * looks like it is always true - confirm before simplifying.
         */
        if (atomic_read(&dentry->d_count))
                shrink_dcache_parent(dentry);
        /* dcache_lock is taken first, then the per-dentry d_lock */
        spin_lock(&dcache_lock);
        spin_lock(&dentry->d_lock);
        /* 2 == the caller's reference plus the one taken above */
        if (atomic_read(&dentry->d_count) == 2)
                __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
        spin_unlock(&dcache_lock);
}
2070
2071 int vfs_rmdir(struct inode *dir, struct dentry *dentry,
2072         struct nameidata *nd)
2073 {
2074         int error = may_delete(dir, dentry, 1, nd);
2075
2076         if (error)
2077                 return error;
2078
2079         if (!dir->i_op || !dir->i_op->rmdir)
2080                 return -EPERM;
2081
2082         DQUOT_INIT(dir);
2083
2084         mutex_lock(&dentry->d_inode->i_mutex);
2085         dentry_unhash(dentry);
2086         if (d_mountpoint(dentry))
2087                 error = -EBUSY;
2088         else {
2089                 error = security_inode_rmdir(dir, dentry);
2090                 if (!error) {
2091                         error = dir->i_op->rmdir(dir, dentry);
2092                         if (!error)
2093                                 dentry->d_inode->i_flags |= S_DEAD;
2094                 }
2095         }
2096         mutex_unlock(&dentry->d_inode->i_mutex);
2097         if (!error) {
2098                 d_delete(dentry);
2099         }
2100         dput(dentry);
2101
2102         return error;
2103 }
2104
2105 static long do_rmdir(int dfd, const char __user *pathname)
2106 {
2107         int error = 0;
2108         char * name;
2109         struct dentry *dentry;
2110         struct nameidata nd;
2111
2112         name = getname(pathname);
2113         if(IS_ERR(name))
2114                 return PTR_ERR(name);
2115
2116         error = do_path_lookup(dfd, name, LOOKUP_PARENT, &nd);
2117         if (error)
2118                 goto exit;
2119
2120         switch(nd.last_type) {
2121                 case LAST_DOTDOT:
2122                         error = -ENOTEMPTY;
2123                         goto exit1;
2124                 case LAST_DOT:
2125                         error = -EINVAL;
2126                         goto exit1;
2127                 case LAST_ROOT:
2128                         error = -EBUSY;
2129                         goto exit1;
2130         }
2131         mutex_lock_nested(&nd.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2132         dentry = lookup_hash(&nd);
2133         error = PTR_ERR(dentry);
2134         if (!IS_ERR(dentry)) {
2135                 error = vfs_rmdir(nd.dentry->d_inode, dentry, &nd);
2136                 dput(dentry);
2137         }
2138         mutex_unlock(&nd.dentry->d_inode->i_mutex);
2139 exit1:
2140         path_release(&nd);
2141 exit:
2142         putname(name);
2143         return error;
2144 }
2145
2146 asmlinkage long sys_rmdir(const char __user *pathname)
2147 {
2148         return do_rmdir(AT_FDCWD, pathname);
2149 }
2150
2151 int vfs_unlink(struct inode *dir, struct dentry *dentry,
2152         struct nameidata *nd)
2153 {
2154         int error = may_delete(dir, dentry, 0, nd);
2155
2156         if (error)
2157                 return error;
2158
2159         if (!dir->i_op || !dir->i_op->unlink)
2160                 return -EPERM;
2161
2162         DQUOT_INIT(dir);
2163
2164         mutex_lock(&dentry->d_inode->i_mutex);
2165         if (d_mountpoint(dentry))
2166                 error = -EBUSY;
2167         else {
2168                 error = security_inode_unlink(dir, dentry);
2169                 if (!error)
2170                         error = dir->i_op->unlink(dir, dentry);
2171         }
2172         mutex_unlock(&dentry->d_inode->i_mutex);
2173
2174         /* We don't d_delete() NFS sillyrenamed files--they still exist. */
2175         if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
2176                 d_delete(dentry);
2177         }
2178
2179         return error;
2180 }
2181
2182 /*
2183  * Make sure that the actual truncation of the file will occur outside its
2184  * directory's i_mutex.  Truncate can take a long time if there is a lot of
2185  * writeout happening, and we don't want to prevent access to the directory
2186  * while waiting on the I/O.
2187  */
2188 static long do_unlinkat(int dfd, const char __user *pathname)
2189 {
2190         int error = 0;
2191         char * name;
2192         struct dentry *dentry;
2193         struct nameidata nd;
2194         struct inode *inode = NULL;
2195
2196         name = getname(pathname);
2197         if(IS_ERR(name))
2198                 return PTR_ERR(name);
2199
2200         error = do_path_lookup(dfd, name, LOOKUP_PARENT, &nd);
2201         if (error)
2202                 goto exit;
2203         error = -EISDIR;
2204         if (nd.last_type != LAST_NORM)
2205                 goto exit1;
2206         mutex_lock_nested(&nd.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2207         dentry = lookup_hash(&nd);
2208         error = PTR_ERR(dentry);
2209         if (!IS_ERR(dentry)) {
2210                 /* Why not before? Because we want correct error value */
2211                 if (nd.last.name[nd.last.len])
2212                         goto slashes;
2213                 inode = dentry->d_inode;
2214                 if (inode)
2215                         atomic_inc(&inode->i_count);
2216                 error = vfs_unlink(nd.dentry->d_inode, dentry, &nd);
2217         exit2:
2218                 dput(dentry);
2219         }
2220         mutex_unlock(&nd.dentry->d_inode->i_mutex);
2221         if (inode)
2222                 iput(inode);    /* truncate the inode here */
2223 exit1:
2224         path_release(&nd);
2225 exit:
2226         putname(name);
2227         return error;
2228
2229 slashes:
2230         error = !dentry->d_inode ? -ENOENT :
2231                 S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR;
2232         goto exit2;
2233 }
2234
2235 asmlinkage long sys_unlinkat(int dfd, const char __user *pathname, int flag)
2236 {
2237         if ((flag & ~AT_REMOVEDIR) != 0)
2238                 return -EINVAL;
2239
2240         if (flag & AT_REMOVEDIR)
2241                 return do_rmdir(dfd, pathname);
2242
2243         return do_unlinkat(dfd, pathname);
2244 }
2245
2246 asmlinkage long sys_unlink(const char __user *pathname)
2247 {
2248         return do_unlinkat(AT_FDCWD, pathname);
2249 }
2250
2251 int vfs_symlink(struct inode *dir, struct dentry *dentry,
2252         const char *oldname, int mode, struct nameidata *nd)
2253 {
2254         int error = may_create(dir, dentry, nd);
2255
2256         if (error)
2257                 return error;
2258
2259         if (!dir->i_op || !dir->i_op->symlink)
2260                 return -EPERM;
2261
2262         error = security_inode_symlink(dir, dentry, oldname);
2263         if (error)
2264                 return error;
2265
2266         DQUOT_INIT(dir);
2267         error = dir->i_op->symlink(dir, dentry, oldname);
2268         if (!error)
2269                 fsnotify_create(dir, dentry);
2270         return error;
2271 }
2272
2273 asmlinkage long sys_symlinkat(const char __user *oldname,
2274                               int newdfd, const char __user *newname)
2275 {
2276         int error = 0;
2277         char * from;
2278         char * to;
2279
2280         from = getname(oldname);
2281         if(IS_ERR(from))
2282                 return PTR_ERR(from);
2283         to = getname(newname);
2284         error = PTR_ERR(to);
2285         if (!IS_ERR(to)) {
2286                 struct dentry *dentry;
2287                 struct nameidata nd;
2288
2289                 error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
2290                 if (error)
2291                         goto out;
2292                 dentry = lookup_create(&nd, 0);
2293                 error = PTR_ERR(dentry);
2294                 if (!IS_ERR(dentry)) {
2295                         error = vfs_symlink(nd.dentry->d_inode, dentry,
2296                                 from, S_IALLUGO, &nd);
2297                         dput(dentry);
2298                 }
2299                 mutex_unlock(&nd.dentry->d_inode->i_mutex);
2300                 path_release(&nd);
2301 out:
2302                 putname(to);
2303         }
2304         putname(from);
2305         return error;
2306 }
2307
2308 asmlinkage long sys_symlink(const char __user *oldname, const char __user *newname)
2309 {
2310         return sys_symlinkat(oldname, AT_FDCWD, newname);
2311 }
2312
2313 int vfs_link(struct dentry *old_dentry, struct inode *dir,
2314         struct dentry *new_dentry, struct nameidata *nd)
2315 {
2316         struct inode *inode = old_dentry->d_inode;
2317         int error;
2318
2319         if (!inode)
2320                 return -ENOENT;
2321
2322         error = may_create(dir, new_dentry, nd);
2323         if (error)
2324                 return error;
2325
2326         if (dir->i_sb != inode->i_sb)
2327                 return -EXDEV;
2328
2329         /*
2330          * A link to an append-only or immutable file cannot be created.
2331          */
2332         if (IS_APPEND(inode) || IS_IXORUNLINK(inode))
2333                 return -EPERM;
2334         if (!dir->i_op || !dir->i_op->link)
2335                 return -EPERM;
2336         if (S_ISDIR(old_dentry->d_inode->i_mode))
2337                 return -EPERM;
2338
2339         error = security_inode_link(old_dentry, dir, new_dentry);
2340         if (error)
2341                 return error;
2342
2343         mutex_lock(&old_dentry->d_inode->i_mutex);
2344         DQUOT_INIT(dir);
2345         error = dir->i_op->link(old_dentry, dir, new_dentry);
2346         mutex_unlock(&old_dentry->d_inode->i_mutex);
2347         if (!error)
2348                 fsnotify_create(dir, new_dentry);
2349         return error;
2350 }
2351
2352 /*
2353  * Hardlinks are often used in delicate situations.  We avoid
2354  * security-related surprises by not following symlinks on the
2355  * newname.  --KAB
2356  *
2357  * We don't follow them on the oldname either to be compatible
2358  * with linux 2.0, and to avoid hard-linking to directories
2359  * and other special files.  --ADM
2360  */
2361 asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
2362                            int newdfd, const char __user *newname,
2363                            int flags)
2364 {
2365         struct dentry *new_dentry;
2366         struct nameidata nd, old_nd;
2367         int error;
2368         char * to;
2369
2370         if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
2371                 return -EINVAL;
2372
2373         to = getname(newname);
2374         if (IS_ERR(to))
2375                 return PTR_ERR(to);
2376
2377         error = __user_walk_fd(olddfd, oldname,
2378                                flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
2379                                &old_nd);
2380         if (error)
2381                 goto exit;
2382         error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
2383         if (error)
2384                 goto out;
2385         /*
2386          * We allow hard-links to be created to a bind-mount as long
2387          * as the bind-mount is not read-only.  Checking for cross-dev
2388          * links is subsumed by the superblock check in vfs_link().
2389          */
2390         error = -EROFS;
2391         if (MNT_IS_RDONLY(old_nd.mnt))
2392                 goto out_release;
2393         new_dentry = lookup_create(&nd, 0);
2394         error = PTR_ERR(new_dentry);
2395         if (!IS_ERR(new_dentry)) {
2396                 error = vfs_link(old_nd.dentry, nd.dentry->d_inode,
2397                         new_dentry, &nd);
2398                 dput(new_dentry);
2399         }
2400         mutex_unlock(&nd.dentry->d_inode->i_mutex);
2401 out_release:
2402         path_release(&nd);
2403 out:
2404         path_release(&old_nd);
2405 exit:
2406         putname(to);
2407
2408         return error;
2409 }
2410
2411 asmlinkage long sys_link(const char __user *oldname, const char __user *newname)
2412 {
2413         return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
2414 }
2415
2416 /*
2417  * The worst of all namespace operations - renaming directory. "Perverted"
2418  * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
2419  * Problems:
2420  *      a) we can get into loop creation. Check is done in is_subdir().
2421  *      b) race potential - two innocent renames can create a loop together.
2422  *         That's where 4.4 screws up. Current fix: serialization on
2423  *         sb->s_vfs_rename_mutex. We might be more accurate, but that's another
2424  *         story.
2425  *      c) we have to lock _three_ objects - parents and victim (if it exists).
2426  *         And that - after we got ->i_mutex on parents (until then we don't know
2427  *         whether the target exists).  Solution: try to be smart with locking
2428  *         order for inodes.  We rely on the fact that tree topology may change
2429  *         only under ->s_vfs_rename_mutex _and_ that parent of the object we
2430  *         move will be locked.  Thus we can rank directories by the tree
2431  *         (ancestors first) and rank all non-directories after them.
2432  *         That works since everybody except rename does "lock parent, lookup,
2433  *         lock child" and rename is under ->s_vfs_rename_mutex.
2434  *         HOWEVER, it relies on the assumption that any object with ->lookup()
2435  *         has no more than 1 dentry.  If "hybrid" objects will ever appear,
2436  *         we'd better make sure that there's no link(2) for them.
2437  *      d) some filesystems don't support opened-but-unlinked directories,
2438  *         either because of layout or because they are not ready to deal with
2439  *         all cases correctly. The latter will be fixed (taking this sort of
2440  *         stuff into VFS), but the former is not going away. Solution: the same
2441  *         trick as in rmdir().
2442  *      e) conversion from fhandle to dentry may come in the wrong moment - when
2443  *         we are removing the target. Solution: we will have to grab ->i_mutex
2444  *         in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
 *         ->i_mutex on parents, which works but leads to some truly excessive
2446  *         locking].
2447  */
/*
 * Rename a directory: old_dentry (in old_dir) becomes new_dentry
 * (in new_dir).  Called from vfs_rename() when the source is a
 * directory.  Returns 0 or a negative errno.
 */
static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
                          struct inode *new_dir, struct dentry *new_dentry)
{
        int error = 0;
        struct inode *target;   /* inode being replaced, NULL if none */

        /*
         * If we are going to change the parent - check write permissions,
         * we'll need to flip '..'.
         */
        if (new_dir != old_dir) {
                error = permission(old_dentry->d_inode, MAY_WRITE, NULL);
                if (error)
                        return error;
        }

        error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
        if (error)
                return error;

        target = new_dentry->d_inode;
        if (target) {
                /*
                 * Lock the victim and try to unhash it; dentry_unhash()
                 * also takes a reference on new_dentry, dropped below.
                 */
                mutex_lock(&target->i_mutex);
                dentry_unhash(new_dentry);
        }
        if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
                error = -EBUSY;
        else
                error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
        if (target) {
                /* the replaced directory is gone once the rename succeeded */
                if (!error)
                        target->i_flags |= S_DEAD;
                mutex_unlock(&target->i_mutex);
                /* put new_dentry back in the hash if it was dropped above */
                if (d_unhashed(new_dentry))
                        d_rehash(new_dentry);
                /* balances the dget() done inside dentry_unhash() */
                dput(new_dentry);
        }
        if (!error)
                d_move(old_dentry,new_dentry);
        return error;
}
2489
/*
 * Rename a non-directory: old_dentry (in old_dir) becomes new_dentry
 * (in new_dir).  Called from vfs_rename() when the source is not a
 * directory.  Returns 0 or a negative errno.
 */
static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
                            struct inode *new_dir, struct dentry *new_dentry)
{
        struct inode *target;   /* inode being replaced, NULL if none */
        int error;

        error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
        if (error)
                return error;

        /* hold new_dentry for the duration; released by the dput() below */
        dget(new_dentry);
        target = new_dentry->d_inode;
        if (target)
                mutex_lock(&target->i_mutex);
        if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
                error = -EBUSY;
        else
                error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
        if (!error) {
                /* The following d_move() should become unconditional */
                /*
                 * NOTE(review): filesystems flagged FS_ODD_RENAME are
                 * skipped here - presumably they update the dcache
                 * themselves; confirm.
                 */
                if (!(old_dir->i_sb->s_type->fs_flags & FS_ODD_RENAME))
                        d_move(old_dentry, new_dentry);
        }
        if (target)
                mutex_unlock(&target->i_mutex);
        dput(new_dentry);
        return error;
}
2518
2519 int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
2520                struct inode *new_dir, struct dentry *new_dentry)
2521 {
2522         int error;
2523         int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
2524         const char *old_name;
2525
2526         if (old_dentry->d_inode == new_dentry->d_inode)
2527                 return 0;
2528
2529         error = may_delete(old_dir, old_dentry, is_dir, NULL);
2530         if (error)
2531                 return error;
2532
2533         if (!new_dentry->d_inode)
2534                 error = may_create(new_dir, new_dentry, NULL);
2535         else
2536                 error = may_delete(new_dir, new_dentry, is_dir, NULL);
2537         if (error)
2538                 return error;
2539
2540         if (!old_dir->i_op || !old_dir->i_op->rename)
2541                 return -EPERM;
2542
2543         DQUOT_INIT(old_dir);
2544         DQUOT_INIT(new_dir);
2545
2546         old_name = fsnotify_oldname_init(old_dentry->d_name.name);
2547
2548         if (is_dir)
2549                 error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
2550         else
2551                 error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
2552         if (!error) {
2553                 const char *new_name = old_dentry->d_name.name;
2554                 fsnotify_move(old_dir, new_dir, old_name, new_name, is_dir,
2555                               new_dentry->d_inode, old_dentry->d_inode);
2556         }
2557         fsnotify_oldname_free(old_name);
2558
2559         return error;
2560 }
2561
2562 static int do_rename(int olddfd, const char *oldname,
2563                         int newdfd, const char *newname)
2564 {
2565         int error = 0;
2566         struct dentry * old_dir, * new_dir;
2567         struct dentry * old_dentry, *new_dentry;
2568         struct dentry * trap;
2569         struct nameidata oldnd, newnd;
2570
2571         error = do_path_lookup(olddfd, oldname, LOOKUP_PARENT, &oldnd);
2572         if (error)
2573                 goto exit;
2574
2575         error = do_path_lookup(newdfd, newname, LOOKUP_PARENT, &newnd);
2576         if (error)
2577                 goto exit1;
2578
2579         error = -EXDEV;
2580         if (oldnd.mnt != newnd.mnt)
2581                 goto exit2;
2582
2583         old_dir = oldnd.dentry;
2584         error = -EBUSY;
2585         if (oldnd.last_type != LAST_NORM)
2586                 goto exit2;
2587
2588         new_dir = newnd.dentry;
2589         if (newnd.last_type != LAST_NORM)
2590                 goto exit2;
2591
2592         trap = lock_rename(new_dir, old_dir);
2593
2594         old_dentry = lookup_hash(&oldnd);
2595         error = PTR_ERR(old_dentry);
2596         if (IS_ERR(old_dentry))
2597                 goto exit3;
2598         /* source must exist */
2599         error = -ENOENT;
2600         if (!old_dentry->d_inode)
2601                 goto exit4;
2602         /* unless the source is a directory trailing slashes give -ENOTDIR */
2603         if (!S_ISDIR(old_dentry->d_inode->i_mode)) {
2604                 error = -ENOTDIR;
2605                 if (oldnd.last.name[oldnd.last.len])
2606                         goto exit4;
2607                 if (newnd.last.name[newnd.last.len])
2608                         goto exit4;
2609         }
2610         /* source should not be ancestor of target */
2611         error = -EINVAL;
2612         if (old_dentry == trap)
2613                 goto exit4;
2614         error = -EROFS;
2615         if (MNT_IS_RDONLY(newnd.mnt))
2616                 goto exit4;
2617         new_dentry = lookup_hash(&newnd);
2618         error = PTR_ERR(new_dentry);
2619         if (IS_ERR(new_dentry))
2620                 goto exit4;
2621         /* target should not be an ancestor of source */
2622         error = -ENOTEMPTY;
2623         if (new_dentry == trap)
2624                 goto exit5;
2625
2626         error = vfs_rename(old_dir->d_inode, old_dentry,
2627                                    new_dir->d_inode, new_dentry);
2628 exit5:
2629         dput(new_dentry);
2630 exit4:
2631         dput(old_dentry);
2632 exit3:
2633         unlock_rename(new_dir, old_dir);
2634 exit2:
2635         path_release(&newnd);
2636 exit1:
2637         path_release(&oldnd);
2638 exit:
2639         return error;
2640 }
2641
2642 asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
2643                              int newdfd, const char __user *newname)
2644 {
2645         int error;
2646         char * from;
2647         char * to;
2648
2649         from = getname(oldname);
2650         if(IS_ERR(from))
2651                 return PTR_ERR(from);
2652         to = getname(newname);
2653         error = PTR_ERR(to);
2654         if (!IS_ERR(to)) {
2655                 error = do_rename(olddfd, from, newdfd, to);
2656                 putname(to);
2657         }
2658         putname(from);
2659         return error;
2660 }
2661
2662 asmlinkage long sys_rename(const char __user *oldname, const char __user *newname)
2663 {
2664         return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
2665 }
2666
2667 int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
2668 {
2669         int len;
2670
2671         len = PTR_ERR(link);
2672         if (IS_ERR(link))
2673                 goto out;
2674
2675         len = strlen(link);
2676         if (len > (unsigned) buflen)
2677                 len = buflen;
2678         if (copy_to_user(buffer, link, len))
2679                 len = -EFAULT;
2680 out:
2681         return len;
2682 }
2683
2684 /*
2685  * A helper for ->readlink().  This should be used *ONLY* for symlinks that
2686  * have ->follow_link() touching nd only in nd_set_link().  Using (or not
2687  * using) it for any given inode is up to filesystem.
2688  */
2689 int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
2690 {
2691         struct nameidata nd;
2692         void *cookie;
2693
2694         nd.depth = 0;
2695         cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
2696         if (!IS_ERR(cookie)) {
2697                 int res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
2698                 if (dentry->d_inode->i_op->put_link)
2699                         dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
2700                 cookie = ERR_PTR(res);
2701         }
2702         return PTR_ERR(cookie);
2703 }
2704
/*
 * vfs_follow_link - non-static wrapper that forwards @nd and @link to
 * __vfs_follow_link() and returns its result unchanged.
 */
int vfs_follow_link(struct nameidata *nd, const char *link)
{
	int ret = __vfs_follow_link(nd, link);

	return ret;
}
2709
2710 /* get the link contents into pagecache */
2711 static char *page_getlink(struct dentry * dentry, struct page **ppage)
2712 {
2713         struct page * page;
2714         struct address_space *mapping = dentry->d_inode->i_mapping;
2715         page = read_mapping_page(mapping, 0, NULL);
2716         if (IS_ERR(page))
2717                 goto sync_fail;
2718         wait_on_page_locked(page);
2719         if (!PageUptodate(page))
2720                 goto async_fail;
2721         *ppage = page;
2722         return kmap(page);
2723
2724 async_fail:
2725         page_cache_release(page);
2726         return ERR_PTR(-EIO);
2727
2728 sync_fail:
2729         return (char*)page;
2730 }
2731
2732 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
2733 {
2734         struct page *page = NULL;
2735         char *s = page_getlink(dentry, &page);
2736         int res = vfs_readlink(dentry,buffer,buflen,s);
2737         if (page) {
2738                 kunmap(page);
2739                 page_cache_release(page);
2740         }
2741         return res;
2742 }
2743
2744 void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
2745 {
2746         struct page *page = NULL;
2747         nd_set_link(nd, page_getlink(dentry, &page));
2748         return page;
2749 }
2750
/*
 * page_put_link - ->put_link() counterpart of page_follow_link_light().
 *
 * @cookie is treated as a struct page pointer; when it is non-NULL the
 * page is unmapped with kunmap() and its reference is dropped.  @dentry
 * and @nd are not used.
 */
void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
{
	struct page *page = cookie;

	if (!page)
		return;

	kunmap(page);
	page_cache_release(page);
}
2760
2761 int __page_symlink(struct inode *inode, const char *symname, int len,
2762                 gfp_t gfp_mask)
2763 {
2764         struct address_space *mapping = inode->i_mapping;
2765         struct page *page;
2766         int err = -ENOMEM;
2767         char *kaddr;
2768
2769 retry:
2770         page = find_or_create_page(mapping, 0, gfp_mask);
2771         if (!page)
2772                 goto fail;
2773         err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2774         if (err == AOP_TRUNCATED_PAGE) {
2775                 page_cache_release(page);
2776                 goto retry;
2777         }
2778         if (err)
2779                 goto fail_map;
2780         kaddr = kmap_atomic(page, KM_USER0);
2781         memcpy(kaddr, symname, len-1);
2782         kunmap_atomic(kaddr, KM_USER0);
2783         err = mapping->a_ops->commit_write(NULL, page, 0, len-1);
2784         if (err == AOP_TRUNCATED_PAGE) {
2785                 page_cache_release(page);
2786                 goto retry;
2787         }
2788         if (err)
2789                 goto fail_map;
2790         /*
2791          * Notice that we are _not_ going to block here - end of page is
2792          * unmapped, so this will only try to map the rest of page, see
2793          * that it is unmapped (typically even will not look into inode -
2794          * ->i_size will be enough for everything) and zero it out.
2795          * OTOH it's obviously correct and should make the page up-to-date.
2796          */
2797         if (!PageUptodate(page)) {
2798                 err = mapping->a_ops->readpage(NULL, page);
2799                 if (err != AOP_TRUNCATED_PAGE)
2800                         wait_on_page_locked(page);
2801         } else {
2802                 unlock_page(page);
2803         }
2804         page_cache_release(page);
2805         if (err < 0)
2806                 goto fail;
2807         mark_inode_dirty(inode);
2808         return 0;
2809 fail_map:
2810         unlock_page(page);
2811         page_cache_release(page);
2812 fail:
2813         return err;
2814 }
2815
2816 int page_symlink(struct inode *inode, const char *symname, int len)
2817 {
2818         return __page_symlink(inode, symname, len,
2819                         mapping_gfp_mask(inode->i_mapping));
2820 }
2821
2822 struct inode_operations page_symlink_inode_operations = {
2823         .readlink       = generic_readlink,
2824         .follow_link    = page_follow_link_light,
2825         .put_link       = page_put_link,
2826 };
2827
/*
 * Symbols exported from this file.  The grouping below is by apparent
 * purpose (judged from the symbol names); the set of exported symbols is
 * unchanged.
 */

/* pathname lookup and walking */
EXPORT_SYMBOL(getname);
EXPORT_SYMBOL(__user_walk);
EXPORT_SYMBOL(__user_walk_fd);
EXPORT_SYMBOL(path_lookup);
EXPORT_SYMBOL(path_walk);
EXPORT_SYMBOL(path_release);
EXPORT_SYMBOL(lookup_one_len);
EXPORT_SYMBOL(follow_down);
EXPORT_SYMBOL(follow_up);

/* permission checks */
EXPORT_SYMBOL(permission);
EXPORT_SYMBOL(vfs_permission);
EXPORT_SYMBOL(file_permission);
EXPORT_SYMBOL(generic_permission);
EXPORT_SYMBOL(get_write_access); /* binfmt_aout */

/* directory-entry operations */
EXPORT_SYMBOL(vfs_create);
EXPORT_SYMBOL(vfs_link);
EXPORT_SYMBOL(vfs_mkdir);
EXPORT_SYMBOL(vfs_mknod);
EXPORT_SYMBOL(vfs_rename);
EXPORT_SYMBOL(vfs_rmdir);
EXPORT_SYMBOL(vfs_symlink);
EXPORT_SYMBOL(vfs_unlink);
EXPORT_SYMBOL(lock_rename);
EXPORT_SYMBOL(unlock_rename);
EXPORT_SYMBOL(dentry_unhash);

/* symlink helpers */
EXPORT_SYMBOL(vfs_follow_link);
EXPORT_SYMBOL(vfs_readlink);
EXPORT_SYMBOL(generic_readlink);
EXPORT_SYMBOL(page_follow_link_light);
EXPORT_SYMBOL(page_put_link);
EXPORT_SYMBOL(page_readlink);
EXPORT_SYMBOL(__page_symlink);
EXPORT_SYMBOL(page_symlink);
EXPORT_SYMBOL(page_symlink_inode_operations);