fs/xfs/xfs_vnodeops.c

   1 /*
   2  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18
  19 #include <linux/capability.h>
  20
  21 #include "xfs.h"
  22 #include "xfs_fs.h"
  23 #include "xfs_types.h"
  24 #include "xfs_bit.h"
  25 #include "xfs_log.h"
  26 #include "xfs_inum.h"
  27 #include "xfs_trans.h"
  28 #include "xfs_sb.h"
  29 #include "xfs_ag.h"
  30 #include "xfs_dir.h"
  31 #include "xfs_dir2.h"
  32 #include "xfs_dmapi.h"
  33 #include "xfs_mount.h"
  34 #include "xfs_da_btree.h"
  35 #include "xfs_bmap_btree.h"
  36 #include "xfs_alloc_btree.h"
  37 #include "xfs_ialloc_btree.h"
  38 #include "xfs_dir_sf.h"
  39 #include "xfs_dir2_sf.h"
  40 #include "xfs_attr_sf.h"
  41 #include "xfs_dinode.h"
  42 #include "xfs_inode.h"
  43 #include "xfs_inode_item.h"
  44 #include "xfs_dir_leaf.h"
  45 #include "xfs_itable.h"
  46 #include "xfs_btree.h"
  47 #include "xfs_ialloc.h"
  48 #include "xfs_alloc.h"
  49 #include "xfs_bmap.h"
  50 #include "xfs_attr.h"
  51 #include "xfs_rw.h"
  52 #include "xfs_error.h"
  53 #include "xfs_quota.h"
  54 #include "xfs_utils.h"
  55 #include "xfs_rtalloc.h"
  56 #include "xfs_refcache.h"
  57 #include "xfs_trans_space.h"
  58 #include "xfs_log_priv.h"
  59 #include "xfs_mac.h"
  60
  61
  62 /*
  63  * The maximum pathlen is 1024 bytes. Since the minimum file system
  64  * blocksize is 512 bytes, we can get a max of 2 extents back from
  65  * bmapi.
      *
      * SYMLINK_MAPS is that worst-case extent count (1024 / 512 = 2).
  66  */
  67 #define SYMLINK_MAPS 2
  68
  69 /*
      * xfs_open: open hook for an XFS vnode.
      *
      * bdp   - behavior descriptor; the vnode and the XFS inode are both
      *         derived from it.
      * credp - not referenced in this function.
      *
      * Returns EIO (through XFS_ERROR) when the mount has been forcibly
      * shut down, otherwise 0.  The only other work done here is the
      * directory read-ahead described inside the body.
      *
      * NOTE(review): the original text below mentions a "file too big"
      * check, but no such check appears in this function body -- confirm
      * whether a caller performs it.
      *
  70  * For xfs, we check that the file isn't too big to be opened by this kernel.
  71  * No other open action is required for regular files.  Devices are handled
  72  * through the specfs file system, pipes through fifofs.  Device and
  73  * fifo vnodes are "wrapped" by specfs and fifofs vnodes, respectively,
  74  * when a new vnode is first looked up or created.
  75  */
  76 STATIC int
  77 xfs_open(
  78         bhv_desc_t      *bdp,
  79         cred_t          *credp)
  80 {
  81         int             mode;
  82         vnode_t         *vp;
  83         xfs_inode_t     *ip;
  84
  85         vp = BHV_TO_VNODE(bdp);
  86         ip = XFS_BHVTOI(bdp);
  87
  88         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
  89                 return XFS_ERROR(EIO);
  90
  91         /*
  92          * If it's a directory with any blocks, read-ahead block 0
  93          * as we're almost certain to have the next operation be a read there.
  94          */
  95         if (VN_ISDIR(vp) && ip->i_d.di_nextents > 0) {
                     /*
                      * The test above ran without the inode lock, so
                      * di_nextents is tested again once the lock is held.
                      * The (void) cast discards the read-ahead result, and
                      * the unlock is passed the mode value returned by
                      * xfs_ilock_map_shared().
                      */
  96                 mode = xfs_ilock_map_shared(ip);
  97                 if (ip->i_d.di_nextents > 0)
  98                         (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
  99                 xfs_iunlock(ip, mode);
 100         }
 101         return 0;
 102 }
 103
 104
 105 /*
 106  * xfs_getattr
      *
      * Copy attributes of the in-core inode behind bdp into *vap.  How much
      * is filled in depends on vap->va_mask: the fields are assigned in
      * stages, and the function jumps to all_done as soon as the mask asks
      * for nothing from the later stages.
      *
      * bdp   - behavior descriptor; the vnode and the XFS inode are both
      *         derived from it.
      * vap   - in: va_mask selects how far the stages run;
      *         out: the attribute fields assigned below.
      * flags - when ATTR_LAZY is set the inode lock is not taken; otherwise
      *         XFS_ILOCK_SHARED is held while the fields are copied.
      * credp - not referenced in this function.
      *
      * Returns EIO (through XFS_ERROR) when the mount has been forcibly
      * shut down, otherwise 0.
 107  */
 108 STATIC int
 109 xfs_getattr(
 110         bhv_desc_t      *bdp,
 111         vattr_t         *vap,
 112         int             flags,
 113         cred_t          *credp)
 114 {
 115         xfs_inode_t     *ip;
 116         xfs_mount_t     *mp;
 117         vnode_t         *vp;
 118
 119         vp  = BHV_TO_VNODE(bdp);
 120         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
 121
 122         ip = XFS_BHVTOI(bdp);
 123         mp = ip->i_mount;
 124
 125         if (XFS_FORCED_SHUTDOWN(mp))
 126                 return XFS_ERROR(EIO);
 127
             /*
              * Taken here and dropped at all_done under the same ATTR_LAZY
              * test, so every exit below must go through all_done.
              */
 128         if (!(flags & ATTR_LAZY))
 129                 xfs_ilock(ip, XFS_ILOCK_SHARED);
 130
             /*
              * Stage 1: va_size is always filled in.  A mask of exactly
              * XFS_AT_SIZE needs nothing more.
              */
 131         vap->va_size = ip->i_d.di_size;
 132         if (vap->va_mask == XFS_AT_SIZE)
 133                 goto all_done;
 134
             /*
              * Stage 2: block count (on-disk plus delayed-allocation blocks,
              * run through XFS_FSB_TO_BB), inode number and link count.
              */
 135         vap->va_nblocks =
 136                 XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
 137         vap->va_nodeid = ip->i_ino;
 138 #if XFS_BIG_INUMS
 139         vap->va_nodeid += mp->m_inoadd;
 140 #endif
 141         vap->va_nlink = ip->i_d.di_nlink;
 142
 143         /*
 144          * Quick exit for non-stat callers
              *
              * NOTE(review): XFS_AT_BLKSIZE and XFS_AT_FSID pass this exit,
              * yet va_blocksize is only assigned further down and no fsid
              * field is assigned anywhere in this function -- confirm that
              * the caller fills those in.
 145          */
 146         if ((vap->va_mask &
 147             ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID|
 148               XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0)
 149                 goto all_done;
 150
 151         /*
 152          * Copy from in-core inode.
 153          */
 154         vap->va_mode = ip->i_d.di_mode;
 155         vap->va_uid = ip->i_d.di_uid;
 156         vap->va_gid = ip->i_d.di_gid;
 157         vap->va_xid = ip->i_d.di_xid;
 158         vap->va_projid = ip->i_d.di_projid;
 159
 160         /*
 161          * Check vnode type block/char vs. everything else.
              *
              * Block and character devices report the rdev stored in the
              * data fork and BLKDEV_IOSIZE as block size; every other type
              * reports rdev 0 and a block size chosen below.
 162          */
 163         switch (ip->i_d.di_mode & S_IFMT) {
 164         case S_IFBLK:
 165         case S_IFCHR:
 166                 vap->va_rdev = ip->i_df.if_u2.if_rdev;
 167                 vap->va_blocksize = BLKDEV_IOSIZE;
 168                 break;
 169         default:
 170                 vap->va_rdev = 0;
 171
 172                 if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
 173                         vap->va_blocksize = xfs_preferred_iosize(mp);
 174                 } else {
 175
 176                         /*
 177                          * If the file blocks are being allocated from a
 178                          * realtime partition, then return the inode's
 179                          * realtime extent size or the realtime volume's
 180                          * extent size.
                              *
                              * The inode's di_extsize wins when it is non-zero;
                              * sb_rextsize is the fallback.  Both are shifted
                              * left by sb_blocklog.
 181                          */
 182                         vap->va_blocksize = ip->i_d.di_extsize ?
 183                                 (ip->i_d.di_extsize << mp->m_sb.sb_blocklog) :
 184                                 (mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog);
 185                 }
 186                 break;
 187         }
 188
             /*
              * atime is obtained through vn_atime_to_timespec() on the vnode;
              * mtime and ctime are copied from the inode core.
              */
 189         vn_atime_to_timespec(vp, &vap->va_atime);
 190         vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
 191         vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
 192         vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
 193         vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
 194
 195         /*
 196          * Exit for stat callers.  See if any of the rest of the fields
 197          * to be filled in are needed.
 198          */
 199         if ((vap->va_mask &
 200              (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
 201               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
 202                 goto all_done;
 203
 204         /*
 205          * Convert di_flags to xflags.
 206          */
 207         vap->va_xflags = xfs_ip2xflags(ip);
 208
 209         /*
 210          * Exit for inode revalidate.  See if any of the rest of
 211          * the fields to be filled in are needed.
 212          */
 213         if ((vap->va_mask &
 214              (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
 215               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
 216                 goto all_done;
 217
             /*
              * Final stage: extent size hint, extent counts and generation.
              *
              * For each fork, when XFS_IFEXTENTS is set in the fork flags the
              * count is if_bytes / sizeof(xfs_bmbt_rec_t); otherwise the
              * di_nextents / di_anextents value from the inode core is used.
              * A missing attribute fork (i_afp == NULL) reports 0.
              *
              * NOTE(review): the shift by sb_blocklog presumably converts a
              * count of filesystem blocks into bytes -- confirm.
              */
 218         vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog;
 219         vap->va_nextents =
 220                 (ip->i_df.if_flags & XFS_IFEXTENTS) ?
 221                         ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) :
 222                         ip->i_d.di_nextents;
 223         if (ip->i_afp)
 224                 vap->va_anextents =
 225                         (ip->i_afp->if_flags & XFS_IFEXTENTS) ?
 226                                 ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) :
 227                                  ip->i_d.di_anextents;
 228         else
 229                 vap->va_anextents = 0;
 230         vap->va_gen = ip->i_d.di_gen;
 231
 232  all_done:
             /* Drop the lock only if it was taken on entry (same test). */
 233         if (!(flags & ATTR_LAZY))
 234                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
 235         return 0;
 236 }
 237
 238
 239 /*
 240  * xfs_setattr
 241  */
 242 int
 243 xfs_setattr(
 244         bhv_desc_t              *bdp,
 245         vattr_t                 *vap,
 246         int                     flags,
 247         cred_t                  *credp)
 248 {
 249         xfs_inode_t             *ip;
 250         xfs_trans_t             *tp;
 251         xfs_mount_t             *mp;
 252         int                     mask;
 253         int                     code;
 254         uint                    lock_flags;
 255         uint                    commit_flags=0;
 256         uid_t                   uid=0, iuid=0;
 257         gid_t                   gid=0, igid=0;
 258         xid_t                   xid=0, ixid=0;
 259         int                     timeflags = 0;
 260         vnode_t                 *vp;
 261         xfs_prid_t              projid=0, iprojid=0;
 262         int                     mandlock_before, mandlock_after;
 263         struct xfs_dquot        *udqp, *gdqp, *olddquot1, *olddquot2;
 264         int                     file_owner;
 265         int                     need_iolock = 1;
 266
 267         vp = BHV_TO_VNODE(bdp);
 268         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
 269
 270         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
 271                 return XFS_ERROR(EROFS);
 272
 273         /*
 274          * Cannot set certain attributes.
 275          */
 276         mask = vap->va_mask;
 277         if (mask & XFS_AT_NOSET) {
 278                 return XFS_ERROR(EINVAL);
 279         }
 280
 281         ip = XFS_BHVTOI(bdp);
 282         mp = ip->i_mount;
 283
 284         if (XFS_FORCED_SHUTDOWN(mp))
 285                 return XFS_ERROR(EIO);
 286
 287         /*
 288          * Timestamps do not need to be logged and hence do not
 289          * need to be done within a transaction.
 290          */
 291         if (mask & XFS_AT_UPDTIMES) {
 292                 ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
 293                 timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
 294                             ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
 295                             ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
 296                 xfs_ichgtime(ip, timeflags);
 297                 return 0;
 298         }
 299
 300         olddquot1 = olddquot2 = NULL;
 301         udqp = gdqp = NULL;
 302
 303         /*
 304          * If disk quotas is on, we make sure that the dquots do exist on disk,
 305          * before we start any other transactions. Trying to do this later
 306          * is messy. We don't care to take a readlock to look at the ids
 307          * in inode here, because we can't hold it across the trans_reserve.
 308          * If the IDs do change before we take the ilock, we're covered
 309          * because the i_*dquot fields will get updated anyway.
 310          */
 311         if (XFS_IS_QUOTA_ON(mp) &&
 312             (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) {
 313                 uint    qflags = 0;
 314
 315                 /* FIXME: handle xid? */
 316                 if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) {
 317                         uid = vap->va_uid;
 318                         qflags |= XFS_QMOPT_UQUOTA;
 319                 } else {
 320                         uid = ip->i_d.di_uid;
 321                 }
 322                 if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) {
 323                         gid = vap->va_gid;
 324                         qflags |= XFS_QMOPT_GQUOTA;
 325                 }  else {
 326                         gid = ip->i_d.di_gid;
 327                 }
 328                 if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) {
 329                         projid = vap->va_projid;
 330                         qflags |= XFS_QMOPT_PQUOTA;
 331                 }  else {
 332                         projid = ip->i_d.di_projid;
 333                 }
 334                 /*
 335                  * We take a reference when we initialize udqp and gdqp,
 336                  * so it is important that we never blindly double trip on
 337                  * the same variable. See xfs_create() for an example.
 338                  */
 339                 ASSERT(udqp == NULL);
 340                 ASSERT(gdqp == NULL);
 341                 code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags,
 342                                          &udqp, &gdqp);
 343                 if (code)
 344                         return code;
 345         }
 346
 347         /*
 348          * For the other attributes, we acquire the inode lock and
 349          * first do an error checking pass.
 350          */
 351         tp = NULL;
 352         lock_flags = XFS_ILOCK_EXCL;
 353         ASSERT(flags & ATTR_NOLOCK ? flags & ATTR_DMI : 1);
 354         if (flags & ATTR_NOLOCK)
 355                 need_iolock = 0;
 356         if (!(mask & XFS_AT_SIZE)) {
 357                 if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
 358                     (mp->m_flags & XFS_MOUNT_WSYNC)) {
 359                         tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
 360                         commit_flags = 0;
 361                         if ((code = xfs_trans_reserve(tp, 0,
 362                                                      XFS_ICHANGE_LOG_RES(mp), 0,
 363                                                      0, 0))) {
 364                                 lock_flags = 0;
 365                                 goto error_return;
 366                         }
 367                 }
 368         } else {
 369                 if (DM_EVENT_ENABLED (vp->v_vfsp, ip, DM_EVENT_TRUNCATE) &&
 370                     !(flags & ATTR_DMI)) {
 371                         int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
 372                         code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp,
 373                                 vap->va_size, 0, dmflags, NULL);
 374                         if (code) {
 375                                 lock_flags = 0;
 376                                 goto error_return;
 377                         }
 378                 }
 379                 if (need_iolock)
 380                         lock_flags |= XFS_IOLOCK_EXCL;
 381         }
 382
 383         xfs_ilock(ip, lock_flags);
 384
 385         /* boolean: are we the file owner? */
 386         file_owner = (current_fsuid(credp) == ip->i_d.di_uid);
 387
 388         /*
 389          * Change various properties of a file.
 390          * Only the owner or users with CAP_FOWNER
 391          * capability may do these things.
 392          */
 393         if (mask &
 394             (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID|
 395              XFS_AT_GID|XFS_AT_PROJID)) {
 396                 /* FIXME: handle xid? */
 397
 398                 /*
 399                  * CAP_FOWNER overrides the following restrictions:
 400                  *
 401                  * The user ID of the calling process must be equal
 402                  * to the file owner ID, except in cases where the
 403                  * CAP_FSETID capability is applicable.
 404                  */
 405                 if (!file_owner && !capable(CAP_FOWNER)) {
 406                         code = XFS_ERROR(EPERM);
 407                         goto error_return;
 408                 }
 409
 410                 /*
 411                  * CAP_FSETID overrides the following restrictions:
 412                  *
 413                  * The effective user ID of the calling process shall match
 414                  * the file owner when setting the set-user-ID and
 415                  * set-group-ID bits on that file.
 416                  *
 417                  * The effective group ID or one of the supplementary group
 418                  * IDs of the calling process shall match the group owner of
 419                  * the file when setting the set-group-ID bit on that file
 420                  */
 421                 if (mask & XFS_AT_MODE) {
 422                         mode_t m = 0;
 423
 424                         if ((vap->va_mode & S_ISUID) && !file_owner)
 425                                 m |= S_ISUID;
 426                         if ((vap->va_mode & S_ISGID) &&
 427                             !in_group_p((gid_t)ip->i_d.di_gid))
 428                                 m |= S_ISGID;
 429 #if 0
 430                         /* Linux allows this, Irix doesn't. */
 431                         if ((vap->va_mode & S_ISVTX) && !VN_ISDIR(vp))
 432                                 m |= S_ISVTX;
 433 #endif
 434                         if (m && !capable(CAP_FSETID))
 435                                 vap->va_mode &= ~m;
 436                 }
 437         }
 438
 439         /*
 440          * Change file ownership.  Must be the owner or privileged.
 441          * If the system was configured with the "restricted_chown"
 442          * option, the owner is not permitted to give away the file,
 443          * and can change the group id only to a group of which he
 444          * or she is a member.
 445          */
 446         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_XID|XFS_AT_PROJID)) {
 447                 /*
 448                  * These IDs could have changed since we last looked at them.
 449                  * But, we're assured that if the ownership did change
 450                  * while we didn't have the inode locked, inode's dquot(s)
 451                  * would have changed also.
 452                  */
 453                 iuid = ip->i_d.di_uid;
 454                 igid = ip->i_d.di_gid;
 455                 ixid = ip->i_d.di_xid;
 456                 iprojid = ip->i_d.di_projid;
 457                 uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;
 458                 gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
 459                 xid = (mask & XFS_AT_XID) ? vap->va_xid : ixid;
 460                 projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid :
 461                          iprojid;
 462
 463                 /*
 464                  * CAP_CHOWN overrides the following restrictions:
 465                  *
 466                  * If _POSIX_CHOWN_RESTRICTED is defined, this capability
 467                  * shall override the restriction that a process cannot
 468                  * change the user ID of a file it owns and the restriction
 469                  * that the group ID supplied to the chown() function
 470                  * shall be equal to either the group ID or one of the
 471                  * supplementary group IDs of the calling process.
 472                  */
 473                 if (restricted_chown &&
 474                     (iuid != uid || (igid != gid &&
 475                                      !in_group_p((gid_t)gid))) &&
 476                     !capable(CAP_CHOWN)) {
 477                         code = XFS_ERROR(EPERM);
 478                         goto error_return;
 479                 }
 480                 /*
 481                  * Do a quota reservation only if uid/projid/gid is actually
 482                  * going to change.
 483                  */
 484                 if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
 485                     (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) ||
 486                     (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
 487                         /* FIXME: handle xid? */
 488                         ASSERT(tp);
 489                         code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
 490                                                 capable(CAP_FOWNER) ?
 491                                                 XFS_QMOPT_FORCE_RES : 0);
 492                         if (code)       /* out of quota */
 493                                 goto error_return;
 494                 }
 495         }
 496
 497         /*
 498          * Truncate file.  Must have write permission and not be a directory.
 499          */
 500         if (mask & XFS_AT_SIZE) {
 501                 /* Short circuit the truncate case for zero length files */
 502                 if ((vap->va_size == 0) &&
 503                    (ip->i_d.di_size == 0) && (ip->i_d.di_nextents == 0)) {
 504                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 505                         lock_flags &= ~XFS_ILOCK_EXCL;
 506                         if (mask & XFS_AT_CTIME)
 507                                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 508                         code = 0;
 509                         goto error_return;
 510                 }
 511
 512                 if (VN_ISDIR(vp)) {
 513                         code = XFS_ERROR(EISDIR);
 514                         goto error_return;
 515                 } else if (!VN_ISREG(vp)) {
 516                         code = XFS_ERROR(EINVAL);
 517                         goto error_return;
 518                 }
 519                 /*
 520                  * Make sure that the dquots are attached to the inode.
 521                  */
 522                 if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
 523                         goto error_return;
 524         }
 525
 526         /*
 527          * Change file access or modified times.
 528          */
 529         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
 530                 if (!file_owner) {
 531                         if ((flags & ATTR_UTIME) &&
 532                             !capable(CAP_FOWNER)) {
 533                                 code = XFS_ERROR(EPERM);
 534                                 goto error_return;
 535                         }
 536                 }
 537         }
 538
 539         /*
 540          * Change extent size or realtime flag.
 541          */
 542         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
 543                 /*
 544                  * Can't change extent size if any extents are allocated.
 545                  */
 546                 if (ip->i_d.di_nextents && (mask & XFS_AT_EXTSIZE) &&
 547                     ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
 548                      vap->va_extsize) ) {
 549                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
 550                         goto error_return;
 551                 }
 552
 553                 /*
 554                  * Can't change realtime flag if any extents are allocated.
 555                  */
 556                 if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
 557                     (mask & XFS_AT_XFLAGS) &&
 558                     (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
 559                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
 560                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
 561                         goto error_return;
 562                 }
 563                 /*
 564                  * Extent size must be a multiple of the appropriate block
 565                  * size, if set at all.
 566                  */
 567                 if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) {
 568                         xfs_extlen_t    size;
 569
 570                         if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
 571                             ((mask & XFS_AT_XFLAGS) &&
 572                             (vap->va_xflags & XFS_XFLAG_REALTIME))) {
 573                                 size = mp->m_sb.sb_rextsize <<
 574                                        mp->m_sb.sb_blocklog;
 575                         } else {
 576                                 size = mp->m_sb.sb_blocksize;
 577                         }
 578                         if (vap->va_extsize % size) {
 579                                 code = XFS_ERROR(EINVAL);
 580                                 goto error_return;
 581                         }
 582                 }
 583                 /*
 584                  * If realtime flag is set then must have realtime data.
 585                  */
 586                 if ((mask & XFS_AT_XFLAGS) &&
 587                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
 588                         if ((mp->m_sb.sb_rblocks == 0) ||
 589                             (mp->m_sb.sb_rextsize == 0) ||
 590                             (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
 591                                 code = XFS_ERROR(EINVAL);
 592                                 goto error_return;
 593                         }
 594                 }
 595
 596                 /*
 597                  * Can't modify an immutable/append-only file unless
 598                  * we have appropriate permission.
 599                  */
 600                 if ((mask & XFS_AT_XFLAGS) &&
 601                     (ip->i_d.di_flags &
 602                                 (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
 603                      (vap->va_xflags &
 604                                 (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
 605                     !capable(CAP_LINUX_IMMUTABLE)) {
 606                         code = XFS_ERROR(EPERM);
 607                         goto error_return;
 608                 }
 609         }
 610
 611         /*
 612          * Now we can make the changes.  Before we join the inode
 613          * to the transaction, if XFS_AT_SIZE is set then take care of
 614          * the part of the truncation that must be done without the
 615          * inode lock.  This needs to be done before joining the inode
 616          * to the transaction, because the inode cannot be unlocked
 617          * once it is a part of the transaction.
 618          */
 619         if (mask & XFS_AT_SIZE) {
 620                 code = 0;
 621                 if ((vap->va_size > ip->i_d.di_size) &&
 622                     (flags & ATTR_NOSIZETOK) == 0) {
 623                         code = xfs_igrow_start(ip, vap->va_size, credp);
 624                 }
 625                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
 626                 vn_iowait(vp); /* wait for the completion of any pending DIOs */
 627                 if (!code)
 628                         code = xfs_itruncate_data(ip, vap->va_size);
 629                 if (code) {
 630                         ASSERT(tp == NULL);
 631                         lock_flags &= ~XFS_ILOCK_EXCL;
 632                         ASSERT(lock_flags == XFS_IOLOCK_EXCL);
 633                         goto error_return;
 634                 }
 635                 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
 636                 if ((code = xfs_trans_reserve(tp, 0,
 637                                              XFS_ITRUNCATE_LOG_RES(mp), 0,
 638                                              XFS_TRANS_PERM_LOG_RES,
 639                                              XFS_ITRUNCATE_LOG_COUNT))) {
 640                         xfs_trans_cancel(tp, 0);
 641                         if (need_iolock)
 642                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 643                         return code;
 644                 }
 645                 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
 646                 xfs_ilock(ip, XFS_ILOCK_EXCL);
 647         }
 648
 649         if (tp) {
 650                 xfs_trans_ijoin(tp, ip, lock_flags);
 651                 xfs_trans_ihold(tp, ip);
 652         }
 653
 654         /* determine whether mandatory locking mode changes */
 655         mandlock_before = MANDLOCK(vp, ip->i_d.di_mode);
 656
 657         /*
 658          * Truncate file.  Must have write permission and not be a directory.
 659          */
 660         if (mask & XFS_AT_SIZE) {
 661                 if (vap->va_size > ip->i_d.di_size) {
 662                         xfs_igrow_finish(tp, ip, vap->va_size,
 663                             !(flags & ATTR_DMI));
 664                 } else if ((vap->va_size <= ip->i_d.di_size) ||
 665                            ((vap->va_size == 0) && ip->i_d.di_nextents)) {
 666                         /*
 667                          * signal a sync transaction unless
 668                          * we're truncating an already unlinked
 669                          * file on a wsync filesystem
 670                          */
 671                         code = xfs_itruncate_finish(&tp, ip,
 672                                             (xfs_fsize_t)vap->va_size,
 673                                             XFS_DATA_FORK,
 674                                             ((ip->i_d.di_nlink != 0 ||
 675                                               !(mp->m_flags & XFS_MOUNT_WSYNC))
 676                                              ? 1 : 0));
 677                         if (code) {
 678                                 goto abort_return;
 679                         }
 680                 }
 681                 /*
 682                  * Have to do this even if the file's size doesn't change.
 683                  */
 684                 timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
 685         }
 686
 687         /*
 688          * Change file access modes.
 689          */
 690         if (mask & XFS_AT_MODE) {
 691                 ip->i_d.di_mode &= S_IFMT;
 692                 ip->i_d.di_mode |= vap->va_mode & ~S_IFMT;
 693
 694                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 695                 timeflags |= XFS_ICHGTIME_CHG;
 696         }
 697
 698         /*
 699          * Change file ownership.  Must be the owner or privileged.
 700          * If the system was configured with the "restricted_chown"
 701          * option, the owner is not permitted to give away the file,
 702          * and can change the group id only to a group of which he
 703          * or she is a member.
 704          */
 705         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_XID|XFS_AT_PROJID)) {
 706                 /*
 707                  * CAP_FSETID overrides the following restrictions:
 708                  *
 709                  * The set-user-ID and set-group-ID bits of a file will be
 710                  * cleared upon successful return from chown()
 711                  */
 712                 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
 713                     !capable(CAP_FSETID)) {
 714                         ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
 715                 }
 716
 717                 /*
 718                  * Change the ownerships and register quota modifications
 719                  * in the transaction.
 720                  */
 721                 if (ixid != xid) {
 722                         if (XFS_IS_GQUOTA_ON(mp)) {
 723                                 /* FIXME: handle xid quota? */
 724                         }
 725                         ip->i_d.di_xid = xid;
 726                 }
 727                 if (iuid != uid) {
 728                         if (XFS_IS_UQUOTA_ON(mp)) {
 729                                 ASSERT(mask & XFS_AT_UID);
 730                                 ASSERT(udqp);
 731                                 olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 732                                                         &ip->i_udquot, udqp);
 733                         }
 734                         ip->i_d.di_uid = uid;
 735                 }
 736                 if (igid != gid) {
 737                         if (XFS_IS_GQUOTA_ON(mp)) {
 738                                 ASSERT(!XFS_IS_PQUOTA_ON(mp));
 739                                 ASSERT(mask & XFS_AT_GID);
 740                                 ASSERT(gdqp);
 741                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 742                                                         &ip->i_gdquot, gdqp);
 743                         }
 744                         ip->i_d.di_gid = gid;
 745                 }
 746                 if (iprojid != projid) {
 747                         if (XFS_IS_PQUOTA_ON(mp)) {
 748                                 ASSERT(!XFS_IS_GQUOTA_ON(mp));
 749                                 ASSERT(mask & XFS_AT_PROJID);
 750                                 ASSERT(gdqp);
 751                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 752                                                         &ip->i_gdquot, gdqp);
 753                         }
 754                         ip->i_d.di_projid = projid;
 755                         /*
 756                          * We may have to rev the inode as well as
 757                          * the superblock version number since projids didn't
 758                          * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
 759                          */
 760                         if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
 761                                 xfs_bump_ino_vers2(tp, ip);
 762                 }
 763
 764                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 765                 timeflags |= XFS_ICHGTIME_CHG;
 766         }
 767
 768
 769         /*
 770          * Change file access or modified times.
 771          */
 772         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
 773                 if (mask & XFS_AT_ATIME) {
 774                         ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
 775                         ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
 776                         ip->i_update_core = 1;
 777                         timeflags &= ~XFS_ICHGTIME_ACC;
 778                 }
 779                 if (mask & XFS_AT_MTIME) {
 780                         ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
 781                         ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
 782                         timeflags &= ~XFS_ICHGTIME_MOD;
 783                         timeflags |= XFS_ICHGTIME_CHG;
 784                 }
 785                 if (tp && (flags & ATTR_UTIME))
 786                         xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 787         }
 788
 789         /*
 790          * Change XFS-added attributes.
 791          */
 792         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
 793                 if (mask & XFS_AT_EXTSIZE) {
 794                         /*
 795                          * Converting bytes to fs blocks.
 796                          */
 797                         ip->i_d.di_extsize = vap->va_extsize >>
 798                                 mp->m_sb.sb_blocklog;
 799                 }
 800                 if (mask & XFS_AT_XFLAGS) {
 801                         uint    di_flags;
 802
 803                         /* can't set PREALLOC this way, just preserve it */
 804                         di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
 805                         if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
 806                                 di_flags |= XFS_DIFLAG_IMMUTABLE;
 807                         if (vap->va_xflags & XFS_XFLAG_IUNLINK)
 808                                 di_flags |= XFS_DIFLAG_IUNLINK;
 809                         if (vap->va_xflags & XFS_XFLAG_BARRIER)
 810                                 di_flags |= XFS_DIFLAG_BARRIER;
 811                         if (vap->va_xflags & XFS_XFLAG_APPEND)
 812                                 di_flags |= XFS_DIFLAG_APPEND;
 813                         if (vap->va_xflags & XFS_XFLAG_SYNC)
 814                                 di_flags |= XFS_DIFLAG_SYNC;
 815                         if (vap->va_xflags & XFS_XFLAG_NOATIME)
 816                                 di_flags |= XFS_DIFLAG_NOATIME;
 817                         if (vap->va_xflags & XFS_XFLAG_NODUMP)
 818                                 di_flags |= XFS_DIFLAG_NODUMP;
 819                         if (vap->va_xflags & XFS_XFLAG_PROJINHERIT)
 820                                 di_flags |= XFS_DIFLAG_PROJINHERIT;
 821                         if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
 822                                 if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
 823                                         di_flags |= XFS_DIFLAG_RTINHERIT;
 824                                 if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS)
 825                                         di_flags |= XFS_DIFLAG_NOSYMLINKS;
 826                                 if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT)
 827                                         di_flags |= XFS_DIFLAG_EXTSZINHERIT;
 828                         } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
 829                                 if (vap->va_xflags & XFS_XFLAG_REALTIME) {
 830                                         di_flags |= XFS_DIFLAG_REALTIME;
 831                                         ip->i_iocore.io_flags |= XFS_IOCORE_RT;
 832                                 } else {
 833                                         ip->i_iocore.io_flags &= ~XFS_IOCORE_RT;
 834                                 }
 835                                 if (vap->va_xflags & XFS_XFLAG_EXTSIZE)
 836                                         di_flags |= XFS_DIFLAG_EXTSIZE;
 837                         }
 838                         ip->i_d.di_flags = di_flags;
 839                 }
 840                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 841                 timeflags |= XFS_ICHGTIME_CHG;
 842         }
 843
 844         /*
 845          * Change file inode change time only if XFS_AT_CTIME set
 846          * AND we have been called by a DMI function.
 847          */
 848
 849         if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) {
 850                 ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
 851                 ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
 852                 ip->i_update_core = 1;
 853                 timeflags &= ~XFS_ICHGTIME_CHG;
 854         }
 855
 856         /*
 857          * Send out timestamp changes that need to be set to the
 858          * current time.  Not done when called by a DMI function.
 859          */
 860         if (timeflags && !(flags & ATTR_DMI))
 861                 xfs_ichgtime(ip, timeflags);
 862
 863         XFS_STATS_INC(xs_ig_attrchg);
 864
 865         /*
 866          * If this is a synchronous mount, make sure that the
 867          * transaction goes to disk before returning to the user.
 868          * This is slightly sub-optimal in that truncates require
 869          * two sync transactions instead of one for wsync filesystems.
 870          * One for the truncate and one for the timestamps since we
 871          * don't want to change the timestamps unless we're sure the
 872          * truncate worked.  Truncates are less than 1% of the laddis
 873          * mix so this probably isn't worth the trouble to optimize.
 874          */
 875         code = 0;
 876         if (tp) {
 877                 if (mp->m_flags & XFS_MOUNT_WSYNC)
 878                         xfs_trans_set_sync(tp);
 879
 880                 code = xfs_trans_commit(tp, commit_flags, NULL);
 881         }
 882
 883         /*
 884          * If the (regular) file's mandatory locking mode changed, then
 885          * notify the vnode.  We do this under the inode lock to prevent
 886          * racing calls to vop_vnode_change.
 887          */
 888         mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);
 889         if (mandlock_before != mandlock_after) {
 890                 VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_ENF_LOCKING,
 891                                  mandlock_after);
 892         }
 893
 894         xfs_iunlock(ip, lock_flags);
 895
 896         /*
 897          * Release any dquot(s) the inode had kept before chown.
 898          */
 899         XFS_QM_DQRELE(mp, olddquot1);
 900         XFS_QM_DQRELE(mp, olddquot2);
 901         XFS_QM_DQRELE(mp, udqp);
 902         XFS_QM_DQRELE(mp, gdqp);
 903
 904         if (code) {
 905                 return code;
 906         }
 907
 908         if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_ATTRIBUTE) &&
 909             !(flags & ATTR_DMI)) {
 910                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL,
 911                                         NULL, DM_RIGHT_NULL, NULL, NULL,
 912                                         0, 0, AT_DELAY_FLAG(flags));
 913         }
 914         return 0;
 915
 916  abort_return:
 917         commit_flags |= XFS_TRANS_ABORT;
 918         /* FALLTHROUGH */
 919  error_return:
 920         XFS_QM_DQRELE(mp, udqp);
 921         XFS_QM_DQRELE(mp, gdqp);
 922         if (tp) {
 923                 xfs_trans_cancel(tp, commit_flags);
 924         }
 925         if (lock_flags != 0) {
 926                 xfs_iunlock(ip, lock_flags);
 927         }
 928         return code;
 929 }
 930
 931
 932 /*
 933  * xfs_access
 934  * Ask xfs_iaccess() about the untranslated mode bits, ilock held shared.
 935  */
 936 STATIC int
 937 xfs_access(
 938         bhv_desc_t      *bdp,
 939         int             mode,
 940         cred_t          *credp)
 941 {
 942         xfs_inode_t     *inode;
 943         int             rval;
 944
 945         vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
 946                        (inst_t *)__return_address);
 947
 948         inode = XFS_BHVTOI(bdp);
 949         xfs_ilock(inode, XFS_ILOCK_SHARED);
 950         rval = xfs_iaccess(inode, mode, credp);
 951         xfs_iunlock(inode, XFS_ILOCK_SHARED);
 952         return rval;
 953 }
 954
 955
 956 /*
 957  * xfs_readlink
 958  *
      * Copy the target path of a symbolic link into the caller's uio.
      *
      * An inline link is copied straight out of the in-core data fork.
      * Otherwise the link's extents are looked up with xfs_bmapi() (at
      * most SYMLINK_MAPS of them), and each one is read with
      * xfs_buf_read() and copied out; the running pathlen count keeps
      * the total handed to uio_read() at or below di_size bytes.
      * The inode lock is held shared from the shutdown check onwards.
      *
      * ioflags and credp are not referenced.
      *
      * Returns XFS_ERROR(EIO) after a forced shutdown, XFS_ERROR(EINVAL)
      * for a negative uio offset, 0 without copying anything when the
      * uio has no room, and otherwise the first error reported by
      * xfs_bmapi(), the buffer read or uio_read() (0 on success).
 959  */
 960 STATIC int
 961 xfs_readlink(
 962         bhv_desc_t      *bdp,
 963         uio_t           *uiop,
 964         int             ioflags,
 965         cred_t          *credp)
 966 {
 967         xfs_inode_t     *ip;
 968         int             count;
 969         xfs_off_t       offset;
 970         int             pathlen;
 971         vnode_t         *vp;
 972         int             error = 0;
 973         xfs_mount_t     *mp;
 974         int             nmaps;
 975         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
 976         xfs_daddr_t     d;
 977         int             byte_cnt;
 978         int             n;
 979         xfs_buf_t       *bp;
 980
 981         vp = BHV_TO_VNODE(bdp);
 982         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
 983
 984         ip = XFS_BHVTOI(bdp);
 985         mp = ip->i_mount;
 986
 987         if (XFS_FORCED_SHUTDOWN(mp))
 988                 return XFS_ERROR(EIO);
 989
 990         xfs_ilock(ip, XFS_ILOCK_SHARED);
 991
 992         ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);
 993
 994         offset = uiop->uio_offset;
 995         count = uiop->uio_resid;
 996
 997         if (offset < 0) {
 998                 error = XFS_ERROR(EINVAL);
 999                 goto error_return;
1000         }
1001         if (count <= 0) {
1002                 error = 0;
1003                 goto error_return;
1004         }
1005
1006         /*
1007          * See if the symlink is stored inline.
1008          */
1009         pathlen = (int)ip->i_d.di_size;
1010
1011         if (ip->i_df.if_flags & XFS_IFINLINE) {
1012                 error = uio_read(ip->i_df.if_u1.if_data, pathlen, uiop);
1013         }
1014         else {
1015                 /*
1016                  * Symlink not inline.  Call bmap to get it in.
1017                  */
1018                 nmaps = SYMLINK_MAPS;
1019
1020                 error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen),
1021                                   0, NULL, 0, mval, &nmaps, NULL);
1022
1023                 if (error) {
1024                         goto error_return;
1025                 }
1026
1027                 for (n = 0; n < nmaps; n++) {
1028                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
1029                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
1030                         bp = xfs_buf_read(mp->m_ddev_targp, d,
1031                                       BTOBB(byte_cnt), 0);
1032                         error = XFS_BUF_GETERROR(bp);
1033                         if (error) {
1034                                 xfs_ioerror_alert("xfs_readlink",
1035                                           ip->i_mount, bp, XFS_BUF_ADDR(bp));
1036                                 xfs_buf_relse(bp);
1037                                 goto error_return;
1038                         }
                             /* Never copy out more than the link length. */
1039                         if (pathlen < byte_cnt)
1040                                 byte_cnt = pathlen;
1041                         pathlen -= byte_cnt;
1042
1043                         error = uio_read(XFS_BUF_PTR(bp), byte_cnt, uiop);
1044                         xfs_buf_relse (bp);

                             /*
                              * Stop at the first failed copy.  Carrying on
                              * would let the next pass through the loop
                              * overwrite error, so the failure could be
                              * returned to the caller as success.
                              */
                             if (error)
                                     break;
1045                 }
1046
1047         }
1048
1049 error_return:
1050         xfs_iunlock(ip, XFS_ILOCK_SHARED);
1051         return error;
1052 }
1053
1054
1055 /*
1056  * xfs_fsync
1057  *
1058  * This is called to sync the inode and its data out to disk.
1059  * We need to hold the I/O lock while flushing the data, and
1060  * the inode lock while flushing the inode.  The inode lock CANNOT
1061  * be held while flushing the data, so acquire after we're done
1062  * with that.
      *
      * NOTE(review): the body below only ever takes the inode lock and
      * does not flush file data itself; presumably the data flush is
      * done by the caller -- confirm.
      *
      * flag:         FSYNC_DATA selects which dirty indicator is tested
      *               (i_update_size if set, i_update_core otherwise);
      *               FSYNC_WAIT makes the log force or the transaction
      *               synchronous.
      * start, stop:  only sanity-checked by the ASSERT.
      * credp:        not referenced.
      *
      * Returns XFS_ERROR(EIO) if the filesystem has been shut down, the
      * error from xfs_trans_reserve() or _xfs_trans_commit() if either
      * fails, and 0 otherwise.
1063  */
1064 STATIC int
1065 xfs_fsync(
1066         bhv_desc_t      *bdp,
1067         int             flag,
1068         cred_t          *credp,
1069         xfs_off_t       start,
1070         xfs_off_t       stop)
1071 {
1072         xfs_inode_t     *ip;
1073         xfs_trans_t     *tp;
1074         int             error;
             /*
              * log_flushed is filled in by the log force / commit below;
              * changed is cleared only when nothing needed to go to disk.
              */
1075         int             log_flushed = 0, changed = 1;
1076
1077         vn_trace_entry(BHV_TO_VNODE(bdp),
1078                         __FUNCTION__, (inst_t *)__return_address);
1079
1080         ip = XFS_BHVTOI(bdp);
1081
1082         ASSERT(start >= 0 && stop >= -1);
1083
1084         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
1085                 return XFS_ERROR(EIO);
1086
1087         /*
1088          * We always need to make sure that the required inode state
1089          * is safe on disk.  The vnode might be clean but we still
              * might need to force the log because
1090          * of committed transactions that haven't hit the disk yet.
1091          * Likewise, there could be unflushed non-transactional
1092          * changes to the inode core that have to go to disk.
1093          *
1094          * The following code depends on one assumption:  that
1095          * any transaction that changes an inode logs the core
1096          * because it has to change some field in the inode core
1097          * (typically nextents or nblocks).  That assumption
1098          * implies that any transactions against an inode will
1099          * catch any non-transactional updates.  If inode-altering
1100          * transactions exist that violate this assumption, the
1101          * code breaks.  Right now, it figures that if the involved
1102          * update_* field is clear and the inode is unpinned, the
1103          * inode is clean.  Either it's been flushed or it's been
1104          * committed and the commit has hit the disk unpinning the inode.
1105          * (Note that xfs_inode_item_format() called at commit clears
1106          * the update_* fields.)
1107          */
1108         xfs_ilock(ip, XFS_ILOCK_SHARED);
1109
1110         /* If we are flushing data then we care about update_size
1111          * being set, otherwise we care about update_core
1112          */
1113         if ((flag & FSYNC_DATA) ?
1114                         (ip->i_update_size == 0) :
1115                         (ip->i_update_core == 0)) {
1116                 /*
1117                  * Timestamps/size haven't changed since last inode
1118                  * flush or inode transaction commit.  That means
1119                  * either nothing got written or a transaction
1120                  * committed which caught the updates.  If the
1121                  * latter happened and the transaction hasn't
1122                  * hit the disk yet, the inode will still
1123                  * be pinned.  If it is, force the log.
1124                  */
1125
1126                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1127
1128                 if (xfs_ipincount(ip)) {
1129                         _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
1130                                       XFS_LOG_FORCE |
1131                                       ((flag & FSYNC_WAIT)
1132                                        ? XFS_LOG_SYNC : 0),
1133                                       &log_flushed);
1134                 } else {
1135                         /*
1136                          * If the inode is not pinned and nothing
1137                          * has changed we don't need to flush the
1138                          * cache.
1139                          */
1140                         changed = 0;
1141                 }
1142                 error = 0;
1143         } else  {
1144                 /*
1145                  * Kick off a transaction to log the inode
1146                  * core to get the updates.  Make it
1147                  * sync if FSYNC_WAIT is passed in (which
1148                  * is done by everybody but specfs).  The
1149                  * sync transaction will also force the log.
1150                  */
1151                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1152                 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
1153                 if ((error = xfs_trans_reserve(tp, 0,
1154                                 XFS_FSYNC_TS_LOG_RES(ip->i_mount),
1155                                 0, 0, 0)))  {
1156                         xfs_trans_cancel(tp, 0);
1157                         return error;
1158                 }
1159                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1160
1161                 /*
1162                  * Note - it's possible that we might have pushed
1163                  * ourselves out of the way during trans_reserve
1164                  * which would flush the inode.  But there's no
1165                  * guarantee that the inode buffer has actually
1166                  * gone out yet (it's delwri).  Plus the buffer
1167                  * could be pinned anyway if it's part of an
1168                  * inode in another recent transaction.  So we
1169                  * play it safe and fire off the transaction anyway.
1170                  */
1171                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1172                 xfs_trans_ihold(tp, ip);
1173                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1174                 if (flag & FSYNC_WAIT)
1175                         xfs_trans_set_sync(tp);
1176                 error = _xfs_trans_commit(tp, 0, NULL, &log_flushed);
1177
                     /* The inode was held above, so it is unlocked by hand. */
1178                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1179         }
1180
1181         if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
1182                 /*
1183                  * If the log write didn't issue an ordered tag we need
1184                  * to flush the disk cache for the data device now.
1185                  */
1186                 if (!log_flushed)
1187                         xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
1188
1189                 /*
1190                  * If this inode is on the RT dev we need to flush that
1191                  * cache as well.
1192                  */
1193                 if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
1194                         xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
1195         }
1196
1197         return error;
1198 }
1199
1200 /*
1201  * This is called by xfs_inactive to free any blocks beyond eof,
1202  * when the link count isn't zero.
      *
      * The range from the block containing EOF up to XFS_MAXIOFFSET is
      * looked up under a shared inode lock.  If the first mapping is not
      * a hole, or the inode has delayed-allocation blocks, the file is
      * truncated to its current size in an XFS_TRANS_INACTIVE
      * transaction.
      *
      * Returns 0 if there is nothing to free; otherwise the error (if
      * any) from xfs_bmapi(), XFS_QM_DQATTACH(), xfs_trans_reserve(),
      * xfs_itruncate_finish() or xfs_trans_commit().  No inode locks
      * are held on return.
1203  */
1204 STATIC int
1205 xfs_inactive_free_eofblocks(
1206         xfs_mount_t     *mp,
1207         xfs_inode_t     *ip)
1208 {
1209         xfs_trans_t     *tp;
1210         int             error;
1211         xfs_fileoff_t   end_fsb;
1212         xfs_fileoff_t   last_fsb;
1213         xfs_filblks_t   map_len;
1214         int             nimaps;
1215         xfs_bmbt_irec_t imap;
1216
1217         /*
1218          * Figure out if there are any blocks beyond the end
1219          * of the file.  If not, then there is nothing to do.
              *
              * Compare before subtracting.  The file-block types are
              * believed to be unsigned (see xfs_types.h), so if end_fsb
              * were ever past last_fsb the difference would wrap to a
              * huge mapping length instead of going negative.
1220          */
1221         end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_d.di_size));
1222         last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1223         if (last_fsb <= end_fsb)
1224                 return 0;
1225         map_len = last_fsb - end_fsb;
1226
1227         nimaps = 1;
1228         xfs_ilock(ip, XFS_ILOCK_SHARED);
1229         error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
1230                           NULL, 0, &imap, &nimaps, NULL);
1231         xfs_iunlock(ip, XFS_ILOCK_SHARED);
1232
1233         if (!error && (nimaps != 0) &&
1234             (imap.br_startblock != HOLESTARTBLOCK ||
1235              ip->i_delayed_blks)) {
1236                 /*
1237                  * Attach the dquots to the inode up front.
1238                  */
1239                 if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1240                         return error;
1241
1242                 /*
1243                  * There are blocks after the end of file.
1244                  * Free them up now by truncating the file to
1245                  * its current size.
1246                  */
1247                 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1248
1249                 /*
1250                  * Do the xfs_itruncate_start() call before
1251                  * reserving any log space because
1252                  * itruncate_start will call into the buffer
1253                  * cache and we can't
1254                  * do that within a transaction.
1255                  */
1256                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1257                 xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
1258                                     ip->i_d.di_size);
1259
1260                 error = xfs_trans_reserve(tp, 0,
1261                                           XFS_ITRUNCATE_LOG_RES(mp),
1262                                           0, XFS_TRANS_PERM_LOG_RES,
1263                                           XFS_ITRUNCATE_LOG_COUNT);
1264                 if (error) {
1265                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1266                         xfs_trans_cancel(tp, 0);
1267                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1268                         return error;
1269                 }
1270
1271                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1272                 xfs_trans_ijoin(tp, ip,
1273                                 XFS_IOLOCK_EXCL |
1274                                 XFS_ILOCK_EXCL);
1275                 xfs_trans_ihold(tp, ip);
1276
1277                 error = xfs_itruncate_finish(&tp, ip,
1278                                              ip->i_d.di_size,
1279                                              XFS_DATA_FORK,
1280                                              0);
1281                 /*
1282                  * If we get an error at this point we
1283                  * simply don't bother truncating the file.
1284                  */
1285                 if (error) {
1286                         xfs_trans_cancel(tp,
1287                                          (XFS_TRANS_RELEASE_LOG_RES |
1288                                           XFS_TRANS_ABORT));
1289                 } else {
1290                         error = xfs_trans_commit(tp,
1291                                                 XFS_TRANS_RELEASE_LOG_RES,
1292                                                 NULL);
1293                 }
                     /* The inode was held, so both locks are dropped by hand. */
1294                 xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1295         }
1296         return error;
1297 }
1298
1299 /*
1300  * Free a symlink that has blocks associated with it.
      *
      * Reserves itruncate log space on *tpp, zeroes the on-disk size,
      * invalidates and unmaps the link's blocks, and commits the
      * resulting transactions.
      *
      * On success returns 0 with *tpp pointing at a fresh transaction
      * that already carries an itruncate reservation; the inode is
      * still locked (XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL) but is not
      * joined to that transaction.
      *
      * On failure returns the error with *tpp set to NULL and the
      * transaction cancelled.  If the very first reservation fails the
      * inode was never locked here; on every later failure both locks
      * are dropped before returning.
1301  */
1302 STATIC int
1303 xfs_inactive_symlink_rmt(
1304         xfs_inode_t     *ip,
1305         xfs_trans_t     **tpp)
1306 {
1307         xfs_buf_t       *bp;
1308         int             committed;
1309         int             done;
1310         int             error;
1311         xfs_fsblock_t   first_block;
1312         xfs_bmap_free_t free_list;
1313         int             i;
1314         xfs_mount_t     *mp;
1315         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
1316         int             nmaps;
1317         xfs_trans_t     *ntp;
1318         int             size;
1319         xfs_trans_t     *tp;
1320
1321         tp = *tpp;
1322         mp = ip->i_mount;
1323         ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
1324         /*
1325          * We're freeing a symlink that has some
1326          * blocks allocated to it.  Free the
1327          * blocks here.  We know that we've got
1328          * either 1 or 2 extents and that we can
1329          * free them all in one bunmapi call.
1330          */
1331         ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
1332         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1333                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1334                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1335                 xfs_trans_cancel(tp, 0);
1336                 *tpp = NULL;
1337                 return error;
1338         }
1339         /*
1340          * Lock the inode, fix the size, and join it to the transaction.
1341          * Hold it so in the normal path, we still have it locked for
1342          * the second transaction.  In the error paths we need it
1343          * held so the cancel won't rele it, see below.
1344          */
1345         xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
             /* Remember the old length; it sizes the lookups below. */
1346         size = (int)ip->i_d.di_size;
1347         ip->i_d.di_size = 0;
1348         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1349         xfs_trans_ihold(tp, ip);
1350         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1351         /*
1352          * Find the block(s) so we can inval and unmap them.
1353          */
1354         done = 0;
1355         XFS_BMAP_INIT(&free_list, &first_block);
1356         nmaps = ARRAY_SIZE(mval);
1357         if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
1358                         XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
1359                         &free_list)))
1360                 goto error0;
1361         /*
1362          * Invalidate the block(s).
1363          */
1364         for (i = 0; i < nmaps; i++) {
1365                 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
1366                         XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
1367                         XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
1368                 xfs_trans_binval(tp, bp);
1369         }
1370         /*
1371          * Unmap the dead block(s) to the free_list.
              *
              * NOTE(review): the length passed here is the byte size of
              * the link, whereas the xfs_bmapi() call above converts it
              * with XFS_B_TO_FSB().  Presumably this just over-covers
              * the range -- confirm.
1372          */
1373         if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
1374                         &first_block, &free_list, &done)))
1375                 goto error1;
1376         ASSERT(done);
1377         /*
1378          * Commit the first transaction.  This logs the EFI and the inode.
1379          */
1380         if ((error = xfs_bmap_finish(&tp, &free_list, first_block, &committed)))
1381                 goto error1;
1382         /*
1383          * The transaction must have been committed, since there were
1384          * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
1385          * The new tp has the extent freeing and EFDs.
1386          */
1387         ASSERT(committed);
1388         /*
1389          * The first xact was committed, so add the inode to the new one.
1390          * Mark it dirty so it will be logged and moved forward in the log as
1391          * part of every commit.
1392          */
1393         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1394         xfs_trans_ihold(tp, ip);
1395         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1396         /*
1397          * Get a new, empty transaction to return to our caller.
1398          */
1399         ntp = xfs_trans_dup(tp);
1400         /*
1401          * Commit the transaction containing extent freeing and EFDs.
1402          * If we get an error on the commit here or on the reserve below,
1403          * we need to unlock the inode since the new transaction doesn't
1404          * have the inode attached.
1405          */
1406         error = xfs_trans_commit(tp, 0, NULL);
             /* From here on tp is the duplicate, on success and error alike. */
1407         tp = ntp;
1408         if (error) {
1409                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1410                 goto error0;
1411         }
1412         /*
1413          * Remove the memory for extent descriptions (just bookkeeping).
1414          */
1415         if (ip->i_df.if_bytes)
1416                 xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
1417         ASSERT(ip->i_df.if_bytes == 0);
1418         /*
1419          * Put an itruncate log reservation in the new transaction
1420          * for our caller.
1421          */
1422         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1423                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1424                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1425                 goto error0;
1426         }
1427         /*
1428          * Return with the inode locked but not joined to the transaction.
1429          */
1430         *tpp = tp;
1431         return 0;
1432
             /* error1: as error0, but the pending free list is dropped first. */
1433  error1:
1434         xfs_bmap_cancel(&free_list);
1435  error0:
1436         /*
1437          * Have to come here with the inode locked and either
1438          * (held and in the transaction) or (not in the transaction).
1439          * If the inode isn't held then cancel would iput it, but
1440          * that's wrong since this is inactive and the vnode ref
1441          * count is 0 already.
1442          * Cancel won't do anything to the inode if held, but it still
1443          * needs to be locked until the cancel is done, if it was
1444          * joined to the transaction.
1445          */
1446         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1447         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1448         *tpp = NULL;
1449         return error;
1450
1451 }
1452
     /*
      * Free the in-core data of a symlink that fits inside the inode.
      *
      * An itruncate log reservation is taken on *tpp first.  If that
      * fails the transaction is cancelled, *tpp is set to NULL and the
      * error is returned without any inode lock having been taken.
      * Otherwise the inode lock and the I/O lock are both taken
      * exclusively, any in-core data fork bytes are released, and 0 is
      * returned with the locks still held.
      */
1453 STATIC int
1454 xfs_inactive_symlink_local(
1455         xfs_inode_t     *ip,
1456         xfs_trans_t     **tpp)
1457 {
1458         xfs_trans_t     *tp = *tpp;
1459         int             rval;
1460
1461         ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
1462
1463         /*
1464          * The link target fits inside the inode, so the only thing
1465          * to give back is the memory holding the old path.
1466          */
1467         rval = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(ip->i_mount),
1468                                  0, XFS_TRANS_PERM_LOG_RES,
1469                                  XFS_ITRUNCATE_LOG_COUNT);
1470         if (rval != 0) {
1471                 xfs_trans_cancel(tp, 0);
1472                 *tpp = NULL;
1473                 return rval;
1474         }
1475
1476         xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1477
1478         /*
1479          * A zero length symlink has nothing to free; that is legal.
1480          */
1481         if (ip->i_df.if_bytes <= 0)
1482                 return 0;
1483
1484         xfs_idata_realloc(ip, -(ip->i_df.if_bytes), XFS_DATA_FORK);
1485         ASSERT(ip->i_df.if_bytes == 0);
1486
1487         return 0;
1488 }
1489
1490 /*
1491  * Tear down the attribute fork of an inode that is being inactivated.
      *
      * The caller's transaction (*tpp) is committed and the inode lock
      * dropped, xfs_attr_inactive() is run, and then a new
      * XFS_TRANS_INACTIVE transaction is reserved, the inode lock is
      * retaken, the inode is joined and held, and the in-core attribute
      * fork is destroyed.  The I/O lock must be held for update on
      * entry (asserted).
      *
      * Returns 0 with *tpp set to the new transaction, or an error with
      * *tpp set to NULL and both the inode lock and the I/O lock
      * released.
1492  */
1493 STATIC int
1494 xfs_inactive_attrs(
1495         xfs_inode_t     *ip,
1496         xfs_trans_t     **tpp)
1497 {
1498         xfs_trans_t     *tp;
1499         int             error;
1500         xfs_mount_t     *mp;
1501
1502         ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
1503         tp = *tpp;
1504         mp = ip->i_mount;
1505         ASSERT(ip->i_d.di_forkoff != 0);
             /*
              * A failed commit must not be ignored: stop here rather than
              * going on to remove the attributes and start another
              * transaction.  The inode lock is dropped either way.
              */
1506         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
1507         xfs_iunlock(ip, XFS_ILOCK_EXCL);
             if (error)
                     goto error_unlock;
1508
1509         error = xfs_attr_inactive(ip);
1510         if (error)
1511                 goto error_unlock;
1515
1516         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1517         error = xfs_trans_reserve(tp, 0,
1518                                   XFS_IFREE_LOG_RES(mp),
1519                                   0, XFS_TRANS_PERM_LOG_RES,
1520                                   XFS_INACTIVE_LOG_COUNT);
1521         if (error) {
1522                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1523                 xfs_trans_cancel(tp, 0);
1524                 goto error_unlock;
1527         }
1528
1529         xfs_ilock(ip, XFS_ILOCK_EXCL);
1530         xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1531         xfs_trans_ihold(tp, ip);
1532         xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1533
1534         ASSERT(ip->i_d.di_anextents == 0);
1535
1536         *tpp = tp;
1537         return 0;

             /*
              * Common failure exit: no transaction is handed back and the
              * I/O lock, the only lock still held at this point, is dropped.
              */
      error_unlock:
             *tpp = NULL;
             xfs_iunlock(ip, XFS_IOLOCK_EXCL);
             return error;
1538 }
1539
1540 STATIC int
1541 xfs_release(
1542         bhv_desc_t      *bdp)
1543 {
             /*
              * For a regular file that still has links, call
              * xfs_inactive_free_eofblocks() (going by its name, this
              * frees blocks beyond EOF) and then refresh the Linux
              * inode's i_blocks.  Nothing is done for non-regular or
              * already-freed inodes, on read-only mounts, or (with
              * HAVE_REFCACHE) for inodes held in the NFS reference cache.
              *
              * Returns 0, or the error from xfs_inactive_free_eofblocks().
              */
1544         xfs_inode_t     *ip;
1545         vnode_t         *vp;
1546         xfs_mount_t     *mp;
1547         int             error;
1548
1549         vp = BHV_TO_VNODE(bdp);
1550         ip = XFS_BHVTOI(bdp);
1551
1552         if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0)) {
1553                 return 0;
1554         }
1555
1556         /* If this is a read-only mount, don't do this (would generate I/O) */
1557         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1558                 return 0;
1559
1560 #ifdef HAVE_REFCACHE
1561         /* If we are in the NFS reference cache then don't do this now */
1562         if (ip->i_refcache)
1563                 return 0;
1564 #endif
1565
1566         mp = ip->i_mount;
1567
             /*
              * Only trim when the file has a size, cached pages or delayed
              * allocation blocks, its extents are read in, and it is
              * flagged neither PREALLOC nor APPEND.
              */
1568         if (ip->i_d.di_nlink != 0) {
1569                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1570                      ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 ||
1571                        ip->i_delayed_blks > 0)) &&
1572                      (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
1573                     (!(ip->i_d.di_flags &
1574                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
1575                         if ((error = xfs_inactive_free_eofblocks(mp, ip)))
1576                                 return error;
1577                         /* Update linux inode block count after free above */
1578                         vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1579                                 ip->i_d.di_nblocks + ip->i_delayed_blks);
1580                 }
1581         }
1582
1583         return 0;
1584 }
1585
1586 /*
1587  * xfs_inactive
1588  *
1589  * This is called when the vnode reference count for the vnode
1590  * goes to zero.  If the file has been unlinked, then it must
1591  * now be truncated.  Also, we clear all of the read-ahead state
1592  * kept for the inode here since the file is now closed.
      *
      * Every return path hands back VN_INACTIVE_CACHE; errors from the
      * individual steps only cut the cleanup short and are not passed
      * on to the caller.  credp is not used by this function.
1593  */
1594 STATIC int
1595 xfs_inactive(
1596         bhv_desc_t      *bdp,
1597         cred_t          *credp)
1598 {
1599         xfs_inode_t     *ip;
1600         vnode_t         *vp;
1601         xfs_bmap_free_t free_list;
1602         xfs_fsblock_t   first_block;
1603         int             committed;
1604         xfs_trans_t     *tp;
1605         xfs_mount_t     *mp;
1606         int             error;
1607         int             truncate;
1608
1609         vp = BHV_TO_VNODE(bdp);
1610         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
1611
1612         ip = XFS_BHVTOI(bdp);
1613
1614         /*
1615          * If the inode is already free, then there can be nothing
1616          * to clean up here.
1617          */
1618         if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
1619                 ASSERT(ip->i_df.if_real_bytes == 0);
1620                 ASSERT(ip->i_df.if_broot_bytes == 0);
1621                 return VN_INACTIVE_CACHE;
1622         }
1623
1624         /*
1625          * Only do a truncate if it's a regular file with
1626          * some actual space in it.  It's OK to look at the
1627          * inode's fields without the lock because we're the
1628          * only one with a reference to the inode.
1629          */
1630         truncate = ((ip->i_d.di_nlink == 0) &&
1631             ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents > 0) ||
1632              (ip->i_delayed_blks > 0)) &&
1633             ((ip->i_d.di_mode & S_IFMT) == S_IFREG));
1634
1635         mp = ip->i_mount;
1636
1637         if (ip->i_d.di_nlink == 0 &&
1638             DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_DESTROY)) {
1639                 (void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL);
1640         }
1641
1642         error = 0;
1643
1644         /* If this is a read-only mount, don't do this (would generate I/O) */
1645         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1646                 goto out;
1647
             /*
              * The inode still has links, so it is not freed.  Unlike the
              * otherwise identical test in xfs_release(), a PREALLOC or
              * APPEND inode still goes through
              * xfs_inactive_free_eofblocks() here when it has delayed
              * allocation blocks.
              */
1648         if (ip->i_d.di_nlink != 0) {
1649                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1650                      ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 ||
1651                        ip->i_delayed_blks > 0)) &&
1652                       (ip->i_df.if_flags & XFS_IFEXTENTS) &&
1653                      (!(ip->i_d.di_flags &
1654                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
1655                       (ip->i_delayed_blks != 0)))) {
1656                         if ((error = xfs_inactive_free_eofblocks(mp, ip)))
1657                                 return VN_INACTIVE_CACHE;
1658                         /* Update linux inode block count after free above */
1659                         vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1660                                 ip->i_d.di_nblocks + ip->i_delayed_blks);
1661                 }
1662                 goto out;
1663         }
1664
1665         ASSERT(ip->i_d.di_nlink == 0);
1666
1667         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1668                 return VN_INACTIVE_CACHE;
1669
             /*
              * Each of the three branches below ends with a reserved
              * transaction that has the inode joined and held with both
              * the IOLOCK and the ILOCK taken exclusively, or returns
              * after cancelling the transaction.
              */
1670         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1671         if (truncate) {
1672                 /*
1673                  * Do the xfs_itruncate_start() call before
1674                  * reserving any log space because itruncate_start
1675                  * will call into the buffer cache and we can't
1676                  * do that within a transaction.
1677                  */
1678                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1679
1680                 xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
1681
1682                 error = xfs_trans_reserve(tp, 0,
1683                                           XFS_ITRUNCATE_LOG_RES(mp),
1684                                           0, XFS_TRANS_PERM_LOG_RES,
1685                                           XFS_ITRUNCATE_LOG_COUNT);
1686                 if (error) {
1687                         /* Don't call itruncate_cleanup */
1688                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1689                         xfs_trans_cancel(tp, 0);
1690                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1691                         return VN_INACTIVE_CACHE;
1692                 }
1693
1694                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1695                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1696                 xfs_trans_ihold(tp, ip);
1697
1698                 /*
1699                  * normally, we have to run xfs_itruncate_finish sync.
1700                  * But if filesystem is wsync and we're in the inactive
1701                  * path, then we know that nlink == 0, and that the
1702                  * xaction that made nlink == 0 is permanently committed
1703                  * since xfs_remove runs as a synchronous transaction.
1704                  */
1705                 error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
1706                                 (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));
1707
1708                 if (error) {
1709                         xfs_trans_cancel(tp,
1710                                 XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1711                         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1712                         return VN_INACTIVE_CACHE;
1713                 }
1714         } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {
1715
1716                 /*
1717                  * If we get an error while cleaning up a
1718                  * symlink we bail out.
1719                  */
1720                 error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
1721                         xfs_inactive_symlink_rmt(ip, &tp) :
1722                         xfs_inactive_symlink_local(ip, &tp);
1723
1724                 if (error) {
1725                         ASSERT(tp == NULL);
1726                         return VN_INACTIVE_CACHE;
1727                 }
1728
1729                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1730                 xfs_trans_ihold(tp, ip);
1731         } else {
1732                 error = xfs_trans_reserve(tp, 0,
1733                                           XFS_IFREE_LOG_RES(mp),
1734                                           0, XFS_TRANS_PERM_LOG_RES,
1735                                           XFS_INACTIVE_LOG_COUNT);
1736                 if (error) {
1737                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1738                         xfs_trans_cancel(tp, 0);
1739                         return VN_INACTIVE_CACHE;
1740                 }
1741
1742                 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1743                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1744                 xfs_trans_ihold(tp, ip);
1745         }
1746
1747         /*
1748          * If there are attributes associated with the file
1749          * then blow them away now.  The code calls a routine
1750          * that recursively deconstructs the attribute fork.
1751          * We need to just commit the current transaction
1752          * because we can't use it for xfs_attr_inactive().
1753          */
1754         if (ip->i_d.di_anextents > 0) {
1755                 error = xfs_inactive_attrs(ip, &tp);
1756                 /*
1757                  * If we got an error, the transaction is already
1758                  * cancelled, and the inode is unlocked. Just get out.
1759                  */
1760                  if (error)
1761                          return VN_INACTIVE_CACHE;
1762         } else if (ip->i_afp) {
1763                 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1764         }
1765
1766         /*
1767          * Free the inode.
1768          */
1769         XFS_BMAP_INIT(&free_list, &first_block);
1770         error = xfs_ifree(tp, ip, &free_list);
1771         if (error) {
1772                 /*
1773                  * If we fail to free the inode, shut down.  The cancel
1774                  * might do that, we need to make sure.  Otherwise the
1775                  * inode might be lost for a long time or forever.
1776                  */
1777                 if (!XFS_FORCED_SHUTDOWN(mp)) {
1778                         cmn_err(CE_NOTE,
1779                 "xfs_inactive:  xfs_ifree() returned an error = %d on %s",
1780                                 error, mp->m_fsname);
1781                         xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
1782                 }
1783                 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
1784         } else {
1785                 /*
1786                  * Credit the quota account(s). The inode is gone.
1787                  */
1788                 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1789
1790                 /*
1791                  * Just ignore errors at this point.  There is
1792                  * nothing we can do except to try to keep going.
1793                  */
1794                 (void) xfs_bmap_finish(&tp,  &free_list, first_block,
1795                                        &committed);
1796                 (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
1797         }
1798         /*
1799          * Release the dquots held by inode, if any.
1800          */
1801         XFS_QM_DQDETACH(mp, ip);
1802
1803         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1804
1805  out:
1806         return VN_INACTIVE_CACHE;
1807 }
1808
1809
1810 /*
1811  * xfs_lookup
      *
      * Look up the name in 'dentry' in the directory behind dir_bdp by
      * calling xfs_dir_lookup_int().  On success the vnode of the inode
      * found is stored in *vpp and 0 is returned; otherwise the error is
      * returned and *vpp is not written.  Returns EIO straight away if
      * the filesystem has been forcibly shut down.
      *
      * The directory is locked with xfs_ilock_map_shared() around the
      * lookup and unlocked again before returning.
      *
      * flags, rdir and credp are not used by this function.
1812  */
1813 STATIC int
1814 xfs_lookup(
1815         bhv_desc_t              *dir_bdp,
1816         vname_t                 *dentry,
1817         vnode_t                 **vpp,
1818         int                     flags,
1819         vnode_t                 *rdir,
1820         cred_t                  *credp)
1821 {
1822         xfs_inode_t             *dp, *ip;
1823         xfs_ino_t               e_inum;
1824         int                     error;
1825         uint                    lock_mode;
1826         vnode_t                 *dir_vp;
1827
1828         dir_vp = BHV_TO_VNODE(dir_bdp);
1829         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1830
1831         dp = XFS_BHVTOI(dir_bdp);
1832
1833         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
1834                 return XFS_ERROR(EIO);
1835
1836         lock_mode = xfs_ilock_map_shared(dp);
1837         error = xfs_dir_lookup_int(dir_bdp, lock_mode, dentry, &e_inum, &ip);
1838         if (!error) {
1839                 *vpp = XFS_ITOV(ip);
1840                 ITRACE(ip);
1841         }
1842         xfs_iunlock_map_shared(dp, lock_mode);
1843         return error;
1844 }
1845
1846
1847 /*
1848  * xfs_create (create a new file).
      *
      * Allocates a new inode with the mode (and optional rdev) from vap,
      * enters it under the name in 'dentry' in the directory behind
      * dir_bdp, and on success stores its vnode, with an extra reference,
      * in *vpp.  *vpp must be NULL on entry (asserted).
      *
      * Returns 0 on success or an error code.
      *
      * Exit labels:
      *   std_return   - sends DM_EVENT_POSTCREATE when that event is
      *                  enabled and either *vpp was set or the call
      *                  failed after DM_EVENT_CREATE had been sent, then
      *                  returns error.
      *   error_return - cancels tp (if non-NULL) with cancel_flags,
      *                  unlocks dp if it is non-NULL and was not joined
      *                  to the transaction, releases the dquots.
      *   abort_return - adds XFS_TRANS_ABORT to cancel_flags and falls
      *                  through to error_return.
      *   abort_rele   - cancels tp with XFS_TRANS_ABORT, then drops the
      *                  reference on the new inode and releases the
      *                  dquots.
1849  */
1850 STATIC int
1851 xfs_create(
1852         bhv_desc_t              *dir_bdp,
1853         vname_t                 *dentry,
1854         vattr_t                 *vap,
1855         vnode_t                 **vpp,
1856         cred_t                  *credp)
1857 {
1858         char                    *name = VNAME(dentry);
1859         vnode_t                 *dir_vp;
1860         xfs_inode_t             *dp, *ip;
1861         vnode_t                 *vp=NULL;
1862         xfs_trans_t             *tp;
1863         xfs_mount_t             *mp;
1864         xfs_dev_t               rdev;
1865         int                     error;
1866         xfs_bmap_free_t         free_list;
1867         xfs_fsblock_t           first_block;
1868         boolean_t               dp_joined_to_trans;
1869         int                     dm_event_sent = 0;
1870         uint                    cancel_flags;
1871         int                     committed;
1872         xfs_prid_t              prid;
1873         struct xfs_dquot        *udqp, *gdqp;
1874         uint                    resblks;
1875         int                     dm_di_mode;
1876         int                     namelen;
1877
1878         ASSERT(!*vpp);
1879         dir_vp = BHV_TO_VNODE(dir_bdp);
1880         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1881
1882         dp = XFS_BHVTOI(dir_bdp);
1883         mp = dp->i_mount;
1884
1885         dm_di_mode = vap->va_mode;
1886         namelen = VNAMELEN(dentry);
1887
1888         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
1889                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
1890                                 dir_vp, DM_RIGHT_NULL, NULL,
1891                                 DM_RIGHT_NULL, name, NULL,
1892                                 dm_di_mode, 0, 0);
1893
1894                 if (error)
1895                         return error;
1896                 dm_event_sent = 1;
1897         }
1898
1899         if (XFS_FORCED_SHUTDOWN(mp))
1900                 return XFS_ERROR(EIO);
1901
1902         /* Return through std_return after this point. */
1903
             /*
              * Project id: inherited from the directory when it has
              * PROJINHERIT set, else taken from vap if supplied, else
              * the default.
              */
1904         udqp = gdqp = NULL;
1905         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1906                 prid = dp->i_d.di_projid;
1907         else if (vap->va_mask & XFS_AT_PROJID)
1908                 prid = (xfs_prid_t)vap->va_projid;
1909         else
1910                 prid = (xfs_prid_t)dfltprid;
1911
1912         /*
1913          * Make sure that we have allocated dquot(s) on disk.
1914          */
1915         error = XFS_QM_DQVOPALLOC(mp, dp,
1916                         current_fsuid(credp), current_fsgid(credp), prid,
1917                         XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
1918         if (error)
1919                 goto std_return;
1920
1921         ip = NULL;
1922         dp_joined_to_trans = B_FALSE;
1923
1924         tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1925         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1926         resblks = XFS_CREATE_SPACE_RES(mp, namelen);
1927         /*
1928          * Initially assume that the file does not exist and
1929          * reserve the resources for that case.  If that is not
1930          * the case we'll drop the one we have and get a more
1931          * appropriate transaction later.
              *
              * If the block reservation fails with ENOSPC, retry with
              * no blocks reserved (resblks == 0).
1932          */
1933         error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
1934                         XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1935         if (error == ENOSPC) {
1936                 resblks = 0;
1937                 error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
1938                                 XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1939         }
1940         if (error) {
                     /*
                      * dp has not been locked yet; clearing it keeps
                      * error_return from unlocking it.
                      */
1941                 cancel_flags = 0;
1942                 dp = NULL;
1943                 goto error_return;
1944         }
1945
1946         xfs_ilock(dp, XFS_ILOCK_EXCL);
1947
1948         XFS_BMAP_INIT(&free_list, &first_block);
1949
1950         ASSERT(ip == NULL);
1951
1952         /*
1953          * Reserve disk quota and the inode.
1954          */
1955         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
1956         if (error)
1957                 goto error_return;
1958
1959         if (resblks == 0 &&
1960             (error = XFS_DIR_CANENTER(mp, tp, dp, name, namelen)))
1961                 goto error_return;
1962         rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0;
1963         error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 1,
1964                         rdev, credp, prid, resblks > 0,
1965                         &ip, &committed);
1966         if (error) {
1967                 if (error == ENOSPC)
1968                         goto error_return;
1969                 goto abort_return;
1970         }
1971         ITRACE(ip);
1972
1973         /*
1974          * At this point, we've gotten a newly allocated inode.
1975          * It is locked (and joined to the transaction).
1976          */
1977
1978         ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE));
1979
1980         /*
1981          * Now we join the directory inode to the transaction.
1982          * We do not do it earlier because xfs_dir_ialloc
1983          * might commit the previous transaction (and release
1984          * all the locks).
1985          */
1986
1987         VN_HOLD(dir_vp);
1988         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1989         dp_joined_to_trans = B_TRUE;
1990
1991         error = XFS_DIR_CREATENAME(mp, tp, dp, name, namelen, ip->i_ino,
1992                 &first_block, &free_list,
1993                 resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
1994         if (error) {
1995                 ASSERT(error != ENOSPC);
1996                 goto abort_return;
1997         }
1998         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1999         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2000
2001         /*
2002          * If this is a synchronous mount, make sure that the
2003          * create transaction goes to disk before returning to
2004          * the user.
2005          */
2006         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2007                 xfs_trans_set_sync(tp);
2008         }
2009
2010         dp->i_gen++;
2011
2012         /*
2013          * Attach the dquot(s) to the inodes and modify them incore.
2014          * These ids of the inode couldn't have changed since the new
2015          * inode has been locked ever since it was created.
2016          */
2017         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
2018
2019         /*
2020          * xfs_trans_commit normally decrements the vnode ref count
2021          * when it unlocks the inode. Since we want to return the
2022          * vnode to the caller, we bump the vnode ref count now.
2023          */
2024         IHOLD(ip);
2025         vp = XFS_ITOV(ip);
2026
2027         error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
2028         if (error) {
2029                 xfs_bmap_cancel(&free_list);
2030                 goto abort_rele;
2031         }
2032
2033         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2034         if (error) {
                     /*
                      * Drop the reference taken by IHOLD above, and clear
                      * tp so that error_return does not cancel it.
                      */
2035                 IRELE(ip);
2036                 tp = NULL;
2037                 goto error_return;
2038         }
2039
2040         XFS_QM_DQRELE(mp, udqp);
2041         XFS_QM_DQRELE(mp, gdqp);
2042
2043         /*
2044          * Propagate the fact that the vnode changed after the
2045          * xfs_inode locks have been released.
2046          */
2047         VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_TRUNCATED, 3);
2048
2049         *vpp = vp;
2050
2051         /* Fallthrough to std_return with error = 0  */
2052
2053 std_return:
2054         if ( (*vpp || (error != 0 && dm_event_sent != 0)) &&
2055                         DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
2056                                                         DM_EVENT_POSTCREATE)) {
2057                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2058                         dir_vp, DM_RIGHT_NULL,
2059                         *vpp ? vp:NULL,
2060                         DM_RIGHT_NULL, name, NULL,
2061                         dm_di_mode, error, 0);
2062         }
2063         return error;
2064
2065  abort_return:
2066         cancel_flags |= XFS_TRANS_ABORT;
2067         /* FALLTHROUGH */
2068
2069  error_return:
2070         if (tp != NULL)
2071                 xfs_trans_cancel(tp, cancel_flags);
2072
2073         if (!dp_joined_to_trans && (dp != NULL))
2074                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2075         XFS_QM_DQRELE(mp, udqp);
2076         XFS_QM_DQRELE(mp, gdqp);
2077
2078         goto std_return;
2079
2080  abort_rele:
2081         /*
2082          * Wait until after the current transaction is aborted to
2083          * release the inode.  This prevents recursive transactions
2084          * and deadlocks from xfs_inactive.
2085          */
2086         cancel_flags |= XFS_TRANS_ABORT;
2087         xfs_trans_cancel(tp, cancel_flags);
2088         IRELE(ip);
2089
2090         XFS_QM_DQRELE(mp, udqp);
2091         XFS_QM_DQRELE(mp, gdqp);
2092
2093         goto std_return;
2094 }
2095
2096 #ifdef DEBUG
2097 /*
2098  * Some counters to see if (and how often) we are hitting some deadlock
2099  * prevention code paths.
      *
      * All three are updated by xfs_lock_dir_and_entry():
      *   xfs_rm_locks       - number of calls to that function.
      *   xfs_rm_lock_delays - number of delay(1) sleeps between retries.
      *   xfs_rm_attempts    - number of failed xfs_ilock_nowait() tries.
2100  */
2101
2102 int xfs_rm_locks;
2103 int xfs_rm_lock_delays;
2104 int xfs_rm_attempts;
2105 #endif
2106
2107 /*
2108  * The following routine will lock the inodes associated with the
2109  * directory and the named entry in the directory. The locks are
2110  * acquired in increasing inode number.
2111  *
2112  * If the entry is "..", then only the directory is locked. The
2113  * vnode ref count will still include that from the .. entry in
2114  * this case.
2115  *
2116  * There is a deadlock we need to worry about. If the locked directory is
2117  * in the AIL, it might be blocking up the log. The next inode we lock
2118  * could be already locked by another thread waiting for log space (e.g
2119  * a permanent log reservation with a long running transaction (see
2120  * xfs_itruncate_finish)). To solve this, we must check if the directory
2121  * is in the ail and use lock_nowait. If we can't lock, we need to
2122  * drop the inode lock on the directory and try again. xfs_iunlock will
2123  * potentially push the tail if we were holding up the log.
      *
      * Both inodes are locked with XFS_ILOCK_EXCL.  The function always
      * returns 0.  The dentry argument is not referenced in the body.
2124  */
2125 STATIC int
2126 xfs_lock_dir_and_entry(
2127         xfs_inode_t     *dp,
2128         vname_t         *dentry,
2129         xfs_inode_t     *ip)    /* inode of entry 'name' */
2130 {
2131         int             attempts;
2132         xfs_ino_t       e_inum;
2133         xfs_inode_t     *ips[2];
2134         xfs_log_item_t  *lp;
2135
2136 #ifdef DEBUG
2137         xfs_rm_locks++;
2138 #endif
2139         attempts = 0;
2140
2141 again:
2142         xfs_ilock(dp, XFS_ILOCK_EXCL);
2143
2144         e_inum = ip->i_ino;
2145
2146         ITRACE(ip);
2147
2148         /*
2149          * We want to lock in increasing inum. Since we've already
2150          * acquired the lock on the directory, we may need to release
2151          * it if the inum of the entry turns out to be less.
2152          */
2153         if (e_inum > dp->i_ino) {
2154                 /*
2155                  * We are already in the right order, so just
2156                  * lock on the inode of the entry.
2157                  * We need to use nowait if dp is in the AIL.
2158                  */
2159
2160                 lp = (xfs_log_item_t *)dp->i_itemp;
2161                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2162                         if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2163                                 attempts++;
2164 #ifdef DEBUG
2165                                 xfs_rm_attempts++;
2166 #endif
2167
2168                                 /*
2169                                  * Unlock dp and try again.
2170                                  * xfs_iunlock will try to push the tail
2171                                  * if the inode is in the AIL.
2172                                  */
2173
2174                                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2175
2176                                 if ((attempts % 5) == 0) {
2177                                         delay(1); /* Don't just spin the CPU */
2178 #ifdef DEBUG
2179                                         xfs_rm_lock_delays++;
2180 #endif
2181                                 }
2182                                 goto again;
2183                         }
2184                 } else {
2185                         xfs_ilock(ip, XFS_ILOCK_EXCL);
2186                 }
2187         } else if (e_inum < dp->i_ino) {
                     /*
                      * Wrong order: drop the directory lock and take both
                      * locks, entry first, through xfs_lock_inodes().
                      */
2188                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2189
2190                 ips[0] = ip;
2191                 ips[1] = dp;
2192                 xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2193         }
2194         /* else  e_inum == dp->i_ino */
2195         /*     This can happen if we're asked to lock /x/..
2196          *     the entry is "..", which is also the parent directory.
2197          */
2198
2199         return 0;
2200 }
2201
2202 #ifdef DEBUG
     /*
      * Statistics updated by xfs_lock_inodes():
      *   xfs_locked_n       - calls that finished without any retry.
      *   xfs_small_retries  - calls that needed fewer than 5 attempts.
      *   xfs_middle_retries - calls that needed 5 to 99 attempts.
      *   xfs_lots_retries   - calls that needed 100 or more attempts.
      *   xfs_lock_delays    - number of delay(1) sleeps between retries.
      */
2203 int xfs_locked_n;
2204 int xfs_small_retries;
2205 int xfs_middle_retries;
2206 int xfs_lots_retries;
2207 int xfs_lock_delays;
2208 #endif
2209
2210 /*
2211  * The following routine will lock n inodes in exclusive mode.
2212  * We assume the caller calls us with the inodes in i_ino order.
2213  *
2214  * We need to detect deadlock where an inode that we lock
2215  * is in the AIL and we start waiting for another inode that is locked
2216  * by a thread in a long running transaction (such as truncate). This can
2217  * result in deadlock since the long running trans might need to wait
2218  * for the inode we just locked in order to push the tail and free space
2219  * in the log.
      *
      * ips          - array of inode pointers; adjacent duplicate entries
      *                are locked only once.
      * inodes       - number of entries in ips, at least 2 (asserted).
      * first_locked - if non-zero, locking starts at ips[1] and uses
      *                xfs_ilock_nowait() from the outset; presumably the
      *                caller already holds ips[0] -- confirm against
      *                callers.
      * lock_mode    - passed unchanged to xfs_ilock(), xfs_ilock_nowait()
      *                and xfs_iunlock(), so the mode actually taken is
      *                whatever the caller supplies.
      *
      * When a try-lock fails, every inode before the failing one, ips[0]
      * included, is unlocked and the whole sequence restarts from index
      * 0, sleeping in delay(1) on every fifth failed attempt.
2220  */
2221 void
2222 xfs_lock_inodes(
2223         xfs_inode_t     **ips,
2224         int             inodes,
2225         int             first_locked,
2226         uint            lock_mode)
2227 {
2228         int             attempts = 0, i, j, try_lock;
2229         xfs_log_item_t  *lp;
2230
2231         ASSERT(ips && (inodes >= 2)); /* we need at least two */
2232
2233         if (first_locked) {
2234                 try_lock = 1;
2235                 i = 1;
2236         } else {
2237                 try_lock = 0;
2238                 i = 0;
2239         }
2240
2241 again:
2242         for (; i < inodes; i++) {
2243                 ASSERT(ips[i]);
2244
2245                 if (i && (ips[i] == ips[i-1]))  /* Already locked */
2246                         continue;
2247
2248                 /*
2249                  * If try_lock is not set yet, make sure all locked inodes
2250                  * are not in the AIL.
2251                  * If any are, set try_lock to be used later.
2252                  */
2253
2254                 if (!try_lock) {
2255                         for (j = (i - 1); j >= 0 && !try_lock; j--) {
2256                                 lp = (xfs_log_item_t *)ips[j]->i_itemp;
2257                                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2258                                         try_lock++;
2259                                 }
2260                         }
2261                 }
2262
2263                 /*
2264                  * If any of the previous locks we have locked is in the AIL,
2265                  * we must TRY to get the second and subsequent locks. If
2266                  * we can't get any, we must release all we have
2267                  * and try again.
2268                  */
2269
2270                 if (try_lock) {
2271                         /* try_lock must be 0 if i is 0. */
2272                         /*
2273                          * try_lock means we have an inode locked
2274                          * that is in the AIL.
2275                          */
2276                         ASSERT(i != 0);
2277                         if (!xfs_ilock_nowait(ips[i], lock_mode)) {
2278                                 attempts++;
2279
2280                                 /*
2281                                  * Unlock all previous guys and try again.
2282                                  * xfs_iunlock will try to push the tail
2283                                  * if the inode is in the AIL.
2284                                  */
2285
2286                                 for(j = i - 1; j >= 0; j--) {
2287
2288                                         /*
2289                                          * Check to see if we've already
2290                                          * unlocked this one.
2291                                          * Not the first one going back,
2292                                          * and the inode ptr is the same.
2293                                          */
2294                                         if ((j != (i - 1)) && ips[j] ==
2295                                                                 ips[j+1])
2296                                                 continue;
2297
2298                                         xfs_iunlock(ips[j], lock_mode);
2299                                 }
2300
2301                                 if ((attempts % 5) == 0) {
2302                                         delay(1); /* Don't just spin the CPU */
2303 #ifdef DEBUG
2304                                         xfs_lock_delays++;
2305 #endif
2306                                 }
                                     /* Start over from the first inode. */
2307                                 i = 0;
2308                                 try_lock = 0;
2309                                 goto again;
2310                         }
2311                 } else {
2312                         xfs_ilock(ips[i], lock_mode);
2313                 }
2314         }
2315
2316 #ifdef DEBUG
2317         if (attempts) {
2318                 if (attempts < 5) xfs_small_retries++;
2319                 else if (attempts < 100) xfs_middle_retries++;
2320                 else xfs_lots_retries++;
2321         } else {
2322                 xfs_locked_n++;
2323         }
2324 #endif
2325 }
2326
2327 #ifdef  DEBUG
     /*
      * Debug aid used by the remove/rmdir paths below.  In DEBUG builds
      * REMOVE_DEBUG_TRACE(x) stores x (callers pass __LINE__) in the global
      * remove_which_error_return; in non-DEBUG builds it expands to nothing.
      *
      * NOTE(review): the DEBUG expansion is a bare brace block, so the macro
      * is only safe when used as a standalone statement, not as the unbraced
      * body of an if that has an else.
      */
2328 #define REMOVE_DEBUG_TRACE(x)   {remove_which_error_return = (x);}
2329 int remove_which_error_return = 0;
2330 #else /* ! DEBUG */
2331 #define REMOVE_DEBUG_TRACE(x)
2332 #endif  /* ! DEBUG */
2333
2334
2335 /*
2336  * xfs_remove
2337  *
      * Remove the directory entry named by dentry from the directory behind
      * dir_bdp and drop one link on the inode that entry refers to.
      *
      * dir_bdp: behavior descriptor of the parent directory
      * dentry:  name (VNAME/VNAMELEN) of the entry to remove
      * credp:   caller credentials; not referenced in this function
      *
      * Returns 0 on success, otherwise the error reported by the failing
      * step (EIO if the filesystem has been forcibly shut down).
      *
      * If DMAPI remove events are enabled for the directory, a
      * DM_EVENT_REMOVE event is sent first and its failure aborts the call.
      * Every exit after that point goes through std_return so that the
      * DM_EVENT_POSTREMOVE event carries the final error value.
2338  */
2339 STATIC int
2340 xfs_remove(
2341         bhv_desc_t              *dir_bdp,
2342         vname_t                 *dentry,
2343         cred_t                  *credp)
2344 {
2345         vnode_t                 *dir_vp;
2346         char                    *name = VNAME(dentry);
2347         xfs_inode_t             *dp, *ip;
2348         xfs_trans_t             *tp = NULL;
2349         xfs_mount_t             *mp;
2350         int                     error = 0;
2351         xfs_bmap_free_t         free_list;
2352         xfs_fsblock_t           first_block;
2353         int                     cancel_flags;
2354         int                     committed;
2355         int                     dm_di_mode = 0;
2356         int                     link_zero;
2357         uint                    resblks;
2358         int                     namelen;
2359
2360         dir_vp = BHV_TO_VNODE(dir_bdp);
2361         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2362
2363         dp = XFS_BHVTOI(dir_bdp);
2364         mp = dp->i_mount;
2365
2366         if (XFS_FORCED_SHUTDOWN(mp))
2367                 return XFS_ERROR(EIO);
2368
2369         namelen = VNAMELEN(dentry);
2370
2371         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
2372                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
2373                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2374                                         name, NULL, 0, 0, 0);
2375                 if (error)
2376                         return error;
2377         }
2378
2379         /* From this point on, return through std_return */
2380         ip = NULL;
2381
2382         /*
2383          * We need to get a reference to ip before we get our log
2384          * reservation. The reason for this is that we cannot call
2385          * xfs_iget for an inode for which we do not have a reference
2386          * once we've acquired a log reservation. This is because the
2387          * inode we are trying to get might be in xfs_inactive going
2388          * for a log reservation. Since we'll have to wait for the
2389          * inactive code to complete before returning from xfs_iget,
2390          * we need to make sure that we don't have log space reserved
2391          * when we call xfs_iget.  Instead we get an unlocked reference
2392          * to the inode before getting our log reservation.
2393          */
2394         error = xfs_get_dir_entry(dentry, &ip);
2395         if (error) {
2396                 REMOVE_DEBUG_TRACE(__LINE__);
2397                 goto std_return;
2398         }
2399
             /* Saved now so the post-remove event can report the entry's mode. */
2400         dm_di_mode = ip->i_d.di_mode;
2401
2402         vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2403
2404         ITRACE(ip);
2405
             /* Attach dquots to both inodes (only once when dp == ip). */
2406         error = XFS_QM_DQATTACH(mp, dp, 0);
2407         if (!error && dp != ip)
2408                 error = XFS_QM_DQATTACH(mp, ip, 0);
2409         if (error) {
2410                 REMOVE_DEBUG_TRACE(__LINE__);
2411                 IRELE(ip);
2412                 goto std_return;
2413         }
2414
2415         tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2416         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2417         /*
2418          * We try to get the real space reservation first,
2419          * allowing for directory btree deletion(s) implying
2420          * possible bmap insert(s).  If we can't get the space
2421          * reservation then we use 0 instead, and avoid the bmap
2422          * btree insert(s) in the directory code by, if the bmap
2423          * insert tries to happen, instead trimming the LAST
2424          * block from the directory.
2425          */
2426         resblks = XFS_REMOVE_SPACE_RES(mp);
2427         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2428                         XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2429         if (error == ENOSPC) {
2430                 resblks = 0;
2431                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2432                                 XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2433         }
2434         if (error) {
2435                 ASSERT(error != ENOSPC);
2436                 REMOVE_DEBUG_TRACE(__LINE__);
2437                 xfs_trans_cancel(tp, 0);
2438                 IRELE(ip);
                     /*
                      * The reservation failed, so the transaction was
                      * cancelled with no flags rather than cancel_flags.
                      * Leave through std_return like every other failure
                      * after the pre-remove event, so that the
                      * DM_EVENT_POSTREMOVE event is not skipped.
                      */
2439                 goto std_return;
2440         }
2441
2442         error = xfs_lock_dir_and_entry(dp, dentry, ip);
2443         if (error) {
2444                 REMOVE_DEBUG_TRACE(__LINE__);
2445                 xfs_trans_cancel(tp, cancel_flags);
2446                 IRELE(ip);
2447                 goto std_return;
2448         }
2449
2450         /*
2451          * At this point, we've gotten both the directory and the entry
2452          * inodes locked.
2453          */
2454         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2455         if (dp != ip) {
2456                 /*
2457                  * Increment vnode ref count only in this case since
2458                  * there's an extra vnode reference in the case where
2459                  * dp == ip.
2460                  */
2461                 IHOLD(dp);
2462                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2463         }
2464
2465         /*
2466          * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2467          */
             /*
              * NOTE(review): the block total passed here is the literal 0,
              * not resblks, whereas xfs_rmdir passes resblks to the same
              * macro -- confirm that this is intentional.
              */
2468         XFS_BMAP_INIT(&free_list, &first_block);
2469         error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, ip->i_ino,
2470                 &first_block, &free_list, 0);
2471         if (error) {
2472                 ASSERT(error != ENOENT);
2473                 REMOVE_DEBUG_TRACE(__LINE__);
2474                 goto error1;
2475         }
2476         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2477
2478         dp->i_gen++;
2479         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2480
2481         error = xfs_droplink(tp, ip);
2482         if (error) {
2483                 REMOVE_DEBUG_TRACE(__LINE__);
2484                 goto error1;
2485         }
2486
2487         /* Determine if this is the last link while
2488          * we are in the transaction.
2489          */
2490         link_zero = (ip)->i_d.di_nlink==0;
2491
2492         /*
2493          * Take an extra ref on the inode so that it doesn't
2494          * go to xfs_inactive() from within the commit.
2495          */
2496         IHOLD(ip);
2497
2498         /*
2499          * If this is a synchronous mount, make sure that the
2500          * remove transaction goes to disk before returning to
2501          * the user.
2502          */
2503         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2504                 xfs_trans_set_sync(tp);
2505         }
2506
2507         error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
2508         if (error) {
2509                 REMOVE_DEBUG_TRACE(__LINE__);
2510                 goto error_rele;
2511         }
2512
2513         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2514         if (error) {
                     /* Drop the extra reference taken by IHOLD(ip) above. */
2515                 IRELE(ip);
2516                 goto std_return;
2517         }
2518
2519         /*
2520          * Before we drop our extra reference to the inode, purge it
2521          * from the refcache if it is there.  By waiting until afterwards
2522          * to do the IRELE, we ensure that we won't go inactive in the
2523          * xfs_refcache_purge_ip routine (although that would be OK).
2524          */
2525         xfs_refcache_purge_ip(ip);
2526
2527         vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2528
2529         /*
2530          * Let interposed file systems know about removed links.
2531          */
2532         VOP_LINK_REMOVED(XFS_ITOV(ip), dir_vp, link_zero);
2533
2534         IRELE(ip);
2535
2536 /*      Fall through to std_return with error = 0 */
2537  std_return:
             /* Common exit: report the outcome to DMAPI if it is listening. */
2538         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp,
2539                                                 DM_EVENT_POSTREMOVE)) {
2540                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
2541                                 dir_vp, DM_RIGHT_NULL,
2542                                 NULL, DM_RIGHT_NULL,
2543                                 name, NULL, dm_di_mode, error, 0);
2544         }
2545         return error;
2546
2547  error1:
             /*
              * Failure after the inodes were joined to the transaction but
              * before the extra IHOLD(ip): abort the transaction and discard
              * the pending free list.
              */
2548         xfs_bmap_cancel(&free_list);
2549         cancel_flags |= XFS_TRANS_ABORT;
2550         xfs_trans_cancel(tp, cancel_flags);
2551         goto std_return;
2552
2553  error_rele:
2554         /*
2555          * In this case make sure to not release the inode until after
2556          * the current transaction is aborted.  Releasing it beforehand
2557          * can cause us to go to xfs_inactive and start a recursive
2558          * transaction which can easily deadlock with the current one.
2559          */
2560         xfs_bmap_cancel(&free_list);
2561         cancel_flags |= XFS_TRANS_ABORT;
2562         xfs_trans_cancel(tp, cancel_flags);
2563
2564         /*
2565          * Before we drop our extra reference to the inode, purge it
2566          * from the refcache if it is there.  By waiting until afterwards
2567          * to do the IRELE, we ensure that we won't go inactive in the
2568          * xfs_refcache_purge_ip routine (although that would be OK).
2569          */
2570         xfs_refcache_purge_ip(ip);
2571
2572         IRELE(ip);
2573
2574         goto std_return;
2575 }
2576
2577
2578 /*
2579  * xfs_link
2580  *
      * Create a new directory entry named by dentry in the directory behind
      * target_dir_bdp that refers to the inode of src_vp, and bump that
      * inode's link count.
      *
      * target_dir_bdp: behavior descriptor of the directory receiving the link
      * src_vp:         vnode of the existing (non-directory) source
      * dentry:         name (VNAME/VNAMELEN) of the new entry
      * credp:          caller credentials; not referenced in this function
      *
      * Returns 0 on success or an error code: EPERM when the source is a
      * directory, EIO when the filesystem is shut down, EMLINK when the
      * source already has XFS_MAXLINK links, EXDEV when the target directory
      * has XFS_DIFLAG_PROJINHERIT set and the project ids differ, or the
      * error returned by any of the called helpers.
      *
      * If DMAPI link events are enabled, DM_EVENT_LINK is sent before any
      * work and DM_EVENT_POSTLINK is sent from std_return with the result.
2581  */
2582 STATIC int
2583 xfs_link(
2584         bhv_desc_t              *target_dir_bdp,
2585         vnode_t                 *src_vp,
2586         vname_t                 *dentry,
2587         cred_t                  *credp)
2588 {
2589         xfs_inode_t             *tdp, *sip;
2590         xfs_trans_t             *tp;
2591         xfs_mount_t             *mp;
2592         xfs_inode_t             *ips[2];
2593         int                     error;
2594         xfs_bmap_free_t         free_list;
2595         xfs_fsblock_t           first_block;
2596         int                     cancel_flags;
2597         int                     committed;
2598         vnode_t                 *target_dir_vp;
2599         int                     resblks;
2600         char                    *target_name = VNAME(dentry);
2601         int                     target_namelen;
2602
2603         target_dir_vp = BHV_TO_VNODE(target_dir_bdp);
2604         vn_trace_entry(target_dir_vp, __FUNCTION__, (inst_t *)__return_address);
2605         vn_trace_entry(src_vp, __FUNCTION__, (inst_t *)__return_address);
2606
2607         target_namelen = VNAMELEN(dentry);
             /* Hard links to directories are refused outright. */
2608         if (VN_ISDIR(src_vp))
2609                 return XFS_ERROR(EPERM);
2610
2611         sip = xfs_vtoi(src_vp);
2612         tdp = XFS_BHVTOI(target_dir_bdp);
2613         mp = tdp->i_mount;
2614         if (XFS_FORCED_SHUTDOWN(mp))
2615                 return XFS_ERROR(EIO);
2616
2617         if (DM_EVENT_ENABLED(src_vp->v_vfsp, tdp, DM_EVENT_LINK)) {
2618                 error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
2619                                         target_dir_vp, DM_RIGHT_NULL,
2620                                         src_vp, DM_RIGHT_NULL,
2621                                         target_name, NULL, 0, 0, 0);
2622                 if (error)
2623                         return error;
2624         }
2625
2626         /* Return through std_return after this point. */
2627
2628         error = XFS_QM_DQATTACH(mp, sip, 0);
2629         if (!error && sip != tdp)
2630                 error = XFS_QM_DQATTACH(mp, tdp, 0);
2631         if (error)
2632                 goto std_return;
2633
2634         tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
2635         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
             /*
              * Try for the full block reservation first; on ENOSPC retry
              * with no blocks reserved (resblks == 0), in which case
              * XFS_DIR_CANENTER is consulted below before the entry is
              * created.
              */
2636         resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
2637         error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
2638                         XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2639         if (error == ENOSPC) {
2640                 resblks = 0;
2641                 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
2642                                 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2643         }
2644         if (error) {
                     /* Reservation failed: cancel with no flags. */
2645                 cancel_flags = 0;
2646                 goto error_return;
2647         }
2648
             /* Order the two inodes by inode number before locking them. */
2649         if (sip->i_ino < tdp->i_ino) {
2650                 ips[0] = sip;
2651                 ips[1] = tdp;
2652         } else {
2653                 ips[0] = tdp;
2654                 ips[1] = sip;
2655         }
2656
2657         xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2658
2659         /*
2660          * Increment vnode ref counts since xfs_trans_commit &
2661          * xfs_trans_cancel will both unlock the inodes and
2662          * decrement the associated ref counts.
2663          */
2664         VN_HOLD(src_vp);
2665         VN_HOLD(target_dir_vp);
2666         xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
2667         xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
2668
2669         /*
2670          * If the source has too many links, we can't make any more to it.
2671          */
2672         if (sip->i_d.di_nlink >= XFS_MAXLINK) {
2673                 error = XFS_ERROR(EMLINK);
2674                 goto error_return;
2675         }
2676
2677         /*
2678          * If we are using project inheritance, we only allow hard link
2679          * creation in our tree when the project IDs are the same; else
2680          * the tree quota mechanism could be circumvented.
2681          */
2682         if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2683                      (tdp->i_d.di_projid != sip->i_d.di_projid))) {
2684                 error = XFS_ERROR(EXDEV);
2685                 goto error_return;
2686         }
2687
2688         if (resblks == 0 &&
2689             (error = XFS_DIR_CANENTER(mp, tp, tdp, target_name,
2690                         target_namelen)))
2691                 goto error_return;
2692
2693         XFS_BMAP_INIT(&free_list, &first_block);
2694
             /*
              * NOTE(review): failures of XFS_DIR_CREATENAME and xfs_bumplink
              * jump to abort_return without calling xfs_bmap_cancel(), while
              * the xfs_bmap_finish failure below does call it -- confirm the
              * free list cannot hold entries on those earlier paths.
              */
2695         error = XFS_DIR_CREATENAME(mp, tp, tdp, target_name, target_namelen,
2696                                    sip->i_ino, &first_block, &free_list,
2697                                    resblks);
2698         if (error)
2699                 goto abort_return;
2700         xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2701         tdp->i_gen++;
2702         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2703
2704         error = xfs_bumplink(tp, sip);
2705         if (error) {
2706                 goto abort_return;
2707         }
2708
2709         /*
2710          * If this is a synchronous mount, make sure that the
2711          * link transaction goes to disk before returning to
2712          * the user.
2713          */
2714         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2715                 xfs_trans_set_sync(tp);
2716         }
2717
2718         error = xfs_bmap_finish (&tp, &free_list, first_block, &committed);
2719         if (error) {
2720                 xfs_bmap_cancel(&free_list);
2721                 goto abort_return;
2722         }
2723
2724         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2725         if (error) {
2726                 goto std_return;
2727         }
2728
2729         /* Fall through to std_return with error = 0. */
2730 std_return:
             /*
              * Common exit: deliver the post-link event with the final error.
              * NOTE(review): this check is keyed on sip while the pre-event
              * check above is keyed on tdp -- confirm that is intended.
              */
2731         if (DM_EVENT_ENABLED(src_vp->v_vfsp, sip,
2732                                                 DM_EVENT_POSTLINK)) {
2733                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
2734                                 target_dir_vp, DM_RIGHT_NULL,
2735                                 src_vp, DM_RIGHT_NULL,
2736                                 target_name, NULL, 0, error, 0);
2737         }
2738         return error;
2739
2740  abort_return:
             /* Abort the transaction as well as cancelling it. */
2741         cancel_flags |= XFS_TRANS_ABORT;
2742         /* FALLTHROUGH */
2743
2744  error_return:
2745         xfs_trans_cancel(tp, cancel_flags);
2746         goto std_return;
2747 }
2748 /*
2749  * xfs_mkdir
2750  *
      * Create a new directory named by dentry inside the directory behind
      * dir_bdp, using the mode in vap, and return its vnode through *vpp.
      *
      * dir_bdp: behavior descriptor of the parent directory
      * dentry:  name (VNAME/VNAMELEN) of the directory to create
      * vap:     attributes; va_mode is used, and va_projid when
      *          XFS_AT_PROJID is set in va_mask and the parent does not
      *          have XFS_DIFLAG_PROJINHERIT set
      * vpp:     out: vnode of the new directory, on which IHOLD() has been
      *          called
      * credp:   credentials passed to the quota and inode allocation calls
      *
      * Returns 0 on success or an error code: EIO when the filesystem is
      * shut down, EMLINK when the parent already has XFS_MAXLINK links, or
      * the error returned by any of the called helpers.
      *
      * If DMAPI create events are enabled, DM_EVENT_CREATE is sent first;
      * DM_EVENT_POSTCREATE is sent from std_return when the directory was
      * created or when an error follows a sent pre-event.
2751  */
2752 STATIC int
2753 xfs_mkdir(
2754         bhv_desc_t              *dir_bdp,
2755         vname_t                 *dentry,
2756         vattr_t                 *vap,
2757         vnode_t                 **vpp,
2758         cred_t                  *credp)
2759 {
2760         char                    *dir_name = VNAME(dentry);
2761         xfs_inode_t             *dp;
2762         xfs_inode_t             *cdp;   /* inode of created dir */
2763         vnode_t                 *cvp;   /* vnode of created dir */
2764         xfs_trans_t             *tp;
2765         xfs_mount_t             *mp;
2766         int                     cancel_flags;
2767         int                     error;
2768         int                     committed;
2769         xfs_bmap_free_t         free_list;
2770         xfs_fsblock_t           first_block;
2771         vnode_t                 *dir_vp;
2772         boolean_t               dp_joined_to_trans;
2773         boolean_t               created = B_FALSE;
2774         int                     dm_event_sent = 0;
2775         xfs_prid_t              prid;
2776         struct xfs_dquot        *udqp, *gdqp;
2777         uint                    resblks;
2778         int                     dm_di_mode;
2779         int                     dir_namelen;
2780
2781         dir_vp = BHV_TO_VNODE(dir_bdp);
2782         dp = XFS_BHVTOI(dir_bdp);
2783         mp = dp->i_mount;
2784
2785         if (XFS_FORCED_SHUTDOWN(mp))
2786                 return XFS_ERROR(EIO);
2787
2788         dir_namelen = VNAMELEN(dentry);
2789
2790         tp = NULL;
2791         dp_joined_to_trans = B_FALSE;
2792         dm_di_mode = vap->va_mode;
2793
2794         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
2795                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2796                                         dir_vp, DM_RIGHT_NULL, NULL,
2797                                         DM_RIGHT_NULL, dir_name, NULL,
2798                                         dm_di_mode, 0, 0);
2799                 if (error)
2800                         return error;
2801                 dm_event_sent = 1;
2802         }
2803
2804         /* Return through std_return after this point. */
2805
2806         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2807
             /* NOTE(review): mp was already set from dp->i_mount above. */
2808         mp = dp->i_mount;
2809         udqp = gdqp = NULL;
             /*
              * Project id for the new directory: inherited from the parent
              * when it has XFS_DIFLAG_PROJINHERIT, else taken from vap when
              * supplied, else the default.
              */
2810         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
2811                 prid = dp->i_d.di_projid;
2812         else if (vap->va_mask & XFS_AT_PROJID)
2813                 prid = (xfs_prid_t)vap->va_projid;
2814         else
2815                 prid = (xfs_prid_t)dfltprid;
2816
2817         /*
2818          * Make sure that we have allocated dquot(s) on disk.
2819          */
2820         error = XFS_QM_DQVOPALLOC(mp, dp,
2821                         current_fsuid(credp), current_fsgid(credp), prid,
2822                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2823         if (error)
2824                 goto std_return;
2825
2826         tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2827         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
             /*
              * Try for the full block reservation first; on ENOSPC retry
              * with no blocks reserved (resblks == 0).
              */
2828         resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
2829         error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2830                                   XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
2831         if (error == ENOSPC) {
2832                 resblks = 0;
2833                 error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
2834                                           XFS_TRANS_PERM_LOG_RES,
2835                                           XFS_MKDIR_LOG_COUNT);
2836         }
2837         if (error) {
                     /*
                      * dp has not been locked yet; clearing it makes
                      * error_return skip the xfs_iunlock() call.
                      */
2838                 cancel_flags = 0;
2839                 dp = NULL;
2840                 goto error_return;
2841         }
2842
2843         xfs_ilock(dp, XFS_ILOCK_EXCL);
2844
2845         /*
2846          * Check for directory link count overflow.
2847          */
2848         if (dp->i_d.di_nlink >= XFS_MAXLINK) {
2849                 error = XFS_ERROR(EMLINK);
2850                 goto error_return;
2851         }
2852
2853         /*
2854          * Reserve disk quota and the inode.
2855          */
2856         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2857         if (error)
2858                 goto error_return;
2859
2860         if (resblks == 0 &&
2861             (error = XFS_DIR_CANENTER(mp, tp, dp, dir_name, dir_namelen)))
2862                 goto error_return;
2863         /*
2864          * create the directory inode.
2865          */
2866         error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 2,
2867                         0, credp, prid, resblks > 0,
2868                 &cdp, NULL);
2869         if (error) {
                     /* ENOSPC cancels without XFS_TRANS_ABORT; others abort. */
2870                 if (error == ENOSPC)
2871                         goto error_return;
2872                 goto abort_return;
2873         }
2874         ITRACE(cdp);
2875
2876         /*
2877          * Now we add the directory inode to the transaction.
2878          * We waited until now since xfs_dir_ialloc might start
2879          * a new transaction.  Had we joined the transaction
2880          * earlier, the locks might have gotten released.
2881          */
2882         VN_HOLD(dir_vp);
2883         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2884         dp_joined_to_trans = B_TRUE;
2885
2886         XFS_BMAP_INIT(&free_list, &first_block);
2887
2888         error = XFS_DIR_CREATENAME(mp, tp, dp, dir_name, dir_namelen,
2889                         cdp->i_ino, &first_block, &free_list,
2890                         resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2891         if (error) {
2892                 ASSERT(error != ENOSPC);
2893                 goto error1;
2894         }
2895         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2896
2897         /*
2898          * Bump the in memory version number of the parent directory
2899          * so that other processes accessing it will recognize that
2900          * the directory has changed.
2901          */
2902         dp->i_gen++;
2903
2904         error = XFS_DIR_INIT(mp, tp, cdp, dp);
2905         if (error) {
2906                 goto error2;
2907         }
2908
2909         cdp->i_gen = 1;
             /* The parent gains a link on behalf of the new directory. */
2910         error = xfs_bumplink(tp, dp);
2911         if (error) {
2912                 goto error2;
2913         }
2914
2915         cvp = XFS_ITOV(cdp);
2916
2917         created = B_TRUE;
2918
             /* Hand the new vnode to the caller with an extra hold. */
2919         *vpp = cvp;
2920         IHOLD(cdp);
2921
2922         /*
2923          * Attach the dquots to the new inode and modify the icount incore.
2924          */
2925         XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
2926
2927         /*
2928          * If this is a synchronous mount, make sure that the
2929          * mkdir transaction goes to disk before returning to
2930          * the user.
2931          */
2932         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2933                 xfs_trans_set_sync(tp);
2934         }
2935
             /*
              * NOTE(review): on the two failure paths below cdp is released
              * with IRELE() while created stays B_TRUE and *vpp stays set,
              * so std_return still passes XFS_ITOV(cdp) to the post-create
              * event -- confirm the inode is still valid at that point.
              */
2936         error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
2937         if (error) {
2938                 IRELE(cdp);
2939                 goto error2;
2940         }
2941
2942         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2943         XFS_QM_DQRELE(mp, udqp);
2944         XFS_QM_DQRELE(mp, gdqp);
2945         if (error) {
2946                 IRELE(cdp);
2947         }
2948
2949         /* Fall through to std_return with error = 0 or errno from
2950          * xfs_trans_commit. */
2951
2952 std_return:
             /*
              * Send the post-create event if the directory was created, or
              * if a pre-create event was sent and the operation then failed.
              * XFS_BHVTOI(dir_bdp) is used instead of dp because dp may have
              * been set to NULL on the reservation failure path.
              */
2953         if ( (created || (error != 0 && dm_event_sent != 0)) &&
2954                         DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
2955                                                 DM_EVENT_POSTCREATE)) {
2956                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2957                                         dir_vp, DM_RIGHT_NULL,
2958                                         created ? XFS_ITOV(cdp):NULL,
2959                                         DM_RIGHT_NULL,
2960                                         dir_name, NULL,
2961                                         dm_di_mode, error, 0);
2962         }
2963         return error;
2964
             /*
              * Cleanup ladder: error2/error1 discard the pending free list,
              * abort_return adds XFS_TRANS_ABORT, error_return cancels the
              * transaction and releases the dquots.
              */
2965  error2:
2966  error1:
2967         xfs_bmap_cancel(&free_list);
2968  abort_return:
2969         cancel_flags |= XFS_TRANS_ABORT;
2970  error_return:
2971         xfs_trans_cancel(tp, cancel_flags);
2972         XFS_QM_DQRELE(mp, udqp);
2973         XFS_QM_DQRELE(mp, gdqp);
2974
             /*
              * Unlock dp here only if it was locked but never joined to the
              * transaction.
              */
2975         if (!dp_joined_to_trans && (dp != NULL)) {
2976                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2977         }
2978
2979         goto std_return;
2980 }
2981
2982
2983 /*
2984  * xfs_rmdir
2985  *
2986  */
2987 STATIC int
2988 xfs_rmdir(
2989         bhv_desc_t              *dir_bdp,
2990         vname_t                 *dentry,
2991         cred_t                  *credp)
2992 {
2993         char                    *name = VNAME(dentry);
2994         xfs_inode_t             *dp;
2995         xfs_inode_t             *cdp;   /* child directory */
2996         xfs_trans_t             *tp;
2997         xfs_mount_t             *mp;
2998         int                     error;
2999         xfs_bmap_free_t         free_list;
3000         xfs_fsblock_t           first_block;
3001         int                     cancel_flags;
3002         int                     committed;
3003         vnode_t                 *dir_vp;
3004         int                     dm_di_mode = 0;
3005         int                     last_cdp_link;
3006         int                     namelen;
3007         uint                    resblks;
3008
3009         dir_vp = BHV_TO_VNODE(dir_bdp);
3010         dp = XFS_BHVTOI(dir_bdp);
3011         mp = dp->i_mount;
3012
3013         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3014
3015         if (XFS_FORCED_SHUTDOWN(XFS_BHVTOI(dir_bdp)->i_mount))
3016                 return XFS_ERROR(EIO);
3017         namelen = VNAMELEN(dentry);
3018
3019         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
3020                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
3021                                         dir_vp, DM_RIGHT_NULL,
3022                                         NULL, DM_RIGHT_NULL,
3023                                         name, NULL, 0, 0, 0);
3024                 if (error)
3025                         return XFS_ERROR(error);
3026         }
3027
3028         /* Return through std_return after this point. */
3029
3030         cdp = NULL;
3031
3032         /*
3033          * We need to get a reference to cdp before we get our log
3034          * reservation.  The reason for this is that we cannot call
3035          * xfs_iget for an inode for which we do not have a reference
3036          * once we've acquired a log reservation.  This is because the
3037          * inode we are trying to get might be in xfs_inactive going
3038          * for a log reservation.  Since we'll have to wait for the
3039          * inactive code to complete before returning from xfs_iget,
3040          * we need to make sure that we don't have log space reserved
3041          * when we call xfs_iget.  Instead we get an unlocked reference
3042          * to the inode before getting our log reservation.
3043          */
3044         error = xfs_get_dir_entry(dentry, &cdp);
3045         if (error) {
3046                 REMOVE_DEBUG_TRACE(__LINE__);
3047                 goto std_return;
3048         }
3049         mp = dp->i_mount;
3050         dm_di_mode = cdp->i_d.di_mode;
3051
3052         /*
3053          * Get the dquots for the inodes.
3054          */
3055         error = XFS_QM_DQATTACH(mp, dp, 0);
3056         if (!error && dp != cdp)
3057                 error = XFS_QM_DQATTACH(mp, cdp, 0);
3058         if (error) {
3059                 IRELE(cdp);
3060                 REMOVE_DEBUG_TRACE(__LINE__);
3061                 goto std_return;
3062         }
3063
3064         tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
3065         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3066         /*
3067          * We try to get the real space reservation first,
3068          * allowing for directory btree deletion(s) implying
3069          * possible bmap insert(s).  If we can't get the space
3070          * reservation then we use 0 instead, and avoid the bmap
3071          * btree insert(s) in the directory code by, if the bmap
3072          * insert tries to happen, instead trimming the LAST
3073          * block from the directory.
3074          */
3075         resblks = XFS_REMOVE_SPACE_RES(mp);
3076         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
3077                         XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3078         if (error == ENOSPC) {
3079                 resblks = 0;
3080                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
3081                                 XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3082         }
3083         if (error) {
3084                 ASSERT(error != ENOSPC);
3085                 cancel_flags = 0;
3086                 IRELE(cdp);
3087                 goto error_return;
3088         }
3089         XFS_BMAP_INIT(&free_list, &first_block);
3090
3091         /*
3092          * Now lock the child directory inode and the parent directory
3093          * inode in the proper order.  This will take care of validating
3094          * that the directory entry for the child directory inode has
3095          * not changed while we were obtaining a log reservation.
3096          */
3097         error = xfs_lock_dir_and_entry(dp, dentry, cdp);
3098         if (error) {
3099                 xfs_trans_cancel(tp, cancel_flags);
3100                 IRELE(cdp);
3101                 goto std_return;
3102         }
3103
3104         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3105         if (dp != cdp) {
3106                 /*
3107                  * Only increment the parent directory vnode count if
3108                  * we didn't bump it in looking up cdp.  The only time
3109                  * we don't bump it is when we're looking up ".".
3110                  */
3111                 VN_HOLD(dir_vp);
3112         }
3113
3114         ITRACE(cdp);
3115         xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);
3116
3117         ASSERT(cdp->i_d.di_nlink >= 2);
3118         if (cdp->i_d.di_nlink != 2) {
3119                 error = XFS_ERROR(ENOTEMPTY);
3120                 goto error_return;
3121         }
3122         if (!XFS_DIR_ISEMPTY(mp, cdp)) {
3123                 error = XFS_ERROR(ENOTEMPTY);
3124                 goto error_return;
3125         }
3126
3127         error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, cdp->i_ino,
3128                 &first_block, &free_list, resblks);
3129         if (error) {
3130                 goto error1;
3131         }
3132
3133         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3134
3135         /*
3136          * Bump the in memory generation count on the parent
3137          * directory so that other can know that it has changed.
3138          */
3139         dp->i_gen++;
3140
3141         /*
3142          * Drop the link from cdp's "..".
3143          */
3144         error = xfs_droplink(tp, dp);
3145         if (error) {
3146                 goto error1;
3147         }
3148
3149         /*
3150          * Drop the link from dp to cdp.
3151          */
3152         error = xfs_droplink(tp, cdp);
3153         if (error) {
3154                 goto error1;
3155         }
3156
3157         /*
3158          * Drop the "." link from cdp to self.
3159          */
3160         error = xfs_droplink(tp, cdp);
3161         if (error) {
3162                 goto error1;
3163         }
3164
3165         /* Determine these before committing transaction */
3166         last_cdp_link = (cdp)->i_d.di_nlink==0;
3167
3168         /*
3169          * Take an extra ref on the child vnode so that it
3170          * does not go to xfs_inactive() from within the commit.
3171          */
3172         IHOLD(cdp);
3173
3174         /*
3175          * If this is a synchronous mount, make sure that the
3176          * rmdir transaction goes to disk before returning to
3177          * the user.
3178          */
3179         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3180                 xfs_trans_set_sync(tp);
3181         }
3182
3183         error = xfs_bmap_finish (&tp, &free_list, first_block, &committed);
3184         if (error) {
3185                 xfs_bmap_cancel(&free_list);
3186                 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
3187                                  XFS_TRANS_ABORT));
3188                 IRELE(cdp);
3189                 goto std_return;
3190         }
3191
3192         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
3193         if (error) {
3194                 IRELE(cdp);
3195                 goto std_return;
3196         }
3197
3198
3199         /*
3200          * Let interposed file systems know about removed links.
3201          */
3202         VOP_LINK_REMOVED(XFS_ITOV(cdp), dir_vp, last_cdp_link);
3203
3204         IRELE(cdp);
3205
3206         /* Fall through to std_return with error = 0 or the errno
3207          * from xfs_trans_commit. */
3208  std_return:
3209         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_POSTREMOVE)) {
3210                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
3211                                         dir_vp, DM_RIGHT_NULL,
3212                                         NULL, DM_RIGHT_NULL,
3213                                         name, NULL, dm_di_mode,
3214                                         error, 0);
3215         }
3216         return error;
3217
3218  error1:
3219         xfs_bmap_cancel(&free_list);
3220         cancel_flags |= XFS_TRANS_ABORT;
3221         /* FALLTHROUGH */
3222
3223  error_return:
3224         xfs_trans_cancel(tp, cancel_flags);
3225         goto std_return;
3226 }
3227
3228
3229 /*
3230  * xfs_readdir
3231  *
3232  * Read dp's entries starting at uiop->uio_offset and translate them into
3233  * bufsize bytes worth of struct dirents starting at bufbase.
3234  */
3235 STATIC int
3236 xfs_readdir(
3237         bhv_desc_t      *dir_bdp,
3238         uio_t           *uiop,
3239         cred_t          *credp,
3240         int             *eofp)
3241 {
3242         xfs_inode_t     *dp;
3243         xfs_trans_t     *tp = NULL;
3244         int             error = 0;
3245         uint            lock_mode;
3246
3247         vn_trace_entry(BHV_TO_VNODE(dir_bdp), __FUNCTION__,
3248                                                (inst_t *)__return_address);
3249         dp = XFS_BHVTOI(dir_bdp);
3250
3251         if (XFS_FORCED_SHUTDOWN(dp->i_mount)) {
3252                 return XFS_ERROR(EIO);
3253         }
3254
3255         lock_mode = xfs_ilock_map_shared(dp);
3256         error = XFS_DIR_GETDENTS(dp->i_mount, tp, dp, uiop, eofp);
3257         xfs_iunlock_map_shared(dp, lock_mode);
3258         return error;
3259 }
3260
3261
3262 /*
3263  * xfs_symlink
3264  *
3265  */
3266 STATIC int
3267 xfs_symlink(
3268         bhv_desc_t              *dir_bdp,
3269         vname_t                 *dentry,
3270         vattr_t                 *vap,
3271         char                    *target_path,
3272         vnode_t                 **vpp,
3273         cred_t                  *credp)
3274 {
3275         xfs_trans_t             *tp;
3276         xfs_mount_t             *mp;
3277         xfs_inode_t             *dp;
3278         xfs_inode_t             *ip;
3279         int                     error;
3280         int                     pathlen;
3281         xfs_bmap_free_t         free_list;
3282         xfs_fsblock_t           first_block;
3283         boolean_t               dp_joined_to_trans;
3284         vnode_t                 *dir_vp;
3285         uint                    cancel_flags;
3286         int                     committed;
3287         xfs_fileoff_t           first_fsb;
3288         xfs_filblks_t           fs_blocks;
3289         int                     nmaps;
3290         xfs_bmbt_irec_t         mval[SYMLINK_MAPS];
3291         xfs_daddr_t             d;
3292         char                    *cur_chunk;
3293         int                     byte_cnt;
3294         int                     n;
3295         xfs_buf_t               *bp;
3296         xfs_prid_t              prid;
3297         struct xfs_dquot        *udqp, *gdqp;
3298         uint                    resblks;
3299         char                    *link_name = VNAME(dentry);
3300         int                     link_namelen;
3301
3302         *vpp = NULL;
3303         dir_vp = BHV_TO_VNODE(dir_bdp);
3304         dp = XFS_BHVTOI(dir_bdp);
3305         dp_joined_to_trans = B_FALSE;
3306         error = 0;
3307         ip = NULL;
3308         tp = NULL;
3309
3310         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3311
3312         mp = dp->i_mount;
3313
3314         if (XFS_FORCED_SHUTDOWN(mp))
3315                 return XFS_ERROR(EIO);
3316
3317         link_namelen = VNAMELEN(dentry);
3318
3319         /*
3320          * Check component lengths of the target path name.
3321          */
3322         pathlen = strlen(target_path);
3323         if (pathlen >= MAXPATHLEN)      /* total string too long */
3324                 return XFS_ERROR(ENAMETOOLONG);
3325         if (pathlen >= MAXNAMELEN) {    /* is any component too long? */
3326                 int len, total;
3327                 char *path;
3328
3329                 for(total = 0, path = target_path; total < pathlen;) {
3330                         /*
3331                          * Skip any slashes.
3332                          */
3333                         while(*path == '/') {
3334                                 total++;
3335                                 path++;
3336                         }
3337
3338                         /*
3339                          * Count up to the next slash or end of path.
3340                          * Error out if the component is bigger than MAXNAMELEN.
3341                          */
3342                         for(len = 0; *path != '/' && total < pathlen;total++, path++) {
3343                                 if (++len >= MAXNAMELEN) {
3344                                         error = ENAMETOOLONG;
3345                                         return error;
3346                                 }
3347                         }
3348                 }
3349         }
3350
3351         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_SYMLINK)) {
3352                 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp,
3353                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
3354                                         link_name, target_path, 0, 0, 0);
3355                 if (error)
3356                         return error;
3357         }
3358
3359         /* Return through std_return after this point. */
3360
3361         udqp = gdqp = NULL;
3362         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
3363                 prid = dp->i_d.di_projid;
3364         else if (vap->va_mask & XFS_AT_PROJID)
3365                 prid = (xfs_prid_t)vap->va_projid;
3366         else
3367                 prid = (xfs_prid_t)dfltprid;
3368
3369         /*
3370          * Make sure that we have allocated dquot(s) on disk.
3371          */
3372         error = XFS_QM_DQVOPALLOC(mp, dp,
3373                         current_fsuid(credp), current_fsgid(credp), prid,
3374                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
3375         if (error)
3376                 goto std_return;
3377
3378         tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
3379         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3380         /*
3381          * The symlink will fit into the inode data fork?
3382          * There can't be any attributes so we get the whole variable part.
3383          */
3384         if (pathlen <= XFS_LITINO(mp))
3385                 fs_blocks = 0;
3386         else
3387                 fs_blocks = XFS_B_TO_FSB(mp, pathlen);
3388         resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
3389         error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
3390                         XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3391         if (error == ENOSPC && fs_blocks == 0) {
3392                 resblks = 0;
3393                 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
3394                                 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3395         }
3396         if (error) {
3397                 cancel_flags = 0;
3398                 dp = NULL;
3399                 goto error_return;
3400         }
3401
3402         xfs_ilock(dp, XFS_ILOCK_EXCL);
3403
3404         /*
3405          * Check whether the directory allows new symlinks or not.
3406          */
3407         if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
3408                 error = XFS_ERROR(EPERM);
3409                 goto error_return;
3410         }
3411
3412         /*
3413          * Reserve disk quota : blocks and inode.
3414          */
3415         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
3416         if (error)
3417                 goto error_return;
3418
3419         /*
3420          * Check for ability to enter directory entry, if no space reserved.
3421          */
3422         if (resblks == 0 &&
3423             (error = XFS_DIR_CANENTER(mp, tp, dp, link_name, link_namelen)))
3424                 goto error_return;
3425         /*
3426          * Initialize the bmap freelist prior to calling either
3427          * bmapi or the directory create code.
3428          */
3429         XFS_BMAP_INIT(&free_list, &first_block);
3430
3431         /*
3432          * Allocate an inode for the symlink.
3433          */
3434         error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (vap->va_mode&~S_IFMT),
3435                                1, 0, credp, prid, resblks > 0, &ip, NULL);
3436         if (error) {
3437                 if (error == ENOSPC)
3438                         goto error_return;
3439                 goto error1;
3440         }
3441         ITRACE(ip);
3442
3443         VN_HOLD(dir_vp);
3444         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3445         dp_joined_to_trans = B_TRUE;
3446
3447         /*
3448          * Also attach the dquot(s) to it, if applicable.
3449          */
3450         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
3451
3452         if (resblks)
3453                 resblks -= XFS_IALLOC_SPACE_RES(mp);
3454         /*
3455          * If the symlink will fit into the inode, write it inline.
3456          */
3457         if (pathlen <= XFS_IFORK_DSIZE(ip)) {
3458                 xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
3459                 memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
3460                 ip->i_d.di_size = pathlen;
3461
3462                 /*
3463                  * The inode was initially created in extent format.
3464                  */
3465                 ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
3466                 ip->i_df.if_flags |= XFS_IFINLINE;
3467
3468                 ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
3469                 xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
3470
3471         } else {
3472                 first_fsb = 0;
3473                 nmaps = SYMLINK_MAPS;
3474
3475                 error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
3476                                   XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
3477                                   &first_block, resblks, mval, &nmaps,
3478                                   &free_list);
3479                 if (error) {
3480                         goto error1;
3481                 }
3482
3483                 if (resblks)
3484                         resblks -= fs_blocks;
3485                 ip->i_d.di_size = pathlen;
3486                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3487
3488                 cur_chunk = target_path;
3489                 for (n = 0; n < nmaps; n++) {
3490                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
3491                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
3492                         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
3493                                                BTOBB(byte_cnt), 0);
3494                         ASSERT(bp && !XFS_BUF_GETERROR(bp));
3495                         if (pathlen < byte_cnt) {
3496                                 byte_cnt = pathlen;
3497                         }
3498                         pathlen -= byte_cnt;
3499
3500                         memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
3501                         cur_chunk += byte_cnt;
3502
3503                         xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
3504                 }
3505         }
3506
3507         /*
3508          * Create the directory entry for the symlink.
3509          */
3510         error = XFS_DIR_CREATENAME(mp, tp, dp, link_name, link_namelen,
3511                         ip->i_ino, &first_block, &free_list, resblks);
3512         if (error) {
3513                 goto error1;
3514         }
3515         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3516         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
3517
3518         /*
3519          * Bump the in memory version number of the parent directory
3520          * so that other processes accessing it will recognize that
3521          * the directory has changed.
3522          */
3523         dp->i_gen++;
3524
3525         /*
3526          * If this is a synchronous mount, make sure that the
3527          * symlink transaction goes to disk before returning to
3528          * the user.
3529          */
3530         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3531                 xfs_trans_set_sync(tp);
3532         }
3533
3534         /*
3535          * xfs_trans_commit normally decrements the vnode ref count
3536          * when it unlocks the inode. Since we want to return the
3537          * vnode to the caller, we bump the vnode ref count now.
3538          */
3539         IHOLD(ip);
3540
3541         error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
3542         if (error) {
3543                 goto error2;
3544         }
3545         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
3546         XFS_QM_DQRELE(mp, udqp);
3547         XFS_QM_DQRELE(mp, gdqp);
3548
3549         /* Fall through to std_return with error = 0 or errno from
3550          * xfs_trans_commit     */
3551 std_return:
3552         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
3553                              DM_EVENT_POSTSYMLINK)) {
3554                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
3555                                         dir_vp, DM_RIGHT_NULL,
3556                                         error ? NULL : XFS_ITOV(ip),
3557                                         DM_RIGHT_NULL, link_name, target_path,
3558                                         0, error, 0);
3559         }
3560
3561         if (!error) {
3562                 vnode_t *vp;
3563
3564                 ASSERT(ip);
3565                 vp = XFS_ITOV(ip);
3566                 *vpp = vp;
3567         }
3568         return error;
3569
3570  error2:
3571         IRELE(ip);
3572  error1:
3573         xfs_bmap_cancel(&free_list);
3574         cancel_flags |= XFS_TRANS_ABORT;
3575  error_return:
3576         xfs_trans_cancel(tp, cancel_flags);
3577         XFS_QM_DQRELE(mp, udqp);
3578         XFS_QM_DQRELE(mp, gdqp);
3579
3580         if (!dp_joined_to_trans && (dp != NULL)) {
3581                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
3582         }
3583
3584         goto std_return;
3585 }
3586
3587
3588 /*
3589  * xfs_fid2
3590  *
3591  * A fid routine that takes a pointer to a previously allocated
3592  * fid structure (like xfs_fast_fid) but uses a 64 bit inode number.
3593  */
3594 STATIC int
3595 xfs_fid2(
3596         bhv_desc_t      *bdp,
3597         fid_t           *fidp)
3598 {
3599         xfs_inode_t     *ip;
3600         xfs_fid2_t      *xfid;
3601
3602         vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
3603                                        (inst_t *)__return_address);
3604         ASSERT(sizeof(fid_t) >= sizeof(xfs_fid2_t));
3605
3606         xfid = (xfs_fid2_t *)fidp;
3607         ip = XFS_BHVTOI(bdp);
3608         xfid->fid_len = sizeof(xfs_fid2_t) - sizeof(xfid->fid_len);
3609         xfid->fid_pad = 0;
3610         /*
3611          * use memcpy because the inode is a long long and there's no
3612          * assurance that xfid->fid_ino is properly aligned.
3613          */
3614         memcpy(&xfid->fid_ino, &ip->i_ino, sizeof(xfid->fid_ino));
3615         xfid->fid_gen = ip->i_d.di_gen;
3616
3617         return 0;
3618 }
3619
3620
3621 /*
3622  * xfs_rwlock
3623  */
3624 int
3625 xfs_rwlock(
3626         bhv_desc_t      *bdp,
3627         vrwlock_t       locktype)
3628 {
3629         xfs_inode_t     *ip;
3630         vnode_t         *vp;
3631
3632         vp = BHV_TO_VNODE(bdp);
3633         if (VN_ISDIR(vp))
3634                 return 1;
3635         ip = XFS_BHVTOI(bdp);
3636         if (locktype == VRWLOCK_WRITE) {
3637                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3638         } else if (locktype == VRWLOCK_TRY_READ) {
3639                 return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED);
3640         } else if (locktype == VRWLOCK_TRY_WRITE) {
3641                 return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL);
3642         } else {
3643                 ASSERT((locktype == VRWLOCK_READ) ||
3644                        (locktype == VRWLOCK_WRITE_DIRECT));
3645                 xfs_ilock(ip, XFS_IOLOCK_SHARED);
3646         }
3647
3648         return 1;
3649 }
3650
3651
3652 /*
3653  * xfs_rwunlock
3654  */
3655 void
3656 xfs_rwunlock(
3657         bhv_desc_t      *bdp,
3658         vrwlock_t       locktype)
3659 {
3660         xfs_inode_t     *ip;
3661         vnode_t         *vp;
3662
3663         vp = BHV_TO_VNODE(bdp);
3664         if (VN_ISDIR(vp))
3665                 return;
3666         ip = XFS_BHVTOI(bdp);
3667         if (locktype == VRWLOCK_WRITE) {
3668                 /*
3669                  * In the write case, we may have added a new entry to
3670                  * the reference cache.  This might store a pointer to
3671                  * an inode to be released in this inode.  If it is there,
3672                  * clear the pointer and release the inode after unlocking
3673                  * this one.
3674                  */
3675                 xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL);
3676         } else {
3677                 ASSERT((locktype == VRWLOCK_READ) ||
3678                        (locktype == VRWLOCK_WRITE_DIRECT));
3679                 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
3680         }
3681         return;
3682 }
3683
3684 STATIC int
3685 xfs_inode_flush(
3686         bhv_desc_t      *bdp,
3687         int             flags)
3688 {
3689         xfs_inode_t     *ip;
3690         xfs_mount_t     *mp;
3691         xfs_inode_log_item_t *iip;
3692         int             error = 0;
3693
3694         ip = XFS_BHVTOI(bdp);
3695         mp = ip->i_mount;
3696         iip = ip->i_itemp;
3697
3698         if (XFS_FORCED_SHUTDOWN(mp))
3699                 return XFS_ERROR(EIO);
3700
3701         /*
3702          * Bypass inodes which have already been cleaned by
3703          * the inode flush clustering code inside xfs_iflush
3704          */
3705         if ((ip->i_update_core == 0) &&
3706             ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
3707                 return 0;
3708
3709         if (flags & FLUSH_LOG) {
3710                 if (iip && iip->ili_last_lsn) {
3711                         xlog_t          *log = mp->m_log;
3712                         xfs_lsn_t       sync_lsn;
3713                         int             s, log_flags = XFS_LOG_FORCE;
3714
3715                         s = GRANT_LOCK(log);
3716                         sync_lsn = log->l_last_sync_lsn;
3717                         GRANT_UNLOCK(log, s);
3718
3719                         if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) <= 0))
3720                                 return 0;
3721
3722                         if (flags & FLUSH_SYNC)
3723                                 log_flags |= XFS_LOG_SYNC;
3724                         return xfs_log_force(mp, iip->ili_last_lsn, log_flags);
3725                 }
3726         }
3727
3728         /*
3729          * We make this non-blocking if the inode is contended,
3730          * return EAGAIN to indicate to the caller that they
3731          * did not succeed. This prevents the flush path from
3732          * blocking on inodes inside another operation right
3733          * now, they get caught later by xfs_sync.
3734          */
3735         if (flags & FLUSH_INODE) {
3736                 int     flush_flags;
3737
3738                 if (xfs_ipincount(ip))
3739                         return EAGAIN;
3740
3741                 if (flags & FLUSH_SYNC) {
3742                         xfs_ilock(ip, XFS_ILOCK_SHARED);
3743                         xfs_iflock(ip);
3744                 } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3745                         if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
3746                                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3747                                 return EAGAIN;
3748                         }
3749                 } else {
3750                         return EAGAIN;
3751                 }
3752
3753                 if (flags & FLUSH_SYNC)
3754                         flush_flags = XFS_IFLUSH_SYNC;
3755                 else
3756                         flush_flags = XFS_IFLUSH_ASYNC;
3757
3758                 error = xfs_iflush(ip, flush_flags);
3759                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3760         }
3761
3762         return error;
3763 }
3764
3765
3766 int
3767 xfs_set_dmattrs (
3768         bhv_desc_t      *bdp,
3769         u_int           evmask,
3770         u_int16_t       state,
3771         cred_t          *credp)
3772 {
3773         xfs_inode_t     *ip;
3774         xfs_trans_t     *tp;
3775         xfs_mount_t     *mp;
3776         int             error;
3777
3778         if (!capable(CAP_SYS_ADMIN))
3779                 return XFS_ERROR(EPERM);
3780
3781         ip = XFS_BHVTOI(bdp);
3782         mp = ip->i_mount;
3783
3784         if (XFS_FORCED_SHUTDOWN(mp))
3785                 return XFS_ERROR(EIO);
3786
3787         tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
3788         error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
3789         if (error) {
3790                 xfs_trans_cancel(tp, 0);
3791                 return error;
3792         }
3793         xfs_ilock(ip, XFS_ILOCK_EXCL);
3794         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3795
3796         ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask = evmask;
3797         ip->i_iocore.io_dmstate  = ip->i_d.di_dmstate  = state;
3798
3799         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3800         IHOLD(ip);
3801         error = xfs_trans_commit(tp, 0, NULL);
3802
3803         return error;
3804 }
3805
3806
3807 /*
3808  * xfs_reclaim
3809  */
3810 STATIC int
3811 xfs_reclaim(
3812         bhv_desc_t      *bdp)
3813 {
3814         xfs_inode_t     *ip;
3815         vnode_t         *vp;
3816
3817         vp = BHV_TO_VNODE(bdp);
3818         ip = XFS_BHVTOI(bdp);
3819
3820         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
3821
3822         ASSERT(!VN_MAPPED(vp));
3823
3824         /* bad inode, get out here ASAP */
3825         if (VN_BAD(vp)) {
3826                 xfs_ireclaim(ip);
3827                 return 0;
3828         }
3829
3830         vn_iowait(vp);
3831
3832         ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
3833
3834         /*
3835          * Make sure the atime in the XFS inode is correct before freeing the
3836          * Linux inode.
3837          */
3838         xfs_synchronize_atime(ip);
3839
3840         /* If we have nothing to flush with this inode then complete the
3841          * teardown now, otherwise break the link between the xfs inode
3842          * and the linux inode and clean up the xfs inode later. This
3843          * avoids flushing the inode to disk during the delete operation
3844          * itself.
3845          */
3846         if (!ip->i_update_core && (ip->i_itemp == NULL)) {
3847                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3848                 xfs_iflock(ip);
3849                 return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
3850         } else {
3851                 xfs_mount_t     *mp = ip->i_mount;
3852
3853                 /* Protect sync from us */
3854                 XFS_MOUNT_ILOCK(mp);
3855                 vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
3856                 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
3857                 ip->i_flags |= XFS_IRECLAIMABLE;
3858                 XFS_MOUNT_IUNLOCK(mp);
3859         }
3860         return 0;
3861 }
3862
3863 int
3864 xfs_finish_reclaim(
3865         xfs_inode_t     *ip,
3866         int             locked,
3867         int             sync_mode)
3868 {
3869         xfs_ihash_t     *ih = ip->i_hash;
3870         vnode_t         *vp = XFS_ITOV_NULL(ip);
3871         int             error;
3872
3873         if (vp && VN_BAD(vp))
3874                 goto reclaim;
3875
3876         /* The hash lock here protects a thread in xfs_iget_core from
3877          * racing with us on linking the inode back with a vnode.
3878          * Once we have the XFS_IRECLAIM flag set it will not touch
3879          * us.
3880          */
3881         write_lock(&ih->ih_lock);
3882         if ((ip->i_flags & XFS_IRECLAIM) ||
3883             (!(ip->i_flags & XFS_IRECLAIMABLE) && vp == NULL)) {
3884                 write_unlock(&ih->ih_lock);
3885                 if (locked) {
3886                         xfs_ifunlock(ip);
3887                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3888                 }
3889                 return 1;
3890         }
3891         ip->i_flags |= XFS_IRECLAIM;
3892         write_unlock(&ih->ih_lock);
3893
3894         /*
3895          * If the inode is still dirty, then flush it out.  If the inode
3896          * is not in the AIL, then it will be OK to flush it delwri as
3897          * long as xfs_iflush() does not keep any references to the inode.
3898          * We leave that decision up to xfs_iflush() since it has the
3899          * knowledge of whether it's OK to simply do a delwri flush of
3900          * the inode or whether we need to wait until the inode is
3901          * pulled from the AIL.
3902          * We get the flush lock regardless, though, just to make sure
3903          * we don't free it while it is being flushed.
3904          */
3905         if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3906                 if (!locked) {
3907                         xfs_ilock(ip, XFS_ILOCK_EXCL);
3908                         xfs_iflock(ip);
3909                 }
3910
3911                 if (ip->i_update_core ||
3912                     ((ip->i_itemp != NULL) &&
3913                      (ip->i_itemp->ili_format.ilf_fields != 0))) {
3914                         error = xfs_iflush(ip, sync_mode);
3915                         /*
3916                          * If we hit an error, typically because of filesystem
3917                          * shutdown, we don't need to let vn_reclaim to know
3918                          * because we're gonna reclaim the inode anyway.
3919                          */
3920                         if (error) {
3921                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3922                                 goto reclaim;
3923                         }
3924                         xfs_iflock(ip); /* synchronize with xfs_iflush_done */
3925                 }
3926
3927                 ASSERT(ip->i_update_core == 0);
3928                 ASSERT(ip->i_itemp == NULL ||
3929                        ip->i_itemp->ili_format.ilf_fields == 0);
3930                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3931         } else if (locked) {
3932                 /*
3933                  * We are not interested in doing an iflush if we're
3934                  * in the process of shutting down the filesystem forcibly.
3935                  * So, just reclaim the inode.
3936                  */
3937                 xfs_ifunlock(ip);
3938                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3939         }
3940
3941  reclaim:
3942         xfs_ireclaim(ip);
3943         return 0;
3944 }
3945
3946 int
3947 xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
3948 {
3949         int             purged;
3950         xfs_inode_t     *ip, *n;
3951         int             done = 0;
3952
3953         while (!done) {
3954                 purged = 0;
3955                 XFS_MOUNT_ILOCK(mp);
3956                 list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
3957                         if (noblock) {
3958                                 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
3959                                         continue;
3960                                 if (xfs_ipincount(ip) ||
3961                                     !xfs_iflock_nowait(ip)) {
3962                                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3963                                         continue;
3964                                 }
3965                         }
3966                         XFS_MOUNT_IUNLOCK(mp);
3967                         if (xfs_finish_reclaim(ip, noblock,
3968                                         XFS_IFLUSH_DELWRI_ELSE_ASYNC))
3969                                 delay(1);
3970                         purged = 1;
3971                         break;
3972                 }
3973
3974                 done = !purged;
3975         }
3976
3977         XFS_MOUNT_IUNLOCK(mp);
3978         return 0;
3979 }
3980
3981 /*
3982  * xfs_alloc_file_space()
3983  *      This routine allocates disk space for the given file.
3984  *
3985  *      If alloc_type == 0, this request is for an ALLOCSP type
3986  *      request which will change the file size.  In this case, no
3987  *      DMAPI event will be generated by the call.  A TRUNCATE event
3988  *      will be generated later by xfs_setattr.
3989  *
3990  *      If alloc_type != 0, this request is for a RESVSP type
3991  *      request, and a DMAPI DM_EVENT_WRITE will be generated if the
3992  *      lower block boundary byte address is less than the file's
3993  *      length.
3994  *
3995  * RETURNS:
3996  *       0 on success
3997  *      errno on error
3998  *
3999  */
4000 STATIC int
4001 xfs_alloc_file_space(
4002         xfs_inode_t             *ip,
4003         xfs_off_t               offset,
4004         xfs_off_t               len,
4005         int                     alloc_type,
4006         int                     attr_flags)
4007 {
4008         xfs_mount_t             *mp = ip->i_mount;
4009         xfs_off_t               count;
4010         xfs_filblks_t           allocated_fsb;
4011         xfs_filblks_t           allocatesize_fsb;
4012         xfs_extlen_t            extsz, temp;
4013         xfs_fileoff_t           startoffset_fsb;
4014         xfs_fsblock_t           firstfsb;
4015         int                     nimaps;
4016         int                     bmapi_flag;
4017         int                     quota_flag;
4018         int                     rt;
4019         xfs_trans_t             *tp;
4020         xfs_bmbt_irec_t         imaps[1], *imapp;
4021         xfs_bmap_free_t         free_list;
4022         uint                    qblocks, resblks, resrtextents;
4023         int                     committed;
4024         int                     error;
4025
4026         vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
4027
4028         if (XFS_FORCED_SHUTDOWN(mp))
4029                 return XFS_ERROR(EIO);
4030
4031         rt = XFS_IS_REALTIME_INODE(ip);
4032         if (unlikely(rt)) {
4033                 if (!(extsz = ip->i_d.di_extsize))
4034                         extsz = mp->m_sb.sb_rextsize;
4035         } else {
4036                 extsz = ip->i_d.di_extsize;
4037         }
4038
4039         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4040                 return error;
4041
4042         if (len <= 0)
4043                 return XFS_ERROR(EINVAL);
4044
4045         count = len;
4046         error = 0;
4047         imapp = &imaps[0];
4048         nimaps = 1;
4049         bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
4050         startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
4051         allocatesize_fsb = XFS_B_TO_FSB(mp, count);
4052
4053         /*      Generate a DMAPI event if needed.       */
4054         if (alloc_type != 0 && offset < ip->i_d.di_size &&
4055                         (attr_flags&ATTR_DMI) == 0  &&
4056                         DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
4057                 xfs_off_t           end_dmi_offset;
4058
4059                 end_dmi_offset = offset+len;
4060                 if (end_dmi_offset > ip->i_d.di_size)
4061                         end_dmi_offset = ip->i_d.di_size;
4062                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
4063                         offset, end_dmi_offset - offset,
4064                         0, NULL);
4065                 if (error)
4066                         return error;
4067         }
4068
4069         /*
4070          * Allocate file space until done or until there is an error
4071          */
4072 retry:
4073         while (allocatesize_fsb && !error) {
4074                 xfs_fileoff_t   s, e;
4075
4076                 /*
4077                  * Determine space reservations for data/realtime.
4078                  */
4079                 if (unlikely(extsz)) {
4080                         s = startoffset_fsb;
4081                         do_div(s, extsz);
4082                         s *= extsz;
4083                         e = startoffset_fsb + allocatesize_fsb;
4084                         if ((temp = do_mod(startoffset_fsb, extsz)))
4085                                 e += temp;
4086                         if ((temp = do_mod(e, extsz)))
4087                                 e += extsz - temp;
4088                 } else {
4089                         s = 0;
4090                         e = allocatesize_fsb;
4091                 }
4092
4093                 if (unlikely(rt)) {
4094                         resrtextents = qblocks = (uint)(e - s);
4095                         resrtextents /= mp->m_sb.sb_rextsize;
4096                         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
4097                         quota_flag = XFS_QMOPT_RES_RTBLKS;
4098                 } else {
4099                         resrtextents = 0;
4100                         resblks = qblocks = \
4101                                 XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
4102                         quota_flag = XFS_QMOPT_RES_REGBLKS;
4103                 }
4104
4105                 /*
4106                  * Allocate and setup the transaction.
4107                  */
4108                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4109                 error = xfs_trans_reserve(tp, resblks,
4110                                           XFS_WRITE_LOG_RES(mp), resrtextents,
4111                                           XFS_TRANS_PERM_LOG_RES,
4112                                           XFS_WRITE_LOG_COUNT);
4113                 /*
4114                  * Check for running out of space
4115                  */
4116                 if (error) {
4117                         /*
4118                          * Free the transaction structure.
4119                          */
4120                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4121                         xfs_trans_cancel(tp, 0);
4122                         break;
4123                 }
4124                 xfs_ilock(ip, XFS_ILOCK_EXCL);
4125                 error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
4126                                                       qblocks, 0, quota_flag);
4127                 if (error)
4128                         goto error1;
4129
4130                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4131                 xfs_trans_ihold(tp, ip);
4132
4133                 /*
4134                  * Issue the xfs_bmapi() call to allocate the blocks
4135                  */
4136                 XFS_BMAP_INIT(&free_list, &firstfsb);
4137                 error = xfs_bmapi(tp, ip, startoffset_fsb,
4138                                   allocatesize_fsb, bmapi_flag,
4139                                   &firstfsb, 0, imapp, &nimaps,
4140                                   &free_list);
4141                 if (error) {
4142                         goto error0;
4143                 }
4144
4145                 /*
4146                  * Complete the transaction
4147                  */
4148                 error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
4149                 if (error) {
4150                         goto error0;
4151                 }
4152
4153                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
4154                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4155                 if (error) {
4156                         break;
4157                 }
4158
4159                 allocated_fsb = imapp->br_blockcount;
4160
4161                 if (nimaps == 0) {
4162                         error = XFS_ERROR(ENOSPC);
4163                         break;
4164                 }
4165
4166                 startoffset_fsb += allocated_fsb;
4167                 allocatesize_fsb -= allocated_fsb;
4168         }
4169 dmapi_enospc_check:
4170         if (error == ENOSPC && (attr_flags&ATTR_DMI) == 0 &&
4171             DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_NOSPACE)) {
4172
4173                 error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
4174                                 XFS_ITOV(ip), DM_RIGHT_NULL,
4175                                 XFS_ITOV(ip), DM_RIGHT_NULL,
4176                                 NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
4177                 if (error == 0)
4178                         goto retry;     /* Maybe DMAPI app. has made space */
4179                 /* else fall through with error from XFS_SEND_DATA */
4180         }
4181
4182         return error;
4183
4184 error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
4185         xfs_bmap_cancel(&free_list);
4186         XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
4187
4188 error1: /* Just cancel transaction */
4189         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4190         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4191         goto dmapi_enospc_check;
4192 }
4193
4194 /*
4195  * Zero file bytes between startoff and endoff inclusive.
4196  * The iolock is held exclusive and no blocks are buffered.
4197  */
4198 STATIC int
4199 xfs_zero_remaining_bytes(
4200         xfs_inode_t             *ip,
4201         xfs_off_t               startoff,
4202         xfs_off_t               endoff)
4203 {
4204         xfs_bmbt_irec_t         imap;
4205         xfs_fileoff_t           offset_fsb;
4206         xfs_off_t               lastoffset;
4207         xfs_off_t               offset;
4208         xfs_buf_t               *bp;
4209         xfs_mount_t             *mp = ip->i_mount;
4210         int                     nimap;
4211         int                     error = 0;
4212
4213         bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
4214                                 ip->i_d.di_flags & XFS_DIFLAG_REALTIME ?
4215                                 mp->m_rtdev_targp : mp->m_ddev_targp);
4216
4217         for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
4218                 offset_fsb = XFS_B_TO_FSBT(mp, offset);
4219                 nimap = 1;
4220                 error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0, NULL, 0, &imap,
4221                         &nimap, NULL);
4222                 if (error || nimap < 1)
4223                         break;
4224                 ASSERT(imap.br_blockcount >= 1);
4225                 ASSERT(imap.br_startoff == offset_fsb);
4226                 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
4227                 if (lastoffset > endoff)
4228                         lastoffset = endoff;
4229                 if (imap.br_startblock == HOLESTARTBLOCK)
4230                         continue;
4231                 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4232                 if (imap.br_state == XFS_EXT_UNWRITTEN)
4233                         continue;
4234                 XFS_BUF_UNDONE(bp);
4235                 XFS_BUF_UNWRITE(bp);
4236                 XFS_BUF_READ(bp);
4237                 XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
4238                 xfsbdstrat(mp, bp);
4239                 if ((error = xfs_iowait(bp))) {
4240                         xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
4241                                           mp, bp, XFS_BUF_ADDR(bp));
4242                         break;
4243                 }
4244                 memset(XFS_BUF_PTR(bp) +
4245                         (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
4246                       0, lastoffset - offset + 1);
4247                 XFS_BUF_UNDONE(bp);
4248                 XFS_BUF_UNREAD(bp);
4249                 XFS_BUF_WRITE(bp);
4250                 xfsbdstrat(mp, bp);
4251                 if ((error = xfs_iowait(bp))) {
4252                         xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
4253                                           mp, bp, XFS_BUF_ADDR(bp));
4254                         break;
4255                 }
4256         }
4257         xfs_buf_free(bp);
4258         return error;
4259 }
4260
4261 /*
4262  * xfs_free_file_space()
4263  *      This routine frees disk space for the given file.
4264  *
4265  *      This routine is only called by xfs_change_file_space
4266  *      for an UNRESVSP type call.
4267  *
4268  * RETURNS:
4269  *       0 on success
4270  *      errno on error
4271  *
4272  */
4273 STATIC int
4274 xfs_free_file_space(
4275         xfs_inode_t             *ip,
4276         xfs_off_t               offset,
4277         xfs_off_t               len,
4278         int                     attr_flags)
4279 {
4280         vnode_t                 *vp;
4281         int                     committed;
4282         int                     done;
4283         xfs_off_t               end_dmi_offset;
4284         xfs_fileoff_t           endoffset_fsb;
4285         int                     error;
4286         xfs_fsblock_t           firstfsb;
4287         xfs_bmap_free_t         free_list;
4288         xfs_off_t               ilen;
4289         xfs_bmbt_irec_t         imap;
4290         xfs_off_t               ioffset;
4291         xfs_extlen_t            mod=0;
4292         xfs_mount_t             *mp;
4293         int                     nimap;
4294         uint                    resblks;
4295         int                     rounding;
4296         int                     rt;
4297         xfs_fileoff_t           startoffset_fsb;
4298         xfs_trans_t             *tp;
4299         int                     need_iolock = 1;
4300
4301         vp = XFS_ITOV(ip);
4302         mp = ip->i_mount;
4303
4304         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
4305
4306         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4307                 return error;
4308
4309         error = 0;
4310         if (len <= 0)   /* if nothing being freed */
4311                 return error;
4312         rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME);
4313         startoffset_fsb = XFS_B_TO_FSB(mp, offset);
4314         end_dmi_offset = offset + len;
4315         endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
4316
4317         if (offset < ip->i_d.di_size &&
4318             (attr_flags & ATTR_DMI) == 0 &&
4319             DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
4320                 if (end_dmi_offset > ip->i_d.di_size)
4321                         end_dmi_offset = ip->i_d.di_size;
4322                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp,
4323                                 offset, end_dmi_offset - offset,
4324                                 AT_DELAY_FLAG(attr_flags), NULL);
4325                 if (error)
4326                         return error;
4327         }
4328
4329         ASSERT(attr_flags & ATTR_NOLOCK ? attr_flags & ATTR_DMI : 1);
4330         if (attr_flags & ATTR_NOLOCK)
4331                 need_iolock = 0;
4332         if (need_iolock) {
4333                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
4334                 vn_iowait(vp);  /* wait for the completion of any pending DIOs */
4335         }
4336
4337         rounding = MAX((__uint8_t)(1 << mp->m_sb.sb_blocklog),
4338                         (__uint8_t)NBPP);
4339         ilen = len + (offset & (rounding - 1));
4340         ioffset = offset & ~(rounding - 1);
4341         if (ilen & (rounding - 1))
4342                 ilen = (ilen + rounding) & ~(rounding - 1);
4343
4344         if (VN_CACHED(vp) != 0) {
4345                 xfs_inval_cached_trace(&ip->i_iocore, ioffset, -1,
4346                                 ctooff(offtoct(ioffset)), -1);
4347                 VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(ioffset)),
4348                                 -1, FI_REMAPF_LOCKED);
4349         }
4350
4351         /*
4352          * Need to zero the stuff we're not freeing, on disk.
4353          * If its a realtime file & can't use unwritten extents then we
4354          * actually need to zero the extent edges.  Otherwise xfs_bunmapi
4355          * will take care of it for us.
4356          */
4357         if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
4358                 nimap = 1;
4359                 error = xfs_bmapi(NULL, ip, startoffset_fsb, 1, 0, NULL, 0,
4360                         &imap, &nimap, NULL);
4361                 if (error)
4362                         goto out_unlock_iolock;
4363                 ASSERT(nimap == 0 || nimap == 1);
4364                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4365                         xfs_daddr_t     block;
4366
4367                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4368                         block = imap.br_startblock;
4369                         mod = do_div(block, mp->m_sb.sb_rextsize);
4370                         if (mod)
4371                                 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
4372                 }
4373                 nimap = 1;
4374                 error = xfs_bmapi(NULL, ip, endoffset_fsb - 1, 1, 0, NULL, 0,
4375                         &imap, &nimap, NULL);
4376                 if (error)
4377                         goto out_unlock_iolock;
4378                 ASSERT(nimap == 0 || nimap == 1);
4379                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4380                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4381                         mod++;
4382                         if (mod && (mod != mp->m_sb.sb_rextsize))
4383                                 endoffset_fsb -= mod;
4384                 }
4385         }
4386         if ((done = (endoffset_fsb <= startoffset_fsb)))
4387                 /*
4388                  * One contiguous piece to clear
4389                  */
4390                 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
4391         else {
4392                 /*
4393                  * Some full blocks, possibly two pieces to clear
4394                  */
4395                 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
4396                         error = xfs_zero_remaining_bytes(ip, offset,
4397                                 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
4398                 if (!error &&
4399                     XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
4400                         error = xfs_zero_remaining_bytes(ip,
4401                                 XFS_FSB_TO_B(mp, endoffset_fsb),
4402                                 offset + len - 1);
4403         }
4404
4405         /*
4406          * free file space until done or until there is an error
4407          */
4408         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
4409         while (!error && !done) {
4410
4411                 /*
4412                  * allocate and setup the transaction
4413                  */
4414                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4415                 error = xfs_trans_reserve(tp,
4416                                           resblks,
4417                                           XFS_WRITE_LOG_RES(mp),
4418                                           0,
4419                                           XFS_TRANS_PERM_LOG_RES,
4420                                           XFS_WRITE_LOG_COUNT);
4421
4422                 /*
4423                  * check for running out of space
4424                  */
4425                 if (error) {
4426                         /*
4427                          * Free the transaction structure.
4428                          */
4429                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4430                         xfs_trans_cancel(tp, 0);
4431                         break;
4432                 }
4433                 xfs_ilock(ip, XFS_ILOCK_EXCL);
4434                 error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
4435                                 ip->i_udquot, ip->i_gdquot, resblks, 0,
4436                                 XFS_QMOPT_RES_REGBLKS);
4437                 if (error)
4438                         goto error1;
4439
4440                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4441                 xfs_trans_ihold(tp, ip);
4442
4443                 /*
4444                  * issue the bunmapi() call to free the blocks
4445                  */
4446                 XFS_BMAP_INIT(&free_list, &firstfsb);
4447                 error = xfs_bunmapi(tp, ip, startoffset_fsb,
4448                                   endoffset_fsb - startoffset_fsb,
4449                                   0, 2, &firstfsb, &free_list, &done);
4450                 if (error) {
4451                         goto error0;
4452                 }
4453
4454                 /*
4455                  * complete the transaction
4456                  */
4457                 error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
4458                 if (error) {
4459                         goto error0;
4460                 }
4461
4462                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
4463                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4464         }
4465
4466  out_unlock_iolock:
4467         if (need_iolock)
4468                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
4469         return error;
4470
4471  error0:
4472         xfs_bmap_cancel(&free_list);
4473  error1:
4474         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4475         xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
4476                     XFS_ILOCK_EXCL);
4477         return error;
4478 }
4479
4480 /*
4481  * xfs_change_file_space()
4482  *      This routine allocates or frees disk space for the given file.
4483  *      The user specified parameters are checked for alignment and size
4484  *      limitations.
4485  *
4486  * RETURNS:
4487  *       0 on success
4488  *      errno on error
4489  *
4490  */
4491 int
4492 xfs_change_file_space(
4493         bhv_desc_t      *bdp,
4494         int             cmd,
4495         xfs_flock64_t   *bf,
4496         xfs_off_t       offset,
4497         cred_t          *credp,
4498         int             attr_flags)
4499 {
4500         int             clrprealloc;
4501         int             error;
4502         xfs_fsize_t     fsize;
4503         xfs_inode_t     *ip;
4504         xfs_mount_t     *mp;
4505         int             setprealloc;
4506         xfs_off_t       startoffset;
4507         xfs_off_t       llen;
4508         xfs_trans_t     *tp;
4509         vattr_t         va;
4510         vnode_t         *vp;
4511
4512         vp = BHV_TO_VNODE(bdp);
4513         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
4514
4515         ip = XFS_BHVTOI(bdp);
4516         mp = ip->i_mount;
4517
4518         /*
4519          * must be a regular file and have write permission
4520          */
4521         if (!VN_ISREG(vp))
4522                 return XFS_ERROR(EINVAL);
4523
4524         xfs_ilock(ip, XFS_ILOCK_SHARED);
4525
4526         if ((error = xfs_iaccess(ip, S_IWUSR, credp))) {
4527                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
4528                 return error;
4529         }
4530
4531         xfs_iunlock(ip, XFS_ILOCK_SHARED);
4532
4533         switch (bf->l_whence) {
4534         case 0: /*SEEK_SET*/
4535                 break;
4536         case 1: /*SEEK_CUR*/
4537                 bf->l_start += offset;
4538                 break;
4539         case 2: /*SEEK_END*/
4540                 bf->l_start += ip->i_d.di_size;
4541                 break;
4542         default:
4543                 return XFS_ERROR(EINVAL);
4544         }
4545
4546         llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
4547
4548         if (   (bf->l_start < 0)
4549             || (bf->l_start > XFS_MAXIOFFSET(mp))
4550             || (bf->l_start + llen < 0)
4551             || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
4552                 return XFS_ERROR(EINVAL);
4553
4554         bf->l_whence = 0;
4555
4556         startoffset = bf->l_start;
4557         fsize = ip->i_d.di_size;
4558
4559         /*
4560          * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
4561          * file space.
4562          * These calls do NOT zero the data space allocated to the file,
4563          * nor do they change the file size.
4564          *
4565          * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
4566          * space.
4567          * These calls cause the new file data to be zeroed and the file
4568          * size to be changed.
4569          */
4570         setprealloc = clrprealloc = 0;
4571
4572         switch (cmd) {
4573         case XFS_IOC_RESVSP:
4574         case XFS_IOC_RESVSP64:
4575                 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
4576                                                                 1, attr_flags);
4577                 if (error)
4578                         return error;
4579                 setprealloc = 1;
4580                 break;
4581
4582         case XFS_IOC_UNRESVSP:
4583         case XFS_IOC_UNRESVSP64:
4584                 if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
4585                                                                 attr_flags)))
4586                         return error;
4587                 break;
4588
4589         case XFS_IOC_ALLOCSP:
4590         case XFS_IOC_ALLOCSP64:
4591         case XFS_IOC_FREESP:
4592         case XFS_IOC_FREESP64:
4593                 if (startoffset > fsize) {
4594                         error = xfs_alloc_file_space(ip, fsize,
4595                                         startoffset - fsize, 0, attr_flags);
4596                         if (error)
4597                                 break;
4598                 }
4599
4600                 va.va_mask = XFS_AT_SIZE;
4601                 va.va_size = startoffset;
4602
4603                 error = xfs_setattr(bdp, &va, attr_flags, credp);
4604
4605                 if (error)
4606                         return error;
4607
4608                 clrprealloc = 1;
4609                 break;
4610
4611         default:
4612                 ASSERT(0);
4613                 return XFS_ERROR(EINVAL);
4614         }
4615
4616         /*
4617          * update the inode timestamp, mode, and prealloc flag bits
4618          */
4619         tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
4620
4621         if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
4622                                       0, 0, 0))) {
4623                 /* ASSERT(0); */
4624                 xfs_trans_cancel(tp, 0);
4625                 return error;
4626         }
4627
4628         xfs_ilock(ip, XFS_ILOCK_EXCL);
4629
4630         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4631         xfs_trans_ihold(tp, ip);
4632
4633         if ((attr_flags & ATTR_DMI) == 0) {
4634                 ip->i_d.di_mode &= ~S_ISUID;
4635
4636                 /*
4637                  * Note that we don't have to worry about mandatory
4638                  * file locking being disabled here because we only
4639                  * clear the S_ISGID bit if the Group execute bit is
4640                  * on, but if it was on then mandatory locking wouldn't
4641                  * have been enabled.
4642                  */
4643                 if (ip->i_d.di_mode & S_IXGRP)
4644                         ip->i_d.di_mode &= ~S_ISGID;
4645
4646                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
4647         }
4648         if (setprealloc)
4649                 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
4650         else if (clrprealloc)
4651                 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
4652
4653         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
4654         xfs_trans_set_sync(tp);
4655
4656         error = xfs_trans_commit(tp, 0, NULL);
4657
4658         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4659
4660         return error;
4661 }
4662
4663 vnodeops_t xfs_vnodeops = {
4664         BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS),
4665         .vop_open               = xfs_open,
4666         .vop_read               = xfs_read,
4667 #ifdef HAVE_SENDFILE
4668         .vop_sendfile           = xfs_sendfile,
4669 #endif
4670 #ifdef HAVE_SPLICE
4671         .vop_splice_read        = xfs_splice_read,
4672         .vop_splice_write       = xfs_splice_write,
4673 #endif
4674         .vop_write              = xfs_write,
4675         .vop_ioctl              = xfs_ioctl,
4676         .vop_getattr            = xfs_getattr,
4677         .vop_setattr            = xfs_setattr,
4678         .vop_access             = xfs_access,
4679         .vop_lookup             = xfs_lookup,
4680         .vop_create             = xfs_create,
4681         .vop_remove             = xfs_remove,
4682         .vop_link               = xfs_link,
4683         .vop_rename             = xfs_rename,
4684         .vop_mkdir              = xfs_mkdir,
4685         .vop_rmdir              = xfs_rmdir,
4686         .vop_readdir            = xfs_readdir,
4687         .vop_symlink            = xfs_symlink,
4688         .vop_readlink           = xfs_readlink,
4689         .vop_fsync              = xfs_fsync,
4690         .vop_inactive           = xfs_inactive,
4691         .vop_fid2               = xfs_fid2,
4692         .vop_rwlock             = xfs_rwlock,
4693         .vop_rwunlock           = xfs_rwunlock,
4694         .vop_bmap               = xfs_bmap,
4695         .vop_reclaim            = xfs_reclaim,
4696         .vop_attr_get           = xfs_attr_get,
4697         .vop_attr_set           = xfs_attr_set,
4698         .vop_attr_remove        = xfs_attr_remove,
4699         .vop_attr_list          = xfs_attr_list,
4700         .vop_link_removed       = (vop_link_removed_t)fs_noval,
4701         .vop_vnode_change       = (vop_vnode_change_t)fs_noval,
4702         .vop_tosspages          = fs_tosspages,
4703         .vop_flushinval_pages   = fs_flushinval_pages,
4704         .vop_flush_pages        = fs_flush_pages,
4705         .vop_release            = xfs_release,
4706         .vop_iflush             = xfs_inode_flush,
4707 };