This commit was manufactured by cvs2svn to create branch 'vserver'.
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
new file mode 100644 (file)
index 0000000..c45e963
--- /dev/null
@@ -0,0 +1,1028 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.  Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+/*
+ *  fs/xfs/linux-2.6/xfs_lrw.c (Linux Read Write stuff)
+ */
+
+#include "xfs.h"
+
+#include "xfs_fs.h"
+#include "xfs_inum.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir.h"
+#include "xfs_dir2.h"
+#include "xfs_alloc.h"
+#include "xfs_dmapi.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_attr_sf.h"
+#include "xfs_dir_sf.h"
+#include "xfs_dir2_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_bit.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_itable.h"
+#include "xfs_rw.h"
+#include "xfs_acl.h"
+#include "xfs_cap.h"
+#include "xfs_mac.h"
+#include "xfs_attr.h"
+#include "xfs_inode_item.h"
+#include "xfs_buf_item.h"
+#include "xfs_utils.h"
+#include "xfs_iomap.h"
+
+#include <linux/capability.h>
+
+
+#if defined(XFS_RW_TRACE)
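+/*
+ * Tracing helpers: ktrace_enter() records a fixed 16-slot entry, so
+ * 64-bit values are split into high/low 32-bit halves and any unused
+ * slots are padded out with NULLs.
+ */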
+void
+xfs_rw_enter_trace(
+       int                     tag,
+       xfs_iocore_t            *io,
+       const struct iovec      *iovp,
+       size_t                  segs,
+       loff_t                  offset,
+       int                     ioflags)
+{
+       xfs_inode_t     *ip = XFS_IO_INODE(io);
+
+       if (ip->i_rwtrace == NULL)
+               return;
+       ktrace_enter(ip->i_rwtrace,
+               (void *)(unsigned long)tag,
+               (void *)ip,
+               (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
+               (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
+               (void *)(__psint_t)iovp,
+               (void *)((unsigned long)segs),
+               (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
+               (void *)((unsigned long)(offset & 0xffffffff)),
+               (void *)((unsigned long)ioflags),
+               (void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)),
+               (void *)((unsigned long)(io->io_new_size & 0xffffffff)),
+               (void *)NULL,
+               (void *)NULL,
+               (void *)NULL,
+               (void *)NULL,
+               (void *)NULL);
+}
+
+void
+xfs_inval_cached_trace(
+       xfs_iocore_t    *io,
+       xfs_off_t       offset,
+       xfs_off_t       len,
+       xfs_off_t       first,
+       xfs_off_t       last)
+{
+       xfs_inode_t     *ip = XFS_IO_INODE(io);
+
+       if (ip->i_rwtrace == NULL)
+               return;
+       ktrace_enter(ip->i_rwtrace,
+               (void *)(__psint_t)XFS_INVAL_CACHED,
+               (void *)ip,
+               (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
+               (void *)((unsigned long)(offset & 0xffffffff)),
+               (void *)((unsigned long)((len >> 32) & 0xffffffff)),
+               (void *)((unsigned long)(len & 0xffffffff)),
+               (void *)((unsigned long)((first >> 32) & 0xffffffff)),
+               (void *)((unsigned long)(first & 0xffffffff)),
+               (void *)((unsigned long)((last >> 32) & 0xffffffff)),
+               (void *)((unsigned long)(last & 0xffffffff)),
+               (void *)NULL,
+               (void *)NULL,
+               (void *)NULL,
+               (void *)NULL,
+               (void *)NULL,
+               (void *)NULL);
+}
+#endif
+
+/*
+ *     xfs_iozero
+ *
+ *     xfs_iozero clears the specified range of the supplied buffer,
+ *     and marks all the affected blocks as valid and modified.  If
+ *     an affected block is not allocated, it will be allocated.  If
+ *     an affected block is not completely overwritten, and is not
+ *     valid before the operation, it will be read from disk before
+ *     being partially zeroed.
+ */
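+/*
+ * Illustrative example: with 4K pages, a call with pos == 4094 and
+ * count == 100 touches two pages; the loop below zeroes the last 2
+ * bytes of the first page and then the first 98 bytes of the next.
+ */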
+STATIC int
+xfs_iozero(
+       struct inode            *ip,    /* inode                        */
+       loff_t                  pos,    /* offset in file               */
+       size_t                  count,  /* size of data to zero         */
+       loff_t                  end_size)       /* max file size to set */
+{
+       unsigned                bytes;
+       struct page             *page;
+       struct address_space    *mapping;
+       char                    *kaddr;
+       int                     status;
+
+       mapping = ip->i_mapping;
+       do {
+               unsigned long index, offset;
+
+               offset = (pos & (PAGE_CACHE_SIZE - 1)); /* Within page */
+               index = pos >> PAGE_CACHE_SHIFT;
+               bytes = PAGE_CACHE_SIZE - offset;
+               if (bytes > count)
+                       bytes = count;
+
+               status = -ENOMEM;
+               page = grab_cache_page(mapping, index);
+               if (!page)
+                       break;
+
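+               /*
+                * ->prepare_write() brings the affected range of the
+                * page uptodate (reading it in from disk if the zero
+                * range only partially covers a block) before we
+                * memset() it below.
+                */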
+               kaddr = kmap(page);
+               status = mapping->a_ops->prepare_write(NULL, page, offset,
+                                                       offset + bytes);
+               if (status) {
+                       goto unlock;
+               }
+
+               memset((void *) (kaddr + offset), 0, bytes);
+               flush_dcache_page(page);
+               status = mapping->a_ops->commit_write(NULL, page, offset,
+                                                       offset + bytes);
+               if (!status) {
+                       pos += bytes;
+                       count -= bytes;
+                       if (pos > i_size_read(ip))
+                               i_size_write(ip, pos < end_size ? pos : end_size);
+               }
+
+unlock:
+               kunmap(page);
+               unlock_page(page);
+               page_cache_release(page);
+               if (status)
+                       break;
+       } while (count);
+
+       return (-status);
+}
+
+/*
+ * xfs_inval_cached_pages
+ * 
+ * This routine is responsible for keeping direct I/O and buffered I/O
+ * somewhat coherent.  From here we make sure that we're at least
+ * temporarily holding the inode I/O lock exclusively and then call
+ * the page cache to flush and invalidate any cached pages.  If there
+ * are no cached pages this routine will be very quick.
+ */
+void
+xfs_inval_cached_pages(
+       vnode_t         *vp,
+       xfs_iocore_t    *io,
+       xfs_off_t       offset,
+       int             write,
+       int             relock)
+{
+       xfs_mount_t     *mp;
+
+       if (!VN_CACHED(vp)) {
+               return;
+       }
+
+       mp = io->io_mount;
+
+       /*
+        * We need to get the I/O lock exclusively in order
+        * to safely invalidate pages and mappings.
+        */
+       if (relock) {
+               XFS_IUNLOCK(mp, io, XFS_IOLOCK_SHARED);
+               XFS_ILOCK(mp, io, XFS_IOLOCK_EXCL);
+       }
+
+       /* Writing beyond EOF creates a hole that must be zeroed */
+       if (write && (offset > XFS_SIZE(mp, io))) {
+               xfs_fsize_t     isize;
+
+               XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
+               isize = XFS_SIZE(mp, io);
+               if (offset > isize) {
+                       xfs_zero_eof(vp, io, offset, isize, offset);
+               }
+               XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
+       }
+
+       xfs_inval_cached_trace(io, offset, -1, ctooff(offtoct(offset)), -1);
+       VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(offset)), -1, FI_REMAPF_LOCKED);
+       if (relock) {
+               XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL);
+       }
+}
+
+ssize_t                        /* bytes read, or (-) error */
+xfs_read(
+       bhv_desc_t              *bdp,
+       struct kiocb            *iocb,
+       const struct iovec      *iovp,
+       unsigned int            segs,
+       loff_t                  *offset,
+       int                     ioflags,
+       cred_t                  *credp)
+{
+       struct file             *file = iocb->ki_filp;
+       size_t                  size = 0;
+       ssize_t                 ret;
+       xfs_fsize_t             n;
+       xfs_inode_t             *ip;
+       xfs_mount_t             *mp;
+       vnode_t                 *vp;
+       unsigned long           seg;
+
+       ip = XFS_BHVTOI(bdp);
+       vp = BHV_TO_VNODE(bdp);
+       mp = ip->i_mount;
+
+       XFS_STATS_INC(xs_read_calls);
+
+       /* START copy & waste from filemap.c */
+       for (seg = 0; seg < segs; seg++) {
+               const struct iovec *iv = &iovp[seg];
+
+               /*
+                * If any segment has a negative length, or the cumulative
+                * length ever wraps negative then return -EINVAL.
+                */
+               size += iv->iov_len;
+               if (unlikely((ssize_t)(size|iv->iov_len) < 0))
+                       return -XFS_ERROR(EINVAL);
+       }
+       /* END copy & waste from filemap.c */
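+       /*
+        * Note: the (size|iov_len) test above catches both a single
+        * negative iov_len and a cumulative length that has wrapped
+        * past SSIZE_MAX, since either case sets the sign bit when
+        * viewed as an ssize_t.
+        */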
+
+       if (ioflags & IO_ISDIRECT) {
+               xfs_buftarg_t   *target =
+                       (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
+                               mp->m_rtdev_targp : mp->m_ddev_targp;
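+               /*
+                * pbr_smask is the buftarg's sector-size mask (e.g.
+                * 511 for 512-byte sectors): direct I/O must be
+                * sector aligned in both offset and total length.
+                */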
+               if ((*offset & target->pbr_smask) ||
+                   (size & target->pbr_smask)) {
+                       if (*offset == ip->i_d.di_size) {
+                               return (0);
+                       }
+                       return -XFS_ERROR(EINVAL);
+               }
+       }
+
+       n = XFS_MAXIOFFSET(mp) - *offset;
+       if ((n <= 0) || (size == 0))
+               return 0;
+
+       if (n < size)
+               size = n;
+
+       if (XFS_FORCED_SHUTDOWN(mp)) {
+               return -EIO;
+       }
+
+       /*
+        * Note: we hold the I/O lock for the duration of the
+        * submission, but what happens if the I/O does not really
+        * happen here and is instead scheduled for later?
+        */
+       xfs_ilock(ip, XFS_IOLOCK_SHARED);
+
+       if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
+           !(ioflags & IO_INVIS)) {
+               vrwlock_t locktype = VRWLOCK_READ;
+
+               ret = XFS_SEND_DATA(mp, DM_EVENT_READ,
+                                       BHV_TO_VNODE(bdp), *offset, size,
+                                       FILP_DELAY_FLAG(file), &locktype);
+               if (ret) {
+                       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+                       return -ret;
+               }
+       }
+
+       xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore,
+                               iovp, segs, *offset, ioflags);
+       ret = __generic_file_aio_read(iocb, iovp, segs, offset);
+       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+       if (ret > 0)
+               XFS_STATS_ADD(xs_read_bytes, ret);
+
+       if (likely(!(ioflags & IO_INVIS)))
+               xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
+
+       return ret;
+}
+
+ssize_t
+xfs_sendfile(
+       bhv_desc_t              *bdp,
+       struct file             *filp,
+       loff_t                  *offset,
+       int                     ioflags,
+       size_t                  count,
+       read_actor_t            actor,
+       void                    *target,
+       cred_t                  *credp)
+{
+       ssize_t                 ret;
+       xfs_fsize_t             n;
+       xfs_inode_t             *ip;
+       xfs_mount_t             *mp;
+       vnode_t                 *vp;
+
+       ip = XFS_BHVTOI(bdp);
+       vp = BHV_TO_VNODE(bdp);
+       mp = ip->i_mount;
+
+       XFS_STATS_INC(xs_read_calls);
+
+       n = XFS_MAXIOFFSET(mp) - *offset;
+       if ((n <= 0) || (count == 0))
+               return 0;
+
+       if (n < count)
+               count = n;
+
+       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+               return -EIO;
+
+       xfs_ilock(ip, XFS_IOLOCK_SHARED);
+
+       if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
+           (!(ioflags & IO_INVIS))) {
+               vrwlock_t locktype = VRWLOCK_READ;
+               int error;
+
+               error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp),
+                                     *offset, count,
+                                     FILP_DELAY_FLAG(filp), &locktype);
+               if (error) {
+                       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+                       return -error;
+               }
+       }
+       xfs_rw_enter_trace(XFS_SENDFILE_ENTER, &ip->i_iocore,
+                               target, count, *offset, ioflags);
+       ret = generic_file_sendfile(filp, offset, count, actor, target);
+       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+       if (ret > 0)
+               XFS_STATS_ADD(xs_read_bytes, ret);
+       if (likely(!(ioflags & IO_INVIS)))
+               xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
+       return ret;
+}
+
+/*
+ * This routine is called to handle zeroing any space in the last
+ * block of the file that is beyond the EOF.  We do this since the
+ * size is being increased without writing anything to that block
+ * and we don't want anyone to read the garbage on the disk.
+ */
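+/*
+ * Illustrative example: with 4K filesystem blocks and isize == 6000,
+ * XFS_B_FSB_OFFSET() yields 1904, so the remaining 4096 - 1904 = 2192
+ * bytes of the last block are zeroed via xfs_iozero().
+ */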
+STATIC int                             /* error (positive) */
+xfs_zero_last_block(
+       struct inode    *ip,
+       xfs_iocore_t    *io,
+       xfs_off_t       offset,
+       xfs_fsize_t     isize,
+       xfs_fsize_t     end_size)
+{
+       xfs_fileoff_t   last_fsb;
+       xfs_mount_t     *mp;
+       int             nimaps;
+       int             zero_offset;
+       int             zero_len;
+       int             isize_fsb_offset;
+       int             error = 0;
+       xfs_bmbt_irec_t imap;
+       loff_t          loff;
+       size_t          lsize;
+
+       ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0);
+       ASSERT(offset > isize);
+
+       mp = io->io_mount;
+
+       isize_fsb_offset = XFS_B_FSB_OFFSET(mp, isize);
+       if (isize_fsb_offset == 0) {
+               /*
+                * There are no extra bytes in the last block on disk to
+                * zero, so return.
+                */
+               return 0;
+       }
+
+       last_fsb = XFS_B_TO_FSBT(mp, isize);
+       nimaps = 1;
+       error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap,
+                         &nimaps, NULL);
+       if (error) {
+               return error;
+       }
+       ASSERT(nimaps > 0);
+       /*
+        * If the block underlying isize is just a hole, then there
+        * is nothing to zero.
+        */
+       if (imap.br_startblock == HOLESTARTBLOCK) {
+               return 0;
+       }
+       /*
+        * Zero the part of the last block beyond the EOF, and write it
+        * out sync.  We need to drop the ilock while we do this so we
+        * don't deadlock when the buffer cache calls back to us.
+        */
+       XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
+       loff = XFS_FSB_TO_B(mp, last_fsb);
+       lsize = XFS_FSB_TO_B(mp, 1);
+
+       zero_offset = isize_fsb_offset;
+       zero_len = mp->m_sb.sb_blocksize - isize_fsb_offset;
+
+       error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size);
+
+       XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
+       ASSERT(error >= 0);
+       return error;
+}
+
+/*
+ * Zero any on disk space between the current EOF and the new,
+ * larger EOF.  This handles the normal case of zeroing the remainder
+ * of the last block in the file and the unusual case of zeroing blocks
+ * out beyond the size of the file.  This second case only happens
+ * with fixed size extents and when the system crashes before the inode
+ * size was updated but after blocks were allocated.  If fill is set,
+ * then any holes in the range are filled and zeroed.  If not, the holes
+ * are left alone as holes.
+ */
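+/*
+ * Illustrative example: with 4K blocks, isize == 10000 and a write at
+ * offset == 30000, xfs_zero_last_block() zeroes the tail of block 2
+ * (bytes 10000..12287); the loop below then walks blocks 3
+ * (start_zero_fsb) through 7 (end_zero_fsb), zeroing only allocated,
+ * written extents and skipping holes and unwritten extents.
+ */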
+
+int                                    /* error (positive) */
+xfs_zero_eof(
+       vnode_t         *vp,
+       xfs_iocore_t    *io,
+       xfs_off_t       offset,         /* starting I/O offset */
+       xfs_fsize_t     isize,          /* current inode size */
+       xfs_fsize_t     end_size)       /* terminal inode size */
+{
+       struct inode    *ip = LINVFS_GET_IP(vp);
+       xfs_fileoff_t   start_zero_fsb;
+       xfs_fileoff_t   end_zero_fsb;
+       xfs_fileoff_t   prev_zero_fsb;
+       xfs_fileoff_t   zero_count_fsb;
+       xfs_fileoff_t   last_fsb;
+       xfs_extlen_t    buf_len_fsb;
+       xfs_extlen_t    prev_zero_count;
+       xfs_mount_t     *mp;
+       int             nimaps;
+       int             error = 0;
+       xfs_bmbt_irec_t imap;
+       loff_t          loff;
+       size_t          lsize;
+
+       ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
+       ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
+
+       mp = io->io_mount;
+
+       /*
+        * First handle zeroing the block on which isize resides.
+        * We only zero a part of that block so it is handled specially.
+        */
+       error = xfs_zero_last_block(ip, io, offset, isize, end_size);
+       if (error) {
+               ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
+               ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
+               return error;
+       }
+
+       /*
+        * Calculate the range between the new size and the old
+        * where blocks needing to be zeroed may exist.  To get the
+        * block where the last byte in the file currently resides,
+        * we need to subtract one from the size and truncate back
+        * to a block boundary.  We subtract 1 in case the size is
+        * exactly on a block boundary.
+        */
+       last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
+       start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
+       end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
+       ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
+       if (last_fsb == end_zero_fsb) {
+               /*
+                * The size was only incremented on its last block.
+                * We took care of that above, so just return.
+                */
+               return 0;
+       }
+
+       ASSERT(start_zero_fsb <= end_zero_fsb);
+       prev_zero_fsb = NULLFILEOFF;
+       prev_zero_count = 0;
+       while (start_zero_fsb <= end_zero_fsb) {
+               nimaps = 1;
+               zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
+               error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb,
+                                 0, NULL, 0, &imap, &nimaps, NULL);
+               if (error) {
+                       ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
+                       ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
+                       return error;
+               }
+               ASSERT(nimaps > 0);
+
+               if (imap.br_state == XFS_EXT_UNWRITTEN ||
+                   imap.br_startblock == HOLESTARTBLOCK) {
+                       /*
+                        * Holes and unwritten extents read back as
+                        * zeroes, so there is nothing on disk to zero
+                        * here; just skip past this mapping.
+                        */
+                       prev_zero_fsb = NULLFILEOFF;
+                       prev_zero_count = 0;
+                       start_zero_fsb = imap.br_startoff +
+                                        imap.br_blockcount;
+                       ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
+                       continue;
+               }
+
+               /*
+                * There are blocks in the range requested.
+                * Zero them a single write at a time.  We actually
+                * don't zero the entire range returned if it is
+                * too big and simply loop around to get the rest.
+                * That is not the most efficient thing to do, but it
+                * is simple and this path should not be exercised often.
+                */
+               buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount,
+                                             mp->m_writeio_blocks << 8);
+               /*
+                * Drop the inode lock while we're doing the I/O.
+                * We'll still have the iolock to protect us.
+                */
+               XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
+
+               loff = XFS_FSB_TO_B(mp, start_zero_fsb);
+               lsize = XFS_FSB_TO_B(mp, buf_len_fsb);
+
+               error = xfs_iozero(ip, loff, lsize, end_size);
+
+               if (error) {
+                       goto out_lock;
+               }
+
+               prev_zero_fsb = start_zero_fsb;
+               prev_zero_count = buf_len_fsb;
+               start_zero_fsb = imap.br_startoff + buf_len_fsb;
+               ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
+
+               XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
+       }
+
+       return 0;
+
+out_lock:
+       XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
+       ASSERT(error >= 0);
+       return error;
+}
+
+ssize_t                                /* bytes written, or (-) error */
+xfs_write(
+       bhv_desc_t              *bdp,
+       struct kiocb            *iocb,
+       const struct iovec      *iovp,
+       unsigned int            segs,
+       loff_t                  *offset,
+       int                     ioflags,
+       cred_t                  *credp)
+{
+       struct file             *file = iocb->ki_filp;
+       size_t                  size = 0;
+       xfs_inode_t             *xip;
+       xfs_mount_t             *mp;
+       ssize_t                 ret;
+       int                     error = 0;
+       xfs_fsize_t             isize, new_size;
+       xfs_fsize_t             n, limit;
+       xfs_iocore_t            *io;
+       vnode_t                 *vp;
+       unsigned long           seg;
+       int                     iolock;
+       int                     eventsent = 0;
+       vrwlock_t               locktype;
+
+       XFS_STATS_INC(xs_write_calls);
+
+       vp = BHV_TO_VNODE(bdp);
+       xip = XFS_BHVTOI(bdp);
+
+       /* START copy & waste from filemap.c */
+       for (seg = 0; seg < segs; seg++) {
+               const struct iovec *iv = &iovp[seg];
+
+               /*
+                * If any segment has a negative length, or the cumulative
+                * length ever wraps negative then return -EINVAL.
+                */
+               size += iv->iov_len;
+               if (unlikely((ssize_t)(size|iv->iov_len) < 0))
+                       return -XFS_ERROR(EINVAL);
+       }
+       /* END copy & waste from filemap.c */
+
+       if (size == 0)
+               return 0;
+
+       io = &xip->i_iocore;
+       mp = io->io_mount;
+
+       if (XFS_FORCED_SHUTDOWN(mp)) {
+               return -EIO;
+       }
+
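+       /*
+        * Direct writes only need the iolock shared (they bypass the
+        * page cache), while buffered writes must take it exclusive.
+        */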
+       if (ioflags & IO_ISDIRECT) {
+               xfs_buftarg_t   *target =
+                       (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
+                               mp->m_rtdev_targp : mp->m_ddev_targp;
+
+               if ((*offset & target->pbr_smask) ||
+                   (size & target->pbr_smask)) {
+                       return -XFS_ERROR(EINVAL);
+               }
+               iolock = XFS_IOLOCK_SHARED;
+               locktype = VRWLOCK_WRITE_DIRECT;
+       } else {
+               iolock = XFS_IOLOCK_EXCL;
+               locktype = VRWLOCK_WRITE;
+       }
+
+       xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
+
+       isize = xip->i_d.di_size;
+       limit = XFS_MAXIOFFSET(mp);
+
+       if (file->f_flags & O_APPEND)
+               *offset = isize;
+
+start:
+       n = limit - *offset;
+       if (n <= 0) {
+               xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
+               return -EFBIG;
+       }
+
+       if (n < size)
+               size = n;
+
+       new_size = *offset + size;
+       if (new_size > isize) {
+               io->io_new_size = new_size;
+       }
+
+       if (DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
+           !(ioflags & IO_INVIS) && !eventsent) {
+               loff_t          savedsize = *offset;
+               int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
+
+               xfs_iunlock(xip, XFS_ILOCK_EXCL);
+               error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
+                                     *offset, size,
+                                     dmflags, &locktype);
+               if (error) {
+                       xfs_iunlock(xip, iolock);
+                       return -error;
+               }
+               xfs_ilock(xip, XFS_ILOCK_EXCL);
+               eventsent = 1;
+
+               /*
+                * The iolock was dropped and reacquired in XFS_SEND_DATA
+                * so we have to recheck the size when appending.
+                * We will only "goto start;" once, since having sent the
+                * event prevents another call to XFS_SEND_DATA, which is
+                * what allows the size to change in the first place.
+                */
+               if ((file->f_flags & O_APPEND) &&
+                   savedsize != xip->i_d.di_size) {
+                       *offset = isize = xip->i_d.di_size;
+                       goto start;
+               }
+       }
+
+       /*
+        * On Linux, generic_file_write updates the times even if
+        * no data is copied in so long as the write had a size.
+        *
+        * We must update the XFS inode timestamps here, since
+        * revalidate copies the XFS values back over the Linux
+        * inode's times.
+        */
+       if (size && !(ioflags & IO_INVIS))
+               xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+       /*
+        * If the offset is beyond the size of the file, we have a couple
+        * of things to do. First, if there is already space allocated
+        * we need to either create holes or zero the disk or ...
+        *
+        * If there is a page where the previous size lands, we need
+        * to zero it out up to the new size.
+        */
+
+       if (!(ioflags & IO_ISDIRECT) && (*offset > isize && isize)) {
+               error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset,
+                       isize, *offset + size);
+               if (error) {
+                       xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
+                       return -error;
+               }
+       }
+       xfs_iunlock(xip, XFS_ILOCK_EXCL);
+
+       /*
+        * If we're writing the file then make sure to clear the
+        * setuid and setgid bits if the process is not being run
+        * by root.  This keeps people from modifying setuid and
+        * setgid binaries.
+        */
+
+       if (((xip->i_d.di_mode & S_ISUID) ||
+           ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
+               (S_ISGID | S_IXGRP))) &&
+            !capable(CAP_FSETID)) {
+               error = xfs_write_clear_setuid(xip);
+               if (error) {
+                       xfs_iunlock(xip, iolock);
+                       return -error;
+               }
+       }
+
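+       /*
+        * If the write runs out of space, a DMAPI-enabled filesystem
+        * is sent a DM_EVENT_NOSPACE event (which may free up space)
+        * and the write is retried from here; see the -ENOSPC
+        * handling below.
+        */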
+retry:
+       if (ioflags & IO_ISDIRECT) {
+               xfs_inval_cached_pages(vp, io, *offset, 1, 1);
+               xfs_rw_enter_trace(XFS_DIOWR_ENTER,
+                               io, iovp, segs, *offset, ioflags);
+       } else {
+               xfs_rw_enter_trace(XFS_WRITE_ENTER,
+                               io, iovp, segs, *offset, ioflags);
+       }
+       ret = generic_file_aio_write_nolock(iocb, iovp, segs, offset);
+
+       if ((ret == -ENOSPC) &&
+           DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
+           !(ioflags & IO_INVIS)) {
+
+               xfs_rwunlock(bdp, locktype);
+               error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
+                               DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
+                               0, 0, 0); /* Delay flag intentionally unused */
+               if (error)
+                       return -error;
+               xfs_rwlock(bdp, locktype);
+               *offset = xip->i_d.di_size;
+               goto retry;
+       }
+
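+       /*
+        * If the write extended the file, update the in-core and
+        * on-disk inode sizes; the size is rechecked under the ilock
+        * because a racing writer may have extended it first.
+        */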
+       if (*offset > xip->i_d.di_size) {
+               xfs_ilock(xip, XFS_ILOCK_EXCL);
+               if (*offset > xip->i_d.di_size) {
+                       struct inode    *inode = LINVFS_GET_IP(vp);
+
+                       xip->i_d.di_size = *offset;
+                       i_size_write(inode, *offset);
+                       xip->i_update_core = 1;
+                       xip->i_update_size = 1;
+               }
+               xfs_iunlock(xip, XFS_ILOCK_EXCL);
+       }
+
+       if (ret <= 0) {
+               xfs_rwunlock(bdp, locktype);
+               return ret;
+       }
+
+       XFS_STATS_ADD(xs_write_bytes, ret);
+
+       /* Handle various SYNC-type writes */
+       if ((file->f_flags & O_SYNC) || IS_SYNC(file->f_dentry->d_inode)) {
+
+               /*
+                * If we're treating this as O_DSYNC and we have not updated the
+                * size, force the log.
+                */
+
+               if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
+                   !(xip->i_update_size)) {
+                       /*
+                        * If an allocation transaction occurred
+                        * without extending the size, then we have to force
+                        * the log up to the proper point to ensure that the
+                        * allocation is permanent.  We can't count on
+                        * the fact that buffered writes lock out direct I/O
+                        * writes - the direct I/O write could have extended
+                        * the size nontransactionally, then finished before
+                        * we started.  xfs_write_file will think that the file
+                        * didn't grow but the update isn't safe unless the
+                        * size change is logged.
+                        *
+                        * Force the log if we've committed a transaction
+                        * against the inode or if someone else has and
+                        * the commit record hasn't gone to disk (e.g.
+                        * the inode is pinned).  This guarantees that
+                        * all changes affecting the inode are permanent
+                        * when we return.
+                        */
+
+                       xfs_inode_log_item_t *iip;
+                       xfs_lsn_t lsn;
+
+                       iip = xip->i_itemp;
+                       if (iip && iip->ili_last_lsn) {
+                               lsn = iip->ili_last_lsn;
+                               xfs_log_force(mp, lsn,
+                                               XFS_LOG_FORCE | XFS_LOG_SYNC);
+                       } else if (xfs_ipincount(xip) > 0) {
+                               xfs_log_force(mp, (xfs_lsn_t)0,
+                                               XFS_LOG_FORCE | XFS_LOG_SYNC);
+                       }
+
+               } else {
+                       xfs_trans_t     *tp;
+
+                       /*
+                        * O_SYNC or O_DSYNC _with_ a size update are handled
+                        * the same way.
+                        *
+                        * If the write was synchronous then we need to make
+                        * sure that the inode modification time is permanent.
+                        * We'll have updated the timestamp above, so here
+                        * we use a synchronous transaction to log the inode.
+                        * It's not fast, but it's necessary.
+                        *
+                        * If this a dsync write and the size got changed
+                        * non-transactionally, then we need to ensure that
+                        * the size change gets logged in a synchronous
+                        * transaction.
+                        */
+
+                       tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
+                       if ((error = xfs_trans_reserve(tp, 0,
+                                                     XFS_SWRITE_LOG_RES(mp),
+                                                     0, 0, 0))) {
+                               /* Transaction reserve failed */
+                               xfs_trans_cancel(tp, 0);
+                       } else {
+                               /* Transaction reserve successful */
+                               xfs_ilock(xip, XFS_ILOCK_EXCL);
+                               xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
+                               xfs_trans_ihold(tp, xip);
+                               xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
+                               xfs_trans_set_sync(tp);
+                               error = xfs_trans_commit(tp, 0, NULL);
+                               xfs_iunlock(xip, XFS_ILOCK_EXCL);
+                       }
+               }
+       } /* O_SYNC or IS_SYNC */
+
+       xfs_rwunlock(bdp, locktype);
+       return ret;
+}
+
+/*
+ * All xfs metadata buffers except log state machine buffers
+ * get this attached as their b_bdstrat callback function.
+ * This is so that we can catch a buffer that was unpinned
+ * prematurely as part of a forced shutdown of the filesystem.
+ */
+int
+xfs_bdstrat_cb(struct xfs_buf *bp)
+{
+       xfs_mount_t     *mp;
+
+       mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
+       if (!XFS_FORCED_SHUTDOWN(mp)) {
+               pagebuf_iorequest(bp);
+               return 0;
+       } else {
+               xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
+               /*
+                * Metadata write that didn't get logged but
+                * written delayed anyway. These aren't associated
+                * with a transaction, and can be ignored.
+                */
+               if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
+                   (XFS_BUF_ISREAD(bp)) == 0)
+                       return (xfs_bioerror_relse(bp));
+               else
+                       return (xfs_bioerror(bp));
+       }
+}
+
+
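+/*
+ * Map a byte range of a regular file to disk extents: a thin wrapper
+ * around xfs_iomap() that asserts the inode is a regular file and
+ * that its realtime flag agrees with the iocore flags.
+ */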
+int
+xfs_bmap(bhv_desc_t    *bdp,
+       xfs_off_t       offset,
+       ssize_t         count,
+       int             flags,
+       xfs_iomap_t     *iomapp,
+       int             *niomaps)
+{
+       xfs_inode_t     *ip = XFS_BHVTOI(bdp);
+       xfs_iocore_t    *io = &ip->i_iocore;
+
+       ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
+       ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
+              ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0));
+
+       return xfs_iomap(io, offset, count, flags, iomapp, niomaps);
+}
+
+/*
+ * Wrapper around bdstrat so that we can stop data
+ * from going to disk in case we are shutting down the filesystem.
+ * Typically user data goes through this path; one of the exceptions
+ * is the superblock.
+ */
+int
+xfsbdstrat(
+       struct xfs_mount        *mp,
+       struct xfs_buf          *bp)
+{
+       ASSERT(mp);
+       if (!XFS_FORCED_SHUTDOWN(mp)) {
+               /* Grio redirection would go here
+                * if (XFS_BUF_IS_GRIO(bp)) {
+                */
+
+               pagebuf_iorequest(bp);
+               return 0;
+       }
+
+       xfs_buftrace("XFSBDSTRAT IOERROR", bp);
+       return (xfs_bioerror_relse(bp));
+}
+
+/*
+ * If the underlying (data/log/rt) device is readonly, there are some
+ * operations that cannot proceed.
+ */
+int
+xfs_dev_is_read_only(
+       xfs_mount_t             *mp,
+       char                    *message)
+{
+       if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
+           xfs_readonly_buftarg(mp->m_logdev_targp) ||
+           (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
+               cmn_err(CE_NOTE,
+                       "XFS: %s required on read-only device.", message);
+               cmn_err(CE_NOTE,
+                       "XFS: write access unavailable, cannot proceed.");
+               return EROFS;
+       }
+       return 0;
+}