fs/xfs/linux/xfs_lrw.c

   1 /*
   2  * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or modify it
   5  * under the terms of version 2 of the GNU General Public License as
   6  * published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it would be useful, but
   9  * WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11  *
  12  * Further, this software is distributed without any warranty that it is
  13  * free of the rightful claim of any third person regarding infringement
  14  * or the like.  Any license provided herein, whether implied or
  15  * otherwise, applies only to this software file.  Patent licenses, if
  16  * any, provided herein do not apply to combinations of this program with
  17  * other software, or any other product whatsoever.
  18  *
  19  * You should have received a copy of the GNU General Public License along
  20  * with this program; if not, write the Free Software Foundation, Inc., 59
  21  * Temple Place - Suite 330, Boston MA 02111-1307, USA.
  22  *
  23  * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
  24  * Mountain View, CA  94043, or:
  25  *
  26  * http://www.sgi.com
  27  *
  28  * For further information regarding this notice, see:
  29  *
  30  * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
  31  */
  32 /*
  33  *  fs/xfs/linux/xfs_lrw.c (Linux Read Write stuff)
  34  *
  35  */
  36
  37 #include "xfs.h"
  38
  39 #include "xfs_fs.h"
  40 #include "xfs_inum.h"
  41 #include "xfs_log.h"
  42 #include "xfs_trans.h"
  43 #include "xfs_sb.h"
  44 #include "xfs_ag.h"
  45 #include "xfs_dir.h"
  46 #include "xfs_dir2.h"
  47 #include "xfs_alloc.h"
  48 #include "xfs_dmapi.h"
  49 #include "xfs_quota.h"
  50 #include "xfs_mount.h"
  51 #include "xfs_alloc_btree.h"
  52 #include "xfs_bmap_btree.h"
  53 #include "xfs_ialloc_btree.h"
  54 #include "xfs_btree.h"
  55 #include "xfs_ialloc.h"
  56 #include "xfs_attr_sf.h"
  57 #include "xfs_dir_sf.h"
  58 #include "xfs_dir2_sf.h"
  59 #include "xfs_dinode.h"
  60 #include "xfs_inode.h"
  61 #include "xfs_bmap.h"
  62 #include "xfs_bit.h"
  63 #include "xfs_rtalloc.h"
  64 #include "xfs_error.h"
  65 #include "xfs_itable.h"
  66 #include "xfs_rw.h"
  67 #include "xfs_acl.h"
  68 #include "xfs_cap.h"
  69 #include "xfs_mac.h"
  70 #include "xfs_attr.h"
  71 #include "xfs_inode_item.h"
  72 #include "xfs_buf_item.h"
  73 #include "xfs_utils.h"
  74 #include "xfs_iomap.h"
  75
  76 #include <linux/capability.h>
  77
  78
  79 #if defined(XFS_RW_TRACE)
  80 void
  81 xfs_rw_enter_trace(
  82         int                     tag,
  83         xfs_iocore_t            *io,
  84         const struct iovec      *iovp,
  85         size_t                  segs,
  86         loff_t                  offset,
  87         int                     ioflags)
  88 {
  89         xfs_inode_t     *ip = XFS_IO_INODE(io);
  90
  91         if (ip->i_rwtrace == NULL)
  92                 return;
  93         ktrace_enter(ip->i_rwtrace,
  94                 (void *)(unsigned long)tag,
  95                 (void *)ip,
  96                 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
  97                 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
  98                 (void *)(__psint_t)iovp,
  99                 (void *)((unsigned long)segs),
 100                 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
 101                 (void *)((unsigned long)(offset & 0xffffffff)),
 102                 (void *)((unsigned long)ioflags),
 103                 (void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)),
 104                 (void *)((unsigned long)(io->io_new_size & 0xffffffff)),
 105                 (void *)NULL,
 106                 (void *)NULL,
 107                 (void *)NULL,
 108                 (void *)NULL,
 109                 (void *)NULL);
 110 }
 111
 112 void
 113 xfs_inval_cached_trace(
 114         xfs_iocore_t    *io,
 115         xfs_off_t       offset,
 116         xfs_off_t       len,
 117         xfs_off_t       first,
 118         xfs_off_t       last)
 119 {
 120         xfs_inode_t     *ip = XFS_IO_INODE(io);
 121
 122         if (ip->i_rwtrace == NULL)
 123                 return;
 124         ktrace_enter(ip->i_rwtrace,
 125                 (void *)(__psint_t)XFS_INVAL_CACHED,
 126                 (void *)ip,
 127                 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
 128                 (void *)((unsigned long)(offset & 0xffffffff)),
 129                 (void *)((unsigned long)((len >> 32) & 0xffffffff)),
 130                 (void *)((unsigned long)(len & 0xffffffff)),
 131                 (void *)((unsigned long)((first >> 32) & 0xffffffff)),
 132                 (void *)((unsigned long)(first & 0xffffffff)),
 133                 (void *)((unsigned long)((last >> 32) & 0xffffffff)),
 134                 (void *)((unsigned long)(last & 0xffffffff)),
 135                 (void *)NULL,
 136                 (void *)NULL,
 137                 (void *)NULL,
 138                 (void *)NULL,
 139                 (void *)NULL,
 140                 (void *)NULL);
 141 }
 142 #endif
 143
 144 /*
 145  *      xfs_iozero
 146  *
 147  *      xfs_iozero clears the specified range of buffer supplied,
 148  *      and marks all the affected blocks as valid and modified.  If
 149  *      an affected block is not allocated, it will be allocated.  If
 150  *      an affected block is not completely overwritten, and is not
 151  *      valid before the operation, it will be read from disk before
 152  *      being partially zeroed.
 153  */
 154 STATIC int
 155 xfs_iozero(
 156         struct inode            *ip,    /* inode                        */
 157         loff_t                  pos,    /* offset in file               */
 158         size_t                  count,  /* size of data to zero         */
 159         loff_t                  end_size)       /* max file size to set */
 160 {
 161         unsigned                bytes;
 162         struct page             *page;
 163         struct address_space    *mapping;
 164         char                    *kaddr;
 165         int                     status;
 166
 167         mapping = ip->i_mapping;
 168         do {
 169                 unsigned long index, offset;
 170
 171                 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
 172                 index = pos >> PAGE_CACHE_SHIFT;
 173                 bytes = PAGE_CACHE_SIZE - offset;
 174                 if (bytes > count)
 175                         bytes = count;
 176
 177                 status = -ENOMEM;
 178                 page = grab_cache_page(mapping, index);
 179                 if (!page)
 180                         break;
 181
 182                 kaddr = kmap(page);
 183                 status = mapping->a_ops->prepare_write(NULL, page, offset,
 184                                                         offset + bytes);
 185                 if (status) {
 186                         goto unlock;
 187                 }
 188
 189                 memset((void *) (kaddr + offset), 0, bytes);
 190                 flush_dcache_page(page);
 191                 status = mapping->a_ops->commit_write(NULL, page, offset,
 192                                                         offset + bytes);
 193                 if (!status) {
 194                         pos += bytes;
 195                         count -= bytes;
 196                         if (pos > i_size_read(ip))
 197                                 i_size_write(ip, pos < end_size ? pos : end_size);
 198                 }
 199
 200 unlock:
 201                 kunmap(page);
 202                 unlock_page(page);
 203                 page_cache_release(page);
 204                 if (status)
 205                         break;
 206         } while (count);
 207
 208         return (-status);
 209 }
 210
 211 /*
 212  * xfs_inval_cached_pages
 213  *
 214  * This routine is responsible for keeping direct I/O and buffered I/O
 215  * somewhat coherent.  From here we make sure that we're at least
 216  * temporarily holding the inode I/O lock exclusively and then call
 217  * the page cache to flush and invalidate any cached pages.  If there
 218  * are no cached pages this routine will be very quick.
 219  */
 220 void
 221 xfs_inval_cached_pages(
 222         vnode_t         *vp,
 223         xfs_iocore_t    *io,
 224         xfs_off_t       offset,
 225         int             write,
 226         int             relock)
 227 {
 228         xfs_mount_t     *mp;
 229
 230         if (!VN_CACHED(vp)) {
 231                 return;
 232         }
 233
 234         mp = io->io_mount;
 235
 236         /*
 237          * We need to get the I/O lock exclusively in order
 238          * to safely invalidate pages and mappings.
 239          */
 240         if (relock) {
 241                 XFS_IUNLOCK(mp, io, XFS_IOLOCK_SHARED);
 242                 XFS_ILOCK(mp, io, XFS_IOLOCK_EXCL);
 243         }
 244
 245         /* Writing beyond EOF creates a hole that must be zeroed */
 246         if (write && (offset > XFS_SIZE(mp, io))) {
 247                 xfs_fsize_t     isize;
 248
 249                 XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
 250                 isize = XFS_SIZE(mp, io);
 251                 if (offset > isize) {
 252                         xfs_zero_eof(vp, io, offset, isize, offset);
 253                 }
 254                 XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
 255         }
 256
 257         xfs_inval_cached_trace(io, offset, -1, ctooff(offtoct(offset)), -1);
 258         VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(offset)), -1, FI_REMAPF_LOCKED);
 259         if (relock) {
 260                 XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL);
 261         }
 262 }
 263
 264 ssize_t                 /* bytes read, or (-)  error */
 265 xfs_read(
 266         bhv_desc_t              *bdp,
 267         struct kiocb            *iocb,
 268         const struct iovec      *iovp,
 269         unsigned int            segs,
 270         loff_t                  *offset,
 271         int                     ioflags,
 272         cred_t                  *credp)
 273 {
 274         struct file             *file = iocb->ki_filp;
 275         size_t                  size = 0;
 276         ssize_t                 ret;
 277         xfs_fsize_t             n;
 278         xfs_inode_t             *ip;
 279         xfs_mount_t             *mp;
 280         vnode_t                 *vp;
 281         unsigned long           seg;
 282
 283         ip = XFS_BHVTOI(bdp);
 284         vp = BHV_TO_VNODE(bdp);
 285         mp = ip->i_mount;
 286
 287         XFS_STATS_INC(xs_read_calls);
 288
 289         /* START copy & waste from filemap.c */
 290         for (seg = 0; seg < segs; seg++) {
 291                 const struct iovec *iv = &iovp[seg];
 292
 293                 /*
 294                  * If any segment has a negative length, or the cumulative
 295                  * length ever wraps negative then return -EINVAL.
 296                  */
 297                 size += iv->iov_len;
 298                 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
 299                         return XFS_ERROR(-EINVAL);
 300         }
 301         /* END copy & waste from filemap.c */
 302
 303         if (ioflags & IO_ISDIRECT) {
 304                 xfs_buftarg_t   *target =
 305                         (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
 306                                 mp->m_rtdev_targp : mp->m_ddev_targp;
 307                 if ((*offset & target->pbr_smask) ||
 308                     (size & target->pbr_smask)) {
 309                         if (*offset == ip->i_d.di_size) {
 310                                 return (0);
 311                         }
 312                         return -XFS_ERROR(EINVAL);
 313                 }
 314         }
 315
 316         n = XFS_MAXIOFFSET(mp) - *offset;
 317         if ((n <= 0) || (size == 0))
 318                 return 0;
 319
 320         if (n < size)
 321                 size = n;
 322
 323         if (XFS_FORCED_SHUTDOWN(mp)) {
 324                 return -EIO;
 325         }
 326
 327         /* OK so we are holding the I/O lock for the duration
 328          * of the submission, then what happens if the I/O
 329          * does not really happen here, but is scheduled
 330          * later?
 331          */
 332         xfs_ilock(ip, XFS_IOLOCK_SHARED);
 333
 334         if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
 335             !(ioflags & IO_INVIS)) {
 336                 vrwlock_t locktype = VRWLOCK_READ;
 337
 338                 ret = XFS_SEND_DATA(mp, DM_EVENT_READ,
 339                                         BHV_TO_VNODE(bdp), *offset, size,
 340                                         FILP_DELAY_FLAG(file), &locktype);
 341                 if (ret) {
 342                         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 343                         return -ret;
 344                 }
 345         }
 346
 347         xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore,
 348                                 iovp, segs, *offset, ioflags);
 349         ret = __generic_file_aio_read(iocb, iovp, segs, offset);
 350         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 351
 352         if (ret > 0)
 353                 XFS_STATS_ADD(xs_read_bytes, ret);
 354
 355         if (likely(!(ioflags & IO_INVIS)))
 356                 xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
 357
 358         return ret;
 359 }
 360
 361 ssize_t
 362 xfs_sendfile(
 363         bhv_desc_t              *bdp,
 364         struct file             *filp,
 365         loff_t                  *offset,
 366         int                     ioflags,
 367         size_t                  count,
 368         read_actor_t            actor,
 369         void                    *target,
 370         cred_t                  *credp)
 371 {
 372         ssize_t                 ret;
 373         xfs_fsize_t             n;
 374         xfs_inode_t             *ip;
 375         xfs_mount_t             *mp;
 376         vnode_t                 *vp;
 377
 378         ip = XFS_BHVTOI(bdp);
 379         vp = BHV_TO_VNODE(bdp);
 380         mp = ip->i_mount;
 381
 382         XFS_STATS_INC(xs_read_calls);
 383
 384         n = XFS_MAXIOFFSET(mp) - *offset;
 385         if ((n <= 0) || (count == 0))
 386                 return 0;
 387
 388         if (n < count)
 389                 count = n;
 390
 391         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 392                 return -EIO;
 393
 394         xfs_ilock(ip, XFS_IOLOCK_SHARED);
 395
 396         if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
 397             (!(ioflags & IO_INVIS))) {
 398                 vrwlock_t locktype = VRWLOCK_READ;
 399                 int error;
 400
 401                 error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), *offset, count,
 402                                       FILP_DELAY_FLAG(filp), &locktype);
 403                 if (error) {
 404                         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 405                         return -error;
 406                 }
 407         }
 408         xfs_rw_enter_trace(XFS_SENDFILE_ENTER, &ip->i_iocore,
 409                                 target, count, *offset, ioflags);
 410         ret = generic_file_sendfile(filp, offset, count, actor, target);
 411         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 412
 413         XFS_STATS_ADD(xs_read_bytes, ret);
 414         xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
 415         return ret;
 416 }
 417
 418 /*
 419  * This routine is called to handle zeroing any space in the last
 420  * block of the file that is beyond the EOF.  We do this since the
 421  * size is being increased without writing anything to that block
 422  * and we don't want anyone to read the garbage on the disk.
 423  */
 424 STATIC int                              /* error (positive) */
 425 xfs_zero_last_block(
 426         struct inode    *ip,
 427         xfs_iocore_t    *io,
 428         xfs_off_t       offset,
 429         xfs_fsize_t     isize,
 430         xfs_fsize_t     end_size)
 431 {
 432         xfs_fileoff_t   last_fsb;
 433         xfs_mount_t     *mp;
 434         int             nimaps;
 435         int             zero_offset;
 436         int             zero_len;
 437         int             isize_fsb_offset;
 438         int             error = 0;
 439         xfs_bmbt_irec_t imap;
 440         loff_t          loff;
 441         size_t          lsize;
 442
 443         ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0);
 444         ASSERT(offset > isize);
 445
 446         mp = io->io_mount;
 447
 448         isize_fsb_offset = XFS_B_FSB_OFFSET(mp, isize);
 449         if (isize_fsb_offset == 0) {
 450                 /*
 451                  * There are no extra bytes in the last block on disk to
 452                  * zero, so return.
 453                  */
 454                 return 0;
 455         }
 456
 457         last_fsb = XFS_B_TO_FSBT(mp, isize);
 458         nimaps = 1;
 459         error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap,
 460                           &nimaps, NULL);
 461         if (error) {
 462                 return error;
 463         }
 464         ASSERT(nimaps > 0);
 465         /*
 466          * If the block underlying isize is just a hole, then there
 467          * is nothing to zero.
 468          */
 469         if (imap.br_startblock == HOLESTARTBLOCK) {
 470                 return 0;
 471         }
 472         /*
 473          * Zero the part of the last block beyond the EOF, and write it
 474          * out sync.  We need to drop the ilock while we do this so we
 475          * don't deadlock when the buffer cache calls back to us.
 476          */
 477         XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD);
 478         loff = XFS_FSB_TO_B(mp, last_fsb);
 479         lsize = XFS_FSB_TO_B(mp, 1);
 480
 481         zero_offset = isize_fsb_offset;
 482         zero_len = mp->m_sb.sb_blocksize - isize_fsb_offset;
 483
 484         error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size);
 485
 486         XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
 487         ASSERT(error >= 0);
 488         return error;
 489 }
 490
 491 /*
 492  * Zero any on disk space between the current EOF and the new,
 493  * larger EOF.  This handles the normal case of zeroing the remainder
 494  * of the last block in the file and the unusual case of zeroing blocks
 495  * out beyond the size of the file.  This second case only happens
 496  * with fixed size extents and when the system crashes before the inode
 497  * size was updated but after blocks were allocated.  If fill is set,
 498  * then any holes in the range are filled and zeroed.  If not, the holes
 499  * are left alone as holes.
 500  */
 501
 502 int                                     /* error (positive) */
 503 xfs_zero_eof(
 504         vnode_t         *vp,
 505         xfs_iocore_t    *io,
 506         xfs_off_t       offset,         /* starting I/O offset */
 507         xfs_fsize_t     isize,          /* current inode size */
 508         xfs_fsize_t     end_size)       /* terminal inode size */
 509 {
 510         struct inode    *ip = LINVFS_GET_IP(vp);
 511         xfs_fileoff_t   start_zero_fsb;
 512         xfs_fileoff_t   end_zero_fsb;
 513         xfs_fileoff_t   prev_zero_fsb;
 514         xfs_fileoff_t   zero_count_fsb;
 515         xfs_fileoff_t   last_fsb;
 516         xfs_extlen_t    buf_len_fsb;
 517         xfs_extlen_t    prev_zero_count;
 518         xfs_mount_t     *mp;
 519         int             nimaps;
 520         int             error = 0;
 521         xfs_bmbt_irec_t imap;
 522         loff_t          loff;
 523         size_t          lsize;
 524
 525         ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
 526         ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
 527
 528         mp = io->io_mount;
 529
 530         /*
 531          * First handle zeroing the block on which isize resides.
 532          * We only zero a part of that block so it is handled specially.
 533          */
 534         error = xfs_zero_last_block(ip, io, offset, isize, end_size);
 535         if (error) {
 536                 ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
 537                 ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
 538                 return error;
 539         }
 540
 541         /*
 542          * Calculate the range between the new size and the old
 543          * where blocks needing to be zeroed may exist.  To get the
 544          * block where the last byte in the file currently resides,
 545          * we need to subtract one from the size and truncate back
 546          * to a block boundary.  We subtract 1 in case the size is
 547          * exactly on a block boundary.
 548          */
 549         last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
 550         start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
 551         end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
 552         ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
 553         if (last_fsb == end_zero_fsb) {
 554                 /*
 555                  * The size was only incremented on its last block.
 556                  * We took care of that above, so just return.
 557                  */
 558                 return 0;
 559         }
 560
 561         ASSERT(start_zero_fsb <= end_zero_fsb);
 562         prev_zero_fsb = NULLFILEOFF;
 563         prev_zero_count = 0;
 564         while (start_zero_fsb <= end_zero_fsb) {
 565                 nimaps = 1;
 566                 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
 567                 error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb,
 568                                   0, NULL, 0, &imap, &nimaps, NULL);
 569                 if (error) {
 570                         ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
 571                         ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
 572                         return error;
 573                 }
 574                 ASSERT(nimaps > 0);
 575
 576                 if (imap.br_state == XFS_EXT_UNWRITTEN ||
 577                     imap.br_startblock == HOLESTARTBLOCK) {
 578                         /*
 579                          * This loop handles initializing pages that were
 580                          * partially initialized by the code below this
 581                          * loop. It basically zeroes the part of the page
 582                          * that sits on a hole and sets the page as P_HOLE
 583                          * and calls remapf if it is a mapped file.
 584                          */
 585                         prev_zero_fsb = NULLFILEOFF;
 586                         prev_zero_count = 0;
 587                         start_zero_fsb = imap.br_startoff +
 588                                          imap.br_blockcount;
 589                         ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
 590                         continue;
 591                 }
 592
 593                 /*
 594                  * There are blocks in the range requested.
 595                  * Zero them a single write at a time.  We actually
 596                  * don't zero the entire range returned if it is
 597                  * too big and simply loop around to get the rest.
 598                  * That is not the most efficient thing to do, but it
 599                  * is simple and this path should not be exercised often.
 600                  */
 601                 buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount,
 602                                               mp->m_writeio_blocks << 8);
 603                 /*
 604                  * Drop the inode lock while we're doing the I/O.
 605                  * We'll still have the iolock to protect us.
 606                  */
 607                 XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
 608
 609                 loff = XFS_FSB_TO_B(mp, start_zero_fsb);
 610                 lsize = XFS_FSB_TO_B(mp, buf_len_fsb);
 611
 612                 error = xfs_iozero(ip, loff, lsize, end_size);
 613
 614                 if (error) {
 615                         goto out_lock;
 616                 }
 617
 618                 prev_zero_fsb = start_zero_fsb;
 619                 prev_zero_count = buf_len_fsb;
 620                 start_zero_fsb = imap.br_startoff + buf_len_fsb;
 621                 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
 622
 623                 XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
 624         }
 625
 626         return 0;
 627
 628 out_lock:
 629
 630         XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
 631         ASSERT(error >= 0);
 632         return error;
 633 }
 634
 635 ssize_t                         /* bytes written, or (-) error */
 636 xfs_write(
 637         bhv_desc_t              *bdp,
 638         struct kiocb            *iocb,
 639         const struct iovec      *iovp,
 640         unsigned int            segs,
 641         loff_t                  *offset,
 642         int                     ioflags,
 643         cred_t                  *credp)
 644 {
 645         struct file             *file = iocb->ki_filp;
 646         size_t                  size = 0;
 647         xfs_inode_t             *xip;
 648         xfs_mount_t             *mp;
 649         ssize_t                 ret;
 650         int                     error = 0;
 651         xfs_fsize_t             isize, new_size;
 652         xfs_fsize_t             n, limit;
 653         xfs_iocore_t            *io;
 654         vnode_t                 *vp;
 655         unsigned long           seg;
 656         int                     iolock;
 657         int                     eventsent = 0;
 658         vrwlock_t               locktype;
 659
 660         XFS_STATS_INC(xs_write_calls);
 661
 662         vp = BHV_TO_VNODE(bdp);
 663         xip = XFS_BHVTOI(bdp);
 664
 665         /* START copy & waste from filemap.c */
 666         for (seg = 0; seg < segs; seg++) {
 667                 const struct iovec *iv = &iovp[seg];
 668
 669                 /*
 670                  * If any segment has a negative length, or the cumulative
 671                  * length ever wraps negative then return -EINVAL.
 672                  */
 673                 size += iv->iov_len;
 674                 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
 675                         return XFS_ERROR(-EINVAL);
 676         }
 677         /* END copy & waste from filemap.c */
 678
 679         if (size == 0)
 680                 return 0;
 681
 682         io = &xip->i_iocore;
 683         mp = io->io_mount;
 684
 685         if (XFS_FORCED_SHUTDOWN(mp)) {
 686                 return -EIO;
 687         }
 688
 689         if (ioflags & IO_ISDIRECT) {
 690                 xfs_buftarg_t   *target =
 691                         (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
 692                                 mp->m_rtdev_targp : mp->m_ddev_targp;
 693
 694                 if ((*offset & target->pbr_smask) ||
 695                     (size & target->pbr_smask)) {
 696                         return XFS_ERROR(-EINVAL);
 697                 }
 698                 iolock = XFS_IOLOCK_SHARED;
 699                 locktype = VRWLOCK_WRITE_DIRECT;
 700         } else {
 701                 iolock = XFS_IOLOCK_EXCL;
 702                 locktype = VRWLOCK_WRITE;
 703         }
 704
 705         xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
 706
 707         isize = xip->i_d.di_size;
 708         limit = XFS_MAXIOFFSET(mp);
 709
 710         if (file->f_flags & O_APPEND)
 711                 *offset = isize;
 712
 713 start:
 714         n = limit - *offset;
 715         if (n <= 0) {
 716                 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
 717                 return -EFBIG;
 718         }
 719
 720         if (n < size)
 721                 size = n;
 722
 723         new_size = *offset + size;
 724         if (new_size > isize) {
 725                 io->io_new_size = new_size;
 726         }
 727
 728         if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
 729             !(ioflags & IO_INVIS) && !eventsent)) {
 730                 loff_t          savedsize = *offset;
 731                 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
 732
 733                 xfs_iunlock(xip, XFS_ILOCK_EXCL);
 734                 error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
 735                                       *offset, size,
 736                                       dmflags, &locktype);
 737                 if (error) {
 738                         xfs_iunlock(xip, iolock);
 739                         return -error;
 740                 }
 741                 xfs_ilock(xip, XFS_ILOCK_EXCL);
 742                 eventsent = 1;
 743
 744                 /*
 745                  * The iolock was dropped and reaquired in XFS_SEND_DATA
 746                  * so we have to recheck the size when appending.
 747                  * We will only "goto start;" once, since having sent the
 748                  * event prevents another call to XFS_SEND_DATA, which is
 749                  * what allows the size to change in the first place.
 750                  */
 751                 if ((file->f_flags & O_APPEND) &&
 752                     savedsize != xip->i_d.di_size) {
 753                         *offset = isize = xip->i_d.di_size;
 754                         goto start;
 755                 }
 756         }
 757
 758         /*
 759          * On Linux, generic_file_write updates the times even if
 760          * no data is copied in so long as the write had a size.
 761          *
 762          * We must update xfs' times since revalidate will overcopy xfs.
 763          */
 764         if (size && !(ioflags & IO_INVIS))
 765                 xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 766
 767         /*
 768          * If the offset is beyond the size of the file, we have a couple
 769          * of things to do. First, if there is already space allocated
 770          * we need to either create holes or zero the disk or ...
 771          *
 772          * If there is a page where the previous size lands, we need
 773          * to zero it out up to the new size.
 774          */
 775
 776         if (!(ioflags & IO_ISDIRECT) && (*offset > isize && isize)) {
 777                 error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset,
 778                         isize, *offset + size);
 779                 if (error) {
 780                         xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
 781                         return(-error);
 782                 }
 783         }
 784         xfs_iunlock(xip, XFS_ILOCK_EXCL);
 785
 786         /*
 787          * If we're writing the file then make sure to clear the
 788          * setuid and setgid bits if the process is not being run
 789          * by root.  This keeps people from modifying setuid and
 790          * setgid binaries.
 791          */
 792
 793         if (((xip->i_d.di_mode & S_ISUID) ||
 794             ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
 795                 (S_ISGID | S_IXGRP))) &&
 796              !capable(CAP_FSETID)) {
 797                 error = xfs_write_clear_setuid(xip);
 798                 if (error) {
 799                         xfs_iunlock(xip, iolock);
 800                         return -error;
 801                 }
 802         }
 803
 804 retry:
 805         if (ioflags & IO_ISDIRECT) {
 806                 xfs_inval_cached_pages(vp, io, *offset, 1, 1);
 807                 xfs_rw_enter_trace(XFS_DIOWR_ENTER,
 808                                 io, iovp, segs, *offset, ioflags);
 809         } else {
 810                 xfs_rw_enter_trace(XFS_WRITE_ENTER,
 811                                 io, iovp, segs, *offset, ioflags);
 812         }
 813         ret = generic_file_aio_write_nolock(iocb, iovp, segs, offset);
 814
 815         if ((ret == -ENOSPC) &&
 816             DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
 817             !(ioflags & IO_INVIS)) {
 818
 819                 xfs_rwunlock(bdp, locktype);
 820                 error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
 821                                 DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
 822                                 0, 0, 0); /* Delay flag intentionally  unused */
 823                 if (error)
 824                         return -error;
 825                 xfs_rwlock(bdp, locktype);
 826                 *offset = xip->i_d.di_size;
 827                 goto retry;
 828         }
 829
 830         if (*offset > xip->i_d.di_size) {
 831                 xfs_ilock(xip, XFS_ILOCK_EXCL);
 832                 if (*offset > xip->i_d.di_size) {
 833                         struct inode    *inode = LINVFS_GET_IP(vp);
 834
 835                         xip->i_d.di_size = *offset;
 836                         i_size_write(inode, *offset);
 837                         xip->i_update_core = 1;
 838                         xip->i_update_size = 1;
 839                 }
 840                 xfs_iunlock(xip, XFS_ILOCK_EXCL);
 841         }
 842
 843         if (ret <= 0) {
 844                 xfs_rwunlock(bdp, locktype);
 845                 return ret;
 846         }
 847
 848         XFS_STATS_ADD(xs_write_bytes, ret);
 849
 850         /* Handle various SYNC-type writes */
 851         if ((file->f_flags & O_SYNC) || IS_SYNC(file->f_dentry->d_inode)) {
 852
 853                 /*
 854                  * If we're treating this as O_DSYNC and we have not updated the
 855                  * size, force the log.
 856                  */
 857
 858                 if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC)
 859                         && !(xip->i_update_size)) {
 860                         /*
 861                          * If an allocation transaction occurred
 862                          * without extending the size, then we have to force
 863                          * the log up the proper point to ensure that the
 864                          * allocation is permanent.  We can't count on
 865                          * the fact that buffered writes lock out direct I/O
 866                          * writes - the direct I/O write could have extended
 867                          * the size nontransactionally, then finished before
 868                          * we started.  xfs_write_file will think that the file
 869                          * didn't grow but the update isn't safe unless the
 870                          * size change is logged.
 871                          *
 872                          * Force the log if we've committed a transaction
 873                          * against the inode or if someone else has and
 874                          * the commit record hasn't gone to disk (e.g.
 875                          * the inode is pinned).  This guarantees that
 876                          * all changes affecting the inode are permanent
 877                          * when we return.
 878                          */
 879
 880                         xfs_inode_log_item_t *iip;
 881                         xfs_lsn_t lsn;
 882
 883                         iip = xip->i_itemp;
 884                         if (iip && iip->ili_last_lsn) {
 885                                 lsn = iip->ili_last_lsn;
 886                                 xfs_log_force(mp, lsn,
 887                                                 XFS_LOG_FORCE | XFS_LOG_SYNC);
 888                         } else if (xfs_ipincount(xip) > 0) {
 889                                 xfs_log_force(mp, (xfs_lsn_t)0,
 890                                                 XFS_LOG_FORCE | XFS_LOG_SYNC);
 891                         }
 892
 893                 } else {
 894                         xfs_trans_t     *tp;
 895
 896                         /*
 897                          * O_SYNC or O_DSYNC _with_ a size update are handled
 898                          * the same way.
 899                          *
 900                          * If the write was synchronous then we need to make
 901                          * sure that the inode modification time is permanent.
 902                          * We'll have updated the timestamp above, so here
 903                          * we use a synchronous transaction to log the inode.
 904                          * It's not fast, but it's necessary.
 905                          *
 906                          * If this a dsync write and the size got changed
 907                          * non-transactionally, then we need to ensure that
 908                          * the size change gets logged in a synchronous
 909                          * transaction.
 910                          */
 911
 912                         tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
 913                         if ((error = xfs_trans_reserve(tp, 0,
 914                                                       XFS_SWRITE_LOG_RES(mp),
 915                                                       0, 0, 0))) {
 916                                 /* Transaction reserve failed */
 917                                 xfs_trans_cancel(tp, 0);
 918                         } else {
 919                                 /* Transaction reserve successful */
 920                                 xfs_ilock(xip, XFS_ILOCK_EXCL);
 921                                 xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
 922                                 xfs_trans_ihold(tp, xip);
 923                                 xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
 924                                 xfs_trans_set_sync(tp);
 925                                 error = xfs_trans_commit(tp, 0, (xfs_lsn_t)0);
 926                                 xfs_iunlock(xip, XFS_ILOCK_EXCL);
 927                         }
 928                 }
 929         } /* (ioflags & O_SYNC) */
 930
 931         xfs_rwunlock(bdp, locktype);
 932         return(ret);
 933 }
 934
 935 /*
 936  * All xfs metadata buffers except log state machine buffers
 937  * get this attached as their b_bdstrat callback function.
 938  * This is so that we can catch a buffer
 939  * after prematurely unpinning it to forcibly shutdown the filesystem.
 940  */
 941 int
 942 xfs_bdstrat_cb(struct xfs_buf *bp)
 943 {
 944         xfs_mount_t     *mp;
 945
 946         mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
 947         if (!XFS_FORCED_SHUTDOWN(mp)) {
 948                 pagebuf_iorequest(bp);
 949                 return 0;
 950         } else {
 951                 xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
 952                 /*
 953                  * Metadata write that didn't get logged but
 954                  * written delayed anyway. These aren't associated
 955                  * with a transaction, and can be ignored.
 956                  */
 957                 if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
 958                     (XFS_BUF_ISREAD(bp)) == 0)
 959                         return (xfs_bioerror_relse(bp));
 960                 else
 961                         return (xfs_bioerror(bp));
 962         }
 963 }
 964
 965
 966 int
 967 xfs_bmap(bhv_desc_t     *bdp,
 968         xfs_off_t       offset,
 969         ssize_t         count,
 970         int             flags,
 971         xfs_iomap_t     *iomapp,
 972         int             *niomaps)
 973 {
 974         xfs_inode_t     *ip = XFS_BHVTOI(bdp);
 975         xfs_iocore_t    *io = &ip->i_iocore;
 976
 977         ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
 978         ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
 979                ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0));
 980
 981         return xfs_iomap(io, offset, count, flags, iomapp, niomaps);
 982 }
 983
 984 /*
 985  * Wrapper around bdstrat so that we can stop data
 986  * from going to disk in case we are shutting down the filesystem.
 987  * Typically user data goes thru this path; one of the exceptions
 988  * is the superblock.
 989  */
 990 int
 991 xfsbdstrat(
 992         struct xfs_mount        *mp,
 993         struct xfs_buf          *bp)
 994 {
 995         ASSERT(mp);
 996         if (!XFS_FORCED_SHUTDOWN(mp)) {
 997                 /* Grio redirection would go here
 998                  * if (XFS_BUF_IS_GRIO(bp)) {
 999                  */
1000
1001                 pagebuf_iorequest(bp);
1002                 return 0;
1003         }
1004
1005         xfs_buftrace("XFSBDSTRAT IOERROR", bp);
1006         return (xfs_bioerror_relse(bp));
1007 }
1008
1009 /*
1010  * If the underlying (data/log/rt) device is readonly, there are some
1011  * operations that cannot proceed.
1012  */
1013 int
1014 xfs_dev_is_read_only(
1015         xfs_mount_t             *mp,
1016         char                    *message)
1017 {
1018         if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
1019             xfs_readonly_buftarg(mp->m_logdev_targp) ||
1020             (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
1021                 cmn_err(CE_NOTE,
1022                         "XFS: %s required on read-only device.", message);
1023                 cmn_err(CE_NOTE,
1024                         "XFS: write access unavailable, cannot proceed.");
1025                 return EROFS;
1026         }
1027         return 0;
1028 }