/*
 * Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like.  Any license provided herein, whether implied or
 * otherwise, applies only to this software file.  Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */
#include "xfs.h"
#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_sb.h"
#include "xfs_dir.h"
#include "xfs_dir2.h"
#include "xfs_trans.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
#include "xfs_attr_sf.h"
#include "xfs_dir_sf.h"
#include "xfs_dir2_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_error.h"
#include "xfs_rw.h"
#include "xfs_iomap.h"
#include <linux/mpage.h>
STATIC void xfs_count_page_state(struct page *, int *, int *, int *);
STATIC void xfs_convert_page(struct inode *, struct page *,
		xfs_iomap_t *, void *, int, int);
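/*
 * NB: the forward declaration of xfs_convert_page() is needed because
 * xfs_map_unwritten() and xfs_convert_page() call into each other
 * while walking the pages covered by a single unwritten extent.
 */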
#if defined(XFS_RW_TRACE)
void
xfs_page_trace(
	int		tag,
	struct inode	*inode,
	struct page	*page,
	int		mask)
{
	xfs_inode_t	*ip;
	bhv_desc_t	*bdp;
	vnode_t		*vp = LINVFS_GET_VP(inode);
	loff_t		isize = i_size_read(inode);
	loff_t		offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
	int		delalloc = -1, unmapped = -1, unwritten = -1;

	if (page_has_buffers(page))
		xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);

	bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops);
	ip = XFS_BHVTOI(bdp);

	ktrace_enter(ip->i_rwtrace,
		(void *)((unsigned long)tag),
		(void *)ip,
		(void *)inode,
		(void *)page,
		(void *)((unsigned long)mask),
		(void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
		(void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
		(void *)((unsigned long)((isize >> 32) & 0xffffffff)),
		(void *)((unsigned long)(isize & 0xffffffff)),
		(void *)((unsigned long)((offset >> 32) & 0xffffffff)),
		(void *)((unsigned long)(offset & 0xffffffff)),
		(void *)((unsigned long)delalloc),
		(void *)((unsigned long)unmapped),
		(void *)((unsigned long)unwritten),
		(void *)NULL,
		(void *)NULL);
}
#else
#define xfs_page_trace(tag, inode, page, mask)
#endif
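/*
 * Completion handler for buffers written over an unwritten extent
 * (installed as bh->b_end_io, presumably via set_buffer_unwritten_io()).
 * Each such buffer holds a count on pb->pb_io_remaining of the pagebuf
 * tracking the extent conversion; whoever drops the count to zero
 * fires pagebuf_iodone(), which runs linvfs_unwritten_convert().
 */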
STATIC void
linvfs_unwritten_done(
	struct buffer_head	*bh,
	int			uptodate)
{
	xfs_buf_t		*pb = (xfs_buf_t *)bh->b_private;

	ASSERT(buffer_unwritten(bh));
	bh->b_end_io = NULL;
	clear_buffer_unwritten(bh);
	if (!uptodate)
		pagebuf_ioerror(pb, EIO);
	if (atomic_dec_and_test(&pb->pb_io_remaining))
		pagebuf_iodone(pb, 1, 1);
	end_buffer_async_write(bh, uptodate);
}
/*
 * Issue transactions to convert a buffer range from unwritten
 * to written extents (buffered IO).
 */
STATIC void
linvfs_unwritten_convert(
	xfs_buf_t	*bp)
{
	vnode_t		*vp = XFS_BUF_FSPRIVATE(bp, vnode_t *);
	int		error;

	BUG_ON(atomic_read(&bp->pb_hold) < 1);
	VOP_BMAP(vp, XFS_BUF_OFFSET(bp), XFS_BUF_SIZE(bp),
			BMAPI_UNWRITTEN, NULL, NULL, error);
	XFS_BUF_SET_FSPRIVATE(bp, NULL);
	XFS_BUF_CLR_IODONE_FUNC(bp);
	XFS_BUF_UNDATAIO(bp);
	iput(LINVFS_GET_IP(vp));
	pagebuf_iodone(bp, 0, 0);
}
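/*
 * NB: the iput() above pairs with the igrab() taken in
 * xfs_map_unwritten(), which keeps the inode from being reclaimed
 * while unwritten extent I/O is still outstanding against it.
 */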
/*
 * Issue transactions to convert a buffer range from unwritten
 * to written extents (direct IO).
 */
STATIC void
linvfs_unwritten_convert_direct(
	struct inode	*inode,
	loff_t		offset,
	ssize_t		size,
	void		*private)
{
	ASSERT(!private || inode == (struct inode *)private);

	/* private indicates an unwritten extent lay beneath this IO,
	 * see linvfs_get_block_core.
	 */
	if (private && size > 0) {
		vnode_t	*vp = LINVFS_GET_VP(inode);
		int	error;

		VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error);
	}
}
STATIC int
xfs_map_blocks(
	struct inode		*inode,
	loff_t			offset,
	ssize_t			count,
	xfs_iomap_t		*iomapp,
	int			flags)
{
	vnode_t			*vp = LINVFS_GET_VP(inode);
	int			error, niomaps = 1;

	if (((flags & (BMAPI_DIRECT|BMAPI_SYNC)) == BMAPI_DIRECT) &&
	    (offset >= i_size_read(inode)))
		count = max_t(ssize_t, count, XFS_WRITE_IO_LOG);
retry:
	VOP_BMAP(vp, offset, count, flags, iomapp, &niomaps, error);
	if ((error == EAGAIN) || (error == EIO))
		return -error;
	if (unlikely((flags & (BMAPI_WRITE|BMAPI_DIRECT)) ==
					(BMAPI_WRITE|BMAPI_DIRECT) && niomaps &&
					(iomapp->iomap_flags & IOMAP_DELAY))) {
		flags = BMAPI_ALLOCATE;
		goto retry;
	}
	if (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
		VMODIFY(vp);
	}
	return -error;
}
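/*
 * NB: VOP_BMAP() hands back positive XFS error codes; xfs_map_blocks()
 * negates them on return so that callers such as
 * xfs_page_state_convert() see negative Linux errnos (hence the
 * -EAGAIN checks further down in this file).
 */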
/*
 * Finds the corresponding mapping in the block @map array for the
 * given @offset within a @page.
 */
STATIC xfs_iomap_t *
xfs_offset_to_map(
	struct page		*page,
	xfs_iomap_t		*iomapp,
	unsigned long		offset)
{
	loff_t			full_offset;	/* offset from start of file */

	ASSERT(offset < PAGE_CACHE_SIZE);

	full_offset = page->index;		/* NB: using 64bit number */
	full_offset <<= PAGE_CACHE_SHIFT;	/* offset from file start */
	full_offset += offset;			/* offset from page start */

	if (full_offset < iomapp->iomap_offset)
		return NULL;
	if (iomapp->iomap_offset + iomapp->iomap_bsize > full_offset)
		return iomapp;
	return NULL;
}
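/*
 * Example (illustrative values): with 4k pages, page->index == 3 and
 * an in-page offset of 512 give full_offset == 12800.  A mapping with
 * iomap_offset == 8192 and iomap_bsize == 8192 covers [8192, 16384),
 * so the lookup succeeds; an offset at or beyond 16384 returns NULL.
 */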
STATIC void
xfs_map_at_offset(
	struct page		*page,
	struct buffer_head	*bh,
	unsigned long		offset,
	int			block_bits,
	xfs_iomap_t		*iomapp)
{
	xfs_daddr_t		bn;
	loff_t			delta;
	int			sector_shift;

	ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE));
	ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY));
	ASSERT(iomapp->iomap_bn != IOMAP_DADDR_NULL);

	delta = page->index;
	delta <<= PAGE_CACHE_SHIFT;
	delta += offset;
	delta -= iomapp->iomap_offset;
	delta >>= block_bits;

	sector_shift = block_bits - BBSHIFT;
	bn = iomapp->iomap_bn >> sector_shift;
	bn += delta;
	ASSERT((bn << sector_shift) >= iomapp->iomap_bn);

	lock_buffer(bh);
	bh->b_blocknr = bn;
	bh->b_bdev = iomapp->iomap_target->pbr_bdev;
	set_buffer_mapped(bh);
	clear_buffer_delay(bh);
}
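/*
 * Worked example for the arithmetic above (illustrative values): with
 * 4k filesystem blocks (block_bits == 12) and 512-byte basic blocks
 * (BBSHIFT == 9), sector_shift is 3, so an iomap_bn of 80 basic blocks
 * converts to filesystem block 10; "delta", the distance of this
 * buffer from iomap_offset in filesystem blocks, is then added to
 * produce b_blocknr.
 */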
/*
 * Look for a page at index which is unlocked and contains our
 * unwritten extent flagged buffers at its head.  Returns page
 * locked and with an extra reference count, and length of the
 * unwritten extent component on this page that we can write,
 * in units of filesystem blocks.
 */
STATIC struct page *
xfs_probe_unwritten_page(
	struct address_space	*mapping,
	pgoff_t			index,
	xfs_iomap_t		*iomapp,
	xfs_buf_t		*pb,
	unsigned long		max_offset,
	unsigned long		*fsbs,
	unsigned int		bbits)
{
	struct page		*page;

	page = find_trylock_page(mapping, index);
	if (!page)
		return NULL;
	if (PageWriteback(page))
		goto out;

	if (page->mapping && page_has_buffers(page)) {
		struct buffer_head	*bh, *head;
		unsigned long		p_offset = 0;

		*fsbs = 0;
		bh = head = page_buffers(page);
		do {
			if (!buffer_unwritten(bh))
				break;
			if (!xfs_offset_to_map(page, iomapp, p_offset))
				break;
			if (p_offset >= max_offset)
				break;
			xfs_map_at_offset(page, bh, p_offset, bbits, iomapp);
			set_buffer_unwritten_io(bh);
			bh->b_private = pb;
			p_offset += bh->b_size;
			(*fsbs)++;
		} while ((bh = bh->b_this_page) != head);

		if (p_offset)
			return page;
	}

out:
	unlock_page(page);
	return NULL;
}
/*
 * Look for a page at index which is unlocked and not mapped
 * yet - clustering for mmap write case.
 */
STATIC unsigned int
xfs_probe_unmapped_page(
	struct address_space	*mapping,
	pgoff_t			index,
	unsigned int		pg_offset)
{
	struct page		*page;
	int			ret = 0;

	page = find_trylock_page(mapping, index);
	if (!page)
		return 0;
	if (PageWriteback(page))
		goto out;

	if (page->mapping && PageDirty(page)) {
		if (page_has_buffers(page)) {
			struct buffer_head	*bh, *head;

			bh = head = page_buffers(page);
			do {
				if (buffer_mapped(bh) || !buffer_uptodate(bh))
					break;
				ret += bh->b_size;
				if (ret >= pg_offset)
					break;
			} while ((bh = bh->b_this_page) != head);
		} else
			ret = PAGE_CACHE_SIZE;
	}

out:
	unlock_page(page);
	return ret;
}
STATIC unsigned int
xfs_probe_unmapped_cluster(
	struct inode		*inode,
	struct page		*startpage,
	struct buffer_head	*bh,
	struct buffer_head	*head)
{
	pgoff_t			tindex, tlast, tloff;
	unsigned int		pg_offset, len, total = 0;
	struct address_space	*mapping = inode->i_mapping;

	/* First sum forwards in this page */
	do {
		if (buffer_mapped(bh))
			break;
		total += bh->b_size;
	} while ((bh = bh->b_this_page) != head);

	/* If we reached the end of the page, sum forwards in
	 * following pages.
	 */
	if (bh == head) {
		tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
		/* Prune this back to avoid pathological behavior */
		tloff = min(tlast, startpage->index + 64);
		for (tindex = startpage->index + 1; tindex < tloff; tindex++) {
			len = xfs_probe_unmapped_page(mapping, tindex,
							PAGE_CACHE_SIZE);
			if (!len)
				break;
			total += len;
		}
		if (tindex == tlast &&
		    (pg_offset = i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
			total += xfs_probe_unmapped_page(mapping,
							tindex, pg_offset);
		}
	}
	return total;
}
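/*
 * NB: the "startpage->index + 64" prune above bounds how far a single
 * probe clusters ahead, and so how large one allocation request can
 * get; with 4k pages that is at most 256k per cluster (illustrative
 * arithmetic, the 64-page cap is from the code above).
 */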
/*
 * Probe for a given page (index) in the inode and test if it is delayed
 * and without unwritten buffers.  Returns page locked and with an extra
 * reference count.
 */
STATIC struct page *
xfs_probe_delalloc_page(
	struct inode		*inode,
	pgoff_t			index)
{
	struct page		*page;

	page = find_trylock_page(inode->i_mapping, index);
	if (!page)
		return NULL;
	if (PageWriteback(page))
		goto out;

	if (page->mapping && page_has_buffers(page)) {
		struct buffer_head	*bh, *head;
		int			acceptable = 0;

		bh = head = page_buffers(page);
		do {
			if (buffer_unwritten(bh)) {
				acceptable = 0;
				break;
			} else if (buffer_delay(bh)) {
				acceptable = 1;
			}
		} while ((bh = bh->b_this_page) != head);

		if (acceptable)
			return page;
	}

out:
	unlock_page(page);
	return NULL;
}
STATIC int
xfs_map_unwritten(
	struct inode		*inode,
	struct page		*start_page,
	struct buffer_head	*head,
	struct buffer_head	*curr,
	unsigned long		p_offset,
	int			block_bits,
	xfs_iomap_t		*iomapp,
	int			startio,
	int			all_bh)
{
	struct buffer_head	*bh = curr;
	xfs_iomap_t		*tmp;
	xfs_buf_t		*pb;
	loff_t			offset, size;
	unsigned long		nblocks = 0;

	offset = start_page->index;
	offset <<= PAGE_CACHE_SHIFT;
	offset += p_offset;

	/* get an "empty" pagebuf to manage IO completion
	 * Proper values will be set before returning */
	pb = pagebuf_lookup(iomapp->iomap_target, 0, 0, 0);
	if (!pb)
		return -EAGAIN;

	/* Take a reference to the inode to prevent it from
	 * being reclaimed while we have outstanding unwritten
	 * extent IO on it.
	 */
	if ((igrab(inode)) != inode) {
		pagebuf_free(pb);
		return -EAGAIN;
	}

	/* Set the count to 1 initially, this will stop an I/O
	 * completion callout which happens before we have started
	 * all the I/O from calling pagebuf_iodone too early.
	 */
	atomic_set(&pb->pb_io_remaining, 1);

	/* First map forwards in the page consecutive buffers
	 * covering this unwritten extent
	 */
	do {
		if (!buffer_unwritten(bh))
			break;
		tmp = xfs_offset_to_map(start_page, iomapp, p_offset);
		if (!tmp)
			break;
		xfs_map_at_offset(start_page, bh, p_offset, block_bits, iomapp);
		set_buffer_unwritten_io(bh);
		bh->b_private = pb;
		p_offset += bh->b_size;
		nblocks++;
	} while ((bh = bh->b_this_page) != head);

	atomic_add(nblocks, &pb->pb_io_remaining);

	/* If we reached the end of the page, map forwards in any
	 * following pages which are also covered by this extent.
	 */
	if (bh == head) {
		struct address_space	*mapping = inode->i_mapping;
		pgoff_t			tindex, tloff, tlast;
		unsigned long		bs;
		unsigned int		pg_offset, bbits = inode->i_blkbits;
		struct page		*page;

		tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
		tloff = (iomapp->iomap_offset + iomapp->iomap_bsize) >> PAGE_CACHE_SHIFT;
		tloff = min(tlast, tloff);
		for (tindex = start_page->index + 1; tindex < tloff; tindex++) {
			page = xfs_probe_unwritten_page(mapping,
						tindex, iomapp, pb,
						PAGE_CACHE_SIZE, &bs, bbits);
			if (!page)
				break;
			nblocks += bs;
			atomic_add(bs, &pb->pb_io_remaining);
			xfs_convert_page(inode, page, iomapp, pb,
					startio, all_bh);
			/* stop if converting the next page might add
			 * enough blocks that the corresponding byte
			 * count won't fit in our ulong page buf length */
			if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits))
				goto enough;
		}

		if (tindex == tlast &&
		    (pg_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)))) {
			page = xfs_probe_unwritten_page(mapping,
							tindex, iomapp, pb,
							pg_offset, &bs, bbits);
			if (page) {
				nblocks += bs;
				atomic_add(bs, &pb->pb_io_remaining);
				xfs_convert_page(inode, page, iomapp, pb,
						startio, all_bh);
				if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits))
					goto enough;
			}
		}
	}

enough:
	size = nblocks;		/* NB: using 64bit number here */
	size <<= block_bits;	/* convert fsb's to byte range */

	XFS_BUF_DATAIO(pb);
	XFS_BUF_SET_SIZE(pb, size);
	XFS_BUF_SET_COUNT(pb, size);
	XFS_BUF_SET_OFFSET(pb, offset);
	XFS_BUF_SET_FSPRIVATE(pb, LINVFS_GET_VP(inode));
	XFS_BUF_SET_IODONE_FUNC(pb, linvfs_unwritten_convert);

	if (atomic_dec_and_test(&pb->pb_io_remaining))
		pagebuf_iodone(pb, 1, 1);

	return 0;
}
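/*
 * NB: pb_io_remaining is biased to 1 before any buffer I/O is set up
 * and the bias is only dropped at the very end of xfs_map_unwritten(),
 * so the conversion pagebuf cannot complete while buffers are still
 * being mapped; if every buffer already completed, the final
 * atomic_dec_and_test() fires pagebuf_iodone() itself.
 */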
STATIC void
xfs_submit_page(
	struct page		*page,
	struct buffer_head	*bh_arr[],
	int			cnt)
{
	struct buffer_head	*bh;
	int			i;

	BUG_ON(PageWriteback(page));
	set_page_writeback(page);
	clear_page_dirty(page);
	unlock_page(page);

	if (cnt) {
		for (i = 0; i < cnt; i++) {
			bh = bh_arr[i];
			mark_buffer_async_write(bh);
			if (buffer_unwritten(bh))
				set_buffer_unwritten_io(bh);
			set_buffer_uptodate(bh);
			clear_buffer_dirty(bh);
		}

		for (i = 0; i < cnt; i++)
			submit_bh(WRITE, bh_arr[i]);
	} else {
		end_page_writeback(page);
	}
}
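/*
 * NB: all buffers are marked async-write before the first submit_bh()
 * above, so an early I/O completion cannot find a page with unmarked
 * buffers and end page writeback prematurely.
 */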
/*
 * Allocate & map buffers for page given the extent map.  Write it out.
 * Except for the original page of a writepage, this is called on
 * delalloc/unwritten pages only; for the original page it is possible
 * that the page has no mapping at all.
 */
STATIC void
xfs_convert_page(
	struct inode		*inode,
	struct page		*page,
	xfs_iomap_t		*iomapp,
	void			*private,
	int			startio,
	int			all_bh)
{
	struct buffer_head	*bh_arr[MAX_BUF_PER_PAGE], *bh, *head;
	xfs_iomap_t		*mp = iomapp, *tmp;
	unsigned long		end, offset;
	pgoff_t			end_index;
	int			i = 0, index = 0;
	int			bbits = inode->i_blkbits;

	end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
	if (page->index < end_index) {
		end = PAGE_CACHE_SIZE;
	} else {
		end = i_size_read(inode) & (PAGE_CACHE_SIZE-1);
	}
	bh = head = page_buffers(page);
	do {
		offset = i << bbits;
		if (!(PageUptodate(page) || buffer_uptodate(bh)))
			continue;
		if (buffer_mapped(bh) && all_bh &&
		    !buffer_unwritten(bh) && !buffer_delay(bh)) {
			if (startio && (offset < end)) {
				lock_buffer(bh);
				bh_arr[index++] = bh;
			}
			continue;
		}
		tmp = xfs_offset_to_map(page, mp, offset);
		if (!tmp)
			continue;
		ASSERT(!(tmp->iomap_flags & IOMAP_HOLE));
		ASSERT(!(tmp->iomap_flags & IOMAP_DELAY));

		/* If this is a new unwritten extent buffer (i.e. one
		 * that we haven't passed in private data for), we must
		 * now map this buffer too.
		 */
		if (buffer_unwritten(bh) && !bh->b_end_io) {
			ASSERT(tmp->iomap_flags & IOMAP_UNWRITTEN);
			xfs_map_unwritten(inode, page, head, bh,
					offset, bbits, tmp, startio, all_bh);
		} else if (! (buffer_unwritten(bh) && buffer_locked(bh))) {
			xfs_map_at_offset(page, bh, offset, bbits, tmp);
			if (buffer_unwritten(bh)) {
				set_buffer_unwritten_io(bh);
				bh->b_private = private;
				ASSERT(private);
			}
		}
		if (startio && (offset < end)) {
			bh_arr[index++] = bh;
		} else {
			set_buffer_dirty(bh);
			unlock_buffer(bh);
			mark_buffer_dirty(bh);
		}
	} while (i++, (bh = bh->b_this_page) != head);

	if (startio) {
		xfs_submit_page(page, bh_arr, index);
	} else {
		unlock_page(page);
	}
}
/*
 * Convert & write out a cluster of pages in the same extent as defined
 * by mp and following the start page.
 */
STATIC void
xfs_cluster_write(
	struct inode		*inode,
	pgoff_t			tindex,
	xfs_iomap_t		*iomapp,
	int			startio,
	int			all_bh)
{
	pgoff_t			tlast;
	struct page		*page;

	tlast = (iomapp->iomap_offset + iomapp->iomap_bsize) >> PAGE_CACHE_SHIFT;
	for (; tindex < tlast; tindex++) {
		page = xfs_probe_delalloc_page(inode, tindex);
		if (!page)
			break;
		xfs_convert_page(inode, page, iomapp, NULL, startio, all_bh);
	}
}
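/*
 * NB: the cluster walk stops at the first index for which
 * xfs_probe_delalloc_page() fails, so only a contiguous run of
 * delalloc pages immediately following the start page gets converted.
 */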
/*
 * Calling this without startio set means we are being asked to make a dirty
 * page ready for freeing its buffers.  When called with startio set then
 * we are coming from writepage.
 *
 * When called with startio set it is important that we write the WHOLE
 * page if possible.
 * The bh->b_state's cannot know if any of the blocks or which block for
 * that matter are dirty due to mmap writes, and therefore bh uptodate is
 * only valid if the page itself isn't completely uptodate.  Some layers
 * may clear the page dirty flag prior to calling write page, under the
 * assumption the entire page will be written out; by not writing out the
 * whole page the page can be reused before all valid dirty data is
 * written out.  Note: in the case of a page that has been dirtied by
 * mapwrite but only partially set up by block_prepare_write, the
 * bh->b_states will not agree and only the ones set up by BPW/BCW will
 * have valid state; thus the whole page must be written out.
 */
STATIC int
xfs_page_state_convert(
	struct inode	*inode,
	struct page	*page,
	int		startio,
	int		unmapped) /* also implies page uptodate */
{
	struct buffer_head	*bh_arr[MAX_BUF_PER_PAGE], *bh, *head;
	xfs_iomap_t		*iomp, iomap;
	unsigned long		p_offset = 0;
	pgoff_t			end_index;
	loff_t			offset;
	unsigned long long	end_offset;
	int			len, err, i, cnt = 0, uptodate = 1;
	int			flags = startio ? 0 : BMAPI_TRYLOCK;
	int			page_dirty = 1;

	/* Are we off the end of the file ? */
	end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
	if (page->index >= end_index) {
		if ((page->index >= end_index + 1) ||
		    !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
			err = -EIO;
			goto error;
		}
	}

	offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
	end_offset = min_t(unsigned long long,
			offset + PAGE_CACHE_SIZE, i_size_read(inode));

	bh = head = page_buffers(page);
	iomp = NULL;

	len = bh->b_size;
	do {
		if (offset >= end_offset)
			break;
		if (!buffer_uptodate(bh))
			uptodate = 0;
		if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio)
			continue;

		if (iomp)
			iomp = xfs_offset_to_map(page, &iomap, p_offset);

		/*
		 * First case, map an unwritten extent and prepare for
		 * extent state conversion transaction on completion.
		 */
		if (buffer_unwritten(bh)) {
			if (!iomp) {
				err = xfs_map_blocks(inode, offset, len, &iomap,
						BMAPI_READ|BMAPI_IGNSTATE);
				if (err)
					goto error;
				iomp = xfs_offset_to_map(page, &iomap,
								p_offset);
			}
			if (iomp && startio) {
				if (!bh->b_end_io) {
					err = xfs_map_unwritten(inode, page,
							head, bh, p_offset,
							inode->i_blkbits, iomp,
							startio, unmapped);
					if (err)
						goto error;
				}
				bh_arr[cnt++] = bh;
				page_dirty = 0;
			}
		/*
		 * Second case, allocate space for a delalloc buffer.
		 * We can return EAGAIN here in the release page case.
		 */
		} else if (buffer_delay(bh)) {
			if (!iomp) {
				err = xfs_map_blocks(inode, offset, len, &iomap,
						BMAPI_ALLOCATE | flags);
				if (err)
					goto error;
				iomp = xfs_offset_to_map(page, &iomap,
								p_offset);
			}
			if (iomp) {
				xfs_map_at_offset(page, bh, p_offset,
						inode->i_blkbits, iomp);
				if (startio) {
					bh_arr[cnt++] = bh;
				} else {
					set_buffer_dirty(bh);
					unlock_buffer(bh);
					mark_buffer_dirty(bh);
				}
				page_dirty = 0;
			}
		} else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
			   (unmapped || startio)) {

			if (!buffer_mapped(bh)) {
				int	size;

				/*
				 * Getting here implies an unmapped buffer
				 * was found, and we are in a path where we
				 * need to write the whole page out.
				 */
				if (!iomp) {
					size = xfs_probe_unmapped_cluster(
							inode, page, bh, head);
					err = xfs_map_blocks(inode, offset,
							size, &iomap,
							BMAPI_WRITE|BMAPI_MMAP);
					if (err)
						goto error;
					iomp = xfs_offset_to_map(page, &iomap,
								     p_offset);
				}
				if (iomp) {
					xfs_map_at_offset(page,
							bh, p_offset,
							inode->i_blkbits, iomp);
					if (startio) {
						bh_arr[cnt++] = bh;
					} else {
						set_buffer_dirty(bh);
						unlock_buffer(bh);
						mark_buffer_dirty(bh);
					}
					page_dirty = 0;
				}
			} else if (startio) {
				if (buffer_uptodate(bh) &&
				    !test_and_set_bit(BH_Lock, &bh->b_state)) {
					bh_arr[cnt++] = bh;
					page_dirty = 0;
				}
			}
		}
	} while (offset += len, p_offset += len,
		 ((bh = bh->b_this_page) != head));

	if (uptodate && bh == head)
		SetPageUptodate(page);

	if (startio)
		xfs_submit_page(page, bh_arr, cnt);

	if (iomp)
		xfs_cluster_write(inode, page->index + 1, iomp, startio, unmapped);

	return page_dirty;

error:
	for (i = 0; i < cnt; i++) {
		unlock_buffer(bh_arr[i]);
	}

	/*
	 * If it's delalloc and we have nowhere to put it,
	 * throw it away, unless the lower layers told
	 * us to try again.
	 */
	if (err != -EAGAIN) {
		if (!unmapped)
			block_invalidatepage(page, 0);
		ClearPageUptodate(page);
	}
	return err;
}
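/*
 * NB: on success xfs_page_state_convert() returns the page's remaining
 * dirty state (0 once all work has been mapped and queued);
 * linvfs_release_page() below uses this to decide whether the buffers
 * may be freed.
 */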
STATIC int
linvfs_get_block_core(
	struct inode		*inode,
	sector_t		iblock,
	unsigned long		blocks,
	struct buffer_head	*bh_result,
	int			create,
	int			direct,
	bmapi_flags_t		flags)
{
	vnode_t			*vp = LINVFS_GET_VP(inode);
	xfs_iomap_t		iomap;
	int			retpbbm = 1;
	int			error;
	ssize_t			size;
	loff_t			offset = (loff_t)iblock << inode->i_blkbits;

	/* If we are doing writes at the end of the file,
	 * allocate in chunks
	 */
	if (blocks)
		size = blocks << inode->i_blkbits;
	else if (create && (offset >= i_size_read(inode)))
		size = 1 << XFS_WRITE_IO_LOG;
	else
		size = 1 << inode->i_blkbits;

	VOP_BMAP(vp, offset, size,
		create ? flags : BMAPI_READ, &iomap, &retpbbm, error);
	if (error)
		return -error;

	if (retpbbm == 0)
		return 0;

	if (iomap.iomap_bn != IOMAP_DADDR_NULL) {
		xfs_daddr_t	bn;
		loff_t		delta;

		/* For unwritten extents do not report a disk address on
		 * the read case (treat as if we're reading into a hole).
		 */
		if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) {
			delta = offset - iomap.iomap_offset;
			delta >>= inode->i_blkbits;

			bn = iomap.iomap_bn >> (inode->i_blkbits - BBSHIFT);
			bn += delta;

			bh_result->b_blocknr = bn;
			bh_result->b_bdev = iomap.iomap_target->pbr_bdev;
			set_buffer_mapped(bh_result);
		}
		if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) {
			if (direct)
				bh_result->b_private = inode;
			set_buffer_unwritten(bh_result);
			set_buffer_delay(bh_result);
		}
	}

	/* If this is a realtime file, data might be on a new device */
	bh_result->b_bdev = iomap.iomap_target->pbr_bdev;

	/* If we previously allocated a block out beyond eof and
	 * we are now coming back to use it then we will need to
	 * flag it as new even if it has a disk address.
	 */
	if (create &&
	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
	     (offset >= i_size_read(inode)) || (iomap.iomap_flags & IOMAP_NEW))) {
		set_buffer_new(bh_result);
	}

	if (iomap.iomap_flags & IOMAP_DELAY) {
		if (unlikely(direct))
			BUG();
		if (create) {
			set_buffer_mapped(bh_result);
			set_buffer_uptodate(bh_result);
		}
		bh_result->b_bdev = iomap.iomap_target->pbr_bdev;
		set_buffer_delay(bh_result);
	}

	if (blocks) {
		loff_t	iosize;

		iosize = (iomap.iomap_bsize - iomap.iomap_delta);
		bh_result->b_size =
		    (ssize_t)min(iosize, (loff_t)(blocks << inode->i_blkbits));
	}

	return 0;
}
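/*
 * NB: in the direct I/O case (blocks != 0) b_size is trimmed above so
 * that one get_blocks answer never extends past the mapping returned
 * by VOP_BMAP.
 */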
int
linvfs_get_block(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return linvfs_get_block_core(inode, iblock, 0, bh_result,
					create, 0, BMAPI_WRITE);
}
STATIC int
linvfs_get_block_sync(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	return linvfs_get_block_core(inode, iblock, 0, bh_result,
					create, 0, BMAPI_SYNC|BMAPI_WRITE);
}
STATIC int
linvfs_get_blocks_direct(
	struct inode		*inode,
	sector_t		iblock,
	unsigned long		max_blocks,
	struct buffer_head	*bh_result,
	int			create)
{
	return linvfs_get_block_core(inode, iblock, max_blocks, bh_result,
					create, 1, BMAPI_WRITE|BMAPI_DIRECT);
}
STATIC ssize_t
linvfs_direct_IO(
	int			rw,
	struct kiocb		*iocb,
	const struct iovec	*iov,
	loff_t			offset,
	unsigned long		nr_segs)
{
	struct file	*file = iocb->ki_filp;
	struct inode	*inode = file->f_mapping->host;
	vnode_t		*vp = LINVFS_GET_VP(inode);
	xfs_iomap_t	iomap;
	int		maps = 1;
	int		error;

	VOP_BMAP(vp, offset, 0, BMAPI_DEVICE, &iomap, &maps, error);
	if (error)
		return -error;

	return blockdev_direct_IO_no_locking(rw, iocb, inode,
		iomap.iomap_target->pbr_bdev,
		iov, offset, nr_segs,
		linvfs_get_blocks_direct,
		linvfs_unwritten_convert_direct);
}
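/*
 * NB: the BMAPI_DEVICE query above only resolves which block device
 * backs this file (data vs. realtime volume), so the direct I/O can
 * be aimed at the right bdev before any blocks are mapped.
 */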
STATIC sector_t
linvfs_bmap(
	struct address_space	*mapping,
	sector_t		block)
{
	struct inode		*inode = (struct inode *)mapping->host;
	vnode_t			*vp = LINVFS_GET_VP(inode);
	int			error;

	vn_trace_entry(vp, "linvfs_bmap", (inst_t *)__return_address);

	VOP_RWLOCK(vp, VRWLOCK_READ);
	VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error);
	VOP_RWUNLOCK(vp, VRWLOCK_READ);
	return generic_block_bmap(mapping, block, linvfs_get_block);
}
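/*
 * NB: linvfs_bmap() flushes the file's dirty pages first because
 * FIBMAP callers expect on-disk block numbers, and delalloc space has
 * no block number until it is flushed and converted.
 */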
STATIC int
linvfs_readpage(
	struct file		*unused,
	struct page		*page)
{
	return mpage_readpage(page, linvfs_get_block);
}
STATIC int
linvfs_readpages(
	struct file		*unused,
	struct address_space	*mapping,
	struct list_head	*pages,
	unsigned		nr_pages)
{
	return mpage_readpages(mapping, pages, nr_pages, linvfs_get_block);
}
STATIC void
xfs_count_page_state(
	struct page		*page,
	int			*delalloc,
	int			*unmapped,
	int			*unwritten)
{
	struct buffer_head	*bh, *head;

	*delalloc = *unmapped = *unwritten = 0;

	bh = head = page_buffers(page);
	do {
		if (buffer_uptodate(bh) && !buffer_mapped(bh))
			(*unmapped) = 1;
		else if (buffer_unwritten(bh) && !buffer_delay(bh))
			clear_buffer_unwritten(bh);
		else if (buffer_unwritten(bh))
			(*unwritten) = 1;
		else if (buffer_delay(bh))
			(*delalloc) = 1;
	} while ((bh = bh->b_this_page) != head);
}
/*
 * writepage: Called from one of two places:
 *
 * 1. we are flushing a delalloc buffer head.
 *
 * 2. we are writing out a dirty page.  Typically the page dirty
 *    state is cleared before we get here.  In this case it is
 *    conceivable we have no buffer heads.
 *
 * For delalloc space on the page we need to allocate space and
 * flush it.  For unmapped buffer heads on the page we should
 * allocate space if the page is uptodate.  For any other dirty
 * buffer heads on the page we should flush them.
 *
 * If we detect that a transaction would be required to flush
 * the page, we have to check the process flags first; if we
 * are already in a transaction or disk I/O during allocations
 * is off, we need to fail the writepage and redirty the page.
 */
STATIC int
linvfs_writepage(
	struct page		*page,
	struct writeback_control *wbc)
{
	int			error;
	int			need_trans;
	int			delalloc, unmapped, unwritten;
	struct inode		*inode = page->mapping->host;

	xfs_page_trace(XFS_WRITEPAGE_ENTER, inode, page, 0);

	/*
	 * We need a transaction if:
	 * 1. There are delalloc buffers on the page
	 * 2. The page is uptodate and we have unmapped buffers
	 * 3. The page is uptodate and we have no buffers
	 * 4. There are unwritten buffers on the page
	 */
	if (!page_has_buffers(page)) {
		unmapped = 1;
		need_trans = 1;
	} else {
		xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
		if (!PageUptodate(page))
			unmapped = 0;
		need_trans = delalloc + unmapped + unwritten;
	}

	/*
	 * If we need a transaction and the process flags say
	 * we are already in a transaction, or no IO is allowed,
	 * then mark the page dirty again and leave the page
	 * as is.
	 */
	if (PFLAGS_TEST_FSTRANS() && need_trans)
		goto out_fail;

	/*
	 * Delay hooking up buffer heads until we have
	 * made our go/no-go decision.
	 */
	if (!page_has_buffers(page))
		create_empty_buffers(page, 1 << inode->i_blkbits, 0);

	/*
	 * Convert delayed allocate, unwritten or unmapped space
	 * to real space and flush out to disk.
	 */
	error = xfs_page_state_convert(inode, page, 1, unmapped);
	if (error == -EAGAIN)
		goto out_fail;
	if (unlikely(error < 0))
		goto out_unlock;

	return 0;

out_fail:
	set_page_dirty(page);
	unlock_page(page);
	return 0;
out_unlock:
	unlock_page(page);
	return error;
}
/*
 * Called to move a page into cleanable state - and from there
 * to be released.  Possibly the page is already clean.  We always
 * have buffer heads in this call.
 *
 * Returns nonzero if the page is ok to release, 0 otherwise.
 *
 * Possible scenarios are:
 *
 * 1. We are being called to release a page which has been written
 *    to via regular I/O.  buffer heads will be dirty and possibly
 *    delalloc.  If no delalloc buffer heads in this case then we
 *    can just return zero.
 *
 * 2. We are called to release a page which has been written via
 *    mmap, all we need to do is ensure there is no delalloc
 *    state in the buffer heads, if not we can let the caller
 *    free them and we should come back later via writepage.
 */
STATIC int
linvfs_release_page(
	struct page		*page,
	int			gfp_mask)
{
	struct inode		*inode = page->mapping->host;
	int			dirty, delalloc, unmapped, unwritten;

	xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, gfp_mask);

	xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
	if (!delalloc && !unwritten)
		goto free_buffers;

	if (!(gfp_mask & __GFP_FS))
		return 0;

	/* If we are already inside a transaction or the thread cannot
	 * do I/O, we cannot release this page.
	 */
	if (PFLAGS_TEST_FSTRANS())
		return 0;

	/*
	 * Convert delalloc space to real space, do not flush the
	 * data out to disk, that will be done by the caller.
	 * Never need to allocate space here - we will always
	 * come back to writepage in that case.
	 */
	dirty = xfs_page_state_convert(inode, page, 0, 0);
	if (dirty == 0 && !unwritten)
		goto free_buffers;
	return 0;

free_buffers:
	return try_to_free_buffers(page);
}
STATIC int
linvfs_prepare_write(
	struct file		*file,
	struct page		*page,
	unsigned int		from,
	unsigned int		to)
{
	if (file && (file->f_flags & O_SYNC)) {
		return block_prepare_write(page, from, to,
						linvfs_get_block_sync);
	} else {
		return block_prepare_write(page, from, to,
						linvfs_get_block);
	}
}
struct address_space_operations linvfs_aops = {
	.readpage		= linvfs_readpage,
	.readpages		= linvfs_readpages,
	.writepage		= linvfs_writepage,
	.sync_page		= block_sync_page,
	.releasepage		= linvfs_release_page,
	.prepare_write		= linvfs_prepare_write,
	.commit_write		= generic_commit_write,
	.bmap			= linvfs_bmap,
	.direct_IO		= linvfs_direct_IO,
};