fs/ntfs/mft.c

   1 /**
   2  * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
   3  *
   4  * Copyright (c) 2001-2004 Anton Altaparmakov
   5  * Copyright (c) 2002 Richard Russon
   6  *
   7  * This program/include file is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License as published
   9  * by the Free Software Foundation; either version 2 of the License, or
  10  * (at your option) any later version.
  11  *
  12  * This program/include file is distributed in the hope that it will be
  13  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
  14  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with this program (in the main directory of the Linux-NTFS
  19  * distribution in the file COPYING); if not, write to the Free Software
  20  * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21  */
  22
  23 #include <linux/swap.h>
  24
  25 #include "ntfs.h"
  26
  27 /**
  28  * __format_mft_record - initialize an empty mft record
  29  * @m:          mapped, pinned and locked for writing mft record
  30  * @size:       size of the mft record
  31  * @rec_no:     mft record number / inode number
  32  *
  33  * Private function to initialize an empty mft record. Use one of the two
  34  * provided format_mft_record() functions instead.
  35  */
  36 static void __format_mft_record(MFT_RECORD *m, const int size,
  37                 const unsigned long rec_no)
  38 {
  39         ATTR_RECORD *a;
  40
  41         memset(m, 0, size);
  42         m->magic = magic_FILE;
  43         /* Aligned to 2-byte boundary. */
  44         m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1);
  45         m->usa_count = cpu_to_le16(size / NTFS_BLOCK_SIZE + 1);
  46         /* Set the update sequence number to 1. */
  47         *(u16*)((char*)m + ((sizeof(MFT_RECORD) + 1) & ~1)) = cpu_to_le16(1);
  48         m->lsn = cpu_to_le64(0LL);
  49         m->sequence_number = cpu_to_le16(1);
  50         m->link_count = cpu_to_le16(0);
  51         /* Aligned to 8-byte boundary. */
  52         m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) +
  53                         (le16_to_cpu(m->usa_count) << 1) + 7) & ~7);
  54         m->flags = cpu_to_le16(0);
  55         /*
  56          * Using attrs_offset plus eight bytes (for the termination attribute),
  57          * aligned to 8-byte boundary.
  58          */
  59         m->bytes_in_use = cpu_to_le32((le16_to_cpu(m->attrs_offset) + 8 + 7) &
  60                         ~7);
  61         m->bytes_allocated = cpu_to_le32(size);
  62         m->base_mft_record = cpu_to_le64((MFT_REF)0);
  63         m->next_attr_instance = cpu_to_le16(0);
  64         a = (ATTR_RECORD*)((char*)m + le16_to_cpu(m->attrs_offset));
  65         a->type = AT_END;
  66         a->length = cpu_to_le32(0);
  67 }
  68
  69 /**
  70  * format_mft_record - initialize an empty mft record
  71  * @ni:         ntfs inode of mft record
  72  * @mft_rec:    mapped, pinned and locked mft record (optional)
  73  *
  74  * Initialize an empty mft record. This is used when extending the MFT.
  75  *
  76  * If @mft_rec is NULL, we call map_mft_record() to obtain the
  77  * record and we unmap it again when finished.
  78  *
  79  * We return 0 on success or -errno on error.
  80  */
  81 int format_mft_record(ntfs_inode *ni, MFT_RECORD *mft_rec)
  82 {
  83         MFT_RECORD *m;
  84
  85         if (mft_rec)
  86                 m = mft_rec;
  87         else {
  88                 m = map_mft_record(ni);
  89                 if (IS_ERR(m))
  90                         return PTR_ERR(m);
  91         }
  92         __format_mft_record(m, ni->vol->mft_record_size, ni->mft_no);
  93         if (!mft_rec) {
  94                 // FIXME: Need to set the mft record dirty!
  95                 unmap_mft_record(ni);
  96         }
  97         return 0;
  98 }
  99
  100 /**
  101  * ntfs_readpage - external declaration, function is in fs/ntfs/aops.c
       *
       * Used below as the ->readpage method of ntfs_mft_aops.
  102  */
  103 extern int ntfs_readpage(struct file *, struct page *);
  104
  105 #ifdef NTFS_RW
  106 /**
  107  * ntfs_mft_writepage - forward declaration, function is further below
       *
       * Only declared when write support (NTFS_RW) is compiled in.  It is needed
       * here because ntfs_mft_aops refers to it before its definition.
  108  */
  109 static int ntfs_mft_writepage(struct page *page, struct writeback_control *wbc);
  110 #endif /* NTFS_RW */
  111
  112 /**
  113  * ntfs_mft_aops - address space operations for access to $MFT
  114  *
  115  * Address space operations for access to $MFT. This allows us to simply use
  116  * ntfs_map_page() in map_mft_record_page().
       *
       * The ->writepage and ->set_page_dirty methods are only set when NTFS_RW is
       * defined; without it the table provides ->readpage and ->sync_page only.
       * The ->set_page_dirty method is __set_page_dirty_nobuffers(), the same
       * helper that __mark_mft_record_dirty() calls directly on the mft record
       * page.
  117  */
  118 struct address_space_operations ntfs_mft_aops = {
  119         .readpage       = ntfs_readpage,        /* Fill page with data. */
  120         .sync_page      = block_sync_page,      /* Currently, just unplugs the
  121                                                    disk request queue. */
  122 #ifdef NTFS_RW
  123         .writepage      = ntfs_mft_writepage,   /* Write out the dirty mft
  124                                                    records in a page. */
  125         .set_page_dirty = __set_page_dirty_nobuffers,   /* Set the page dirty
  126                                                    without touching the buffers
  127                                                    belonging to the page. */
  128 #endif /* NTFS_RW */
  129 };
 130
 131 /**
 132  * map_mft_record_page - map the page in which a specific mft record resides
 133  * @ni:         ntfs inode whose mft record page to map
 134  *
 135  * This maps the page in which the mft record of the ntfs inode @ni is situated
 136  * and returns a pointer to the mft record within the mapped page.
 137  *
 138  * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR()
 139  * contains the negative error code returned.
 140  */
 141 static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
 142 {
 143         ntfs_volume *vol = ni->vol;
 144         struct inode *mft_vi = vol->mft_ino;
 145         struct page *page;
 146         unsigned long index, ofs, end_index;
 147
 148         BUG_ON(ni->page);
 149         /*
 150          * The index into the page cache and the offset within the page cache
 151          * page of the wanted mft record. FIXME: We need to check for
 152          * overflowing the unsigned long, but I don't think we would ever get
 153          * here if the volume was that big...
 154          */
 155         index = ni->mft_no << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
 156         ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
 157
 158         /* The maximum valid index into the page cache for $MFT's data. */
 159         end_index = mft_vi->i_size >> PAGE_CACHE_SHIFT;
 160
 161         /* If the wanted index is out of bounds the mft record doesn't exist. */
 162         if (unlikely(index >= end_index)) {
 163                 if (index > end_index || (mft_vi->i_size & ~PAGE_CACHE_MASK) <
 164                                 ofs + vol->mft_record_size) {
 165                         page = ERR_PTR(-ENOENT);
 166                         goto err_out;
 167                 }
 168         }
 169         /* Read, map, and pin the page. */
 170         page = ntfs_map_page(mft_vi->i_mapping, index);
 171         if (likely(!IS_ERR(page))) {
 172                 ni->page = page;
 173                 ni->page_ofs = ofs;
 174                 return page_address(page) + ofs;
 175         }
 176 err_out:
 177         ni->page = NULL;
 178         ni->page_ofs = 0;
 179         ntfs_error(vol->sb, "Failed with error code %lu.", -PTR_ERR(page));
 180         return (void*)page;
 181 }
 182
 183 /**
 184  * map_mft_record - map, pin and lock an mft record
 185  * @ni:         ntfs inode whose MFT record to map
 186  *
 187  * First, take the mrec_lock semaphore. We might now be sleeping, while waiting
 188  * for the semaphore if it was already locked by someone else.
 189  *
 190  * The page of the record is mapped using map_mft_record_page() before being
 191  * returned to the caller.
 192  *
 193  * This in turn uses ntfs_map_page() to get the page containing the wanted mft
 194  * record (it in turn calls read_cache_page() which reads it in from disk if
 195  * necessary, increments the use count on the page so that it cannot disappear
 196  * under us and returns a reference to the page cache page).
 197  *
 198  * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it
 199  * sets PG_locked and clears PG_uptodate on the page. Once I/O has completed
 200  * and the post-read mst fixups on each mft record in the page have been
 201  * performed, the page gets PG_uptodate set and PG_locked cleared (this is done
 202  * in our asynchronous I/O completion handler end_buffer_read_mft_async()).
 203  * ntfs_map_page() waits for PG_locked to become clear and checks if
 204  * PG_uptodate is set and returns an error code if not. This provides
 205  * sufficient protection against races when reading/using the page.
 206  *
 207  * However there is the write mapping to think about. Doing the above described
 208  * checking here will be fine, because when initiating the write we will set
 209  * PG_locked and clear PG_uptodate making sure nobody is touching the page
 210  * contents. Doing the locking this way means that the commit to disk code in
 211  * the page cache code paths is automatically sufficiently locked with us as
 212  * we will not touch a page that has been locked or is not uptodate. The only
 213  * locking problem then is them locking the page while we are accessing it.
 214  *
 215  * So that code will end up having to own the mrec_lock of all mft
 216  * records/inodes present in the page before I/O can proceed. In that case we
 217  * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be
 218  * accessing anything without owning the mrec_lock semaphore. But we do need
 219  * to use them because of the read_cache_page() invocation and the code becomes
 220  * so much simpler this way that it is well worth it.
 221  *
 222  * The mft record is now ours and we return a pointer to it. You need to check
 223  * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return
 224  * the error code.
 225  *
 226  * NOTE: Caller is responsible for setting the mft record dirty before calling
 227  * unmap_mft_record(). This is obviously only necessary if the caller really
 228  * modified the mft record...
 229  * Q: Do we want to recycle one of the VFS inode state bits instead?
 230  * A: No, the inode ones mean we want to change the mft record, not we want to
 231  * write it out.
 232  */
 233 MFT_RECORD *map_mft_record(ntfs_inode *ni)
 234 {
 235         MFT_RECORD *m;
 236
 237         ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
 238
 239         /* Make sure the ntfs inode doesn't go away. */
 240         atomic_inc(&ni->count);
 241
 242         /* Serialize access to this mft record. */
 243         down(&ni->mrec_lock);
 244
 245         m = map_mft_record_page(ni);
 246         if (likely(!IS_ERR(m)))
 247                 return m;
 248
 249         up(&ni->mrec_lock);
 250         atomic_dec(&ni->count);
 251         ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m));
 252         return m;
 253 }
 254
 255 /**
 256  * unmap_mft_record_page - unmap the page in which a specific mft record resides
 257  * @ni:         ntfs inode whose mft record page to unmap
 258  *
 259  * This unmaps the page in which the mft record of the ntfs inode @ni is
 260  * situated and returns. This is a NOOP if highmem is not configured.
 261  *
 262  * The unmap happens via ntfs_unmap_page() which in turn decrements the use
 263  * count on the page thus releasing it from the pinned state.
 264  *
 265  * We do not actually unmap the page from memory of course, as that will be
 266  * done by the page cache code itself when memory pressure increases or
 267  * whatever.
 268  */
 269 static inline void unmap_mft_record_page(ntfs_inode *ni)
 270 {
 271         BUG_ON(!ni->page);
 272
 273         // TODO: If dirty, blah...
 274         ntfs_unmap_page(ni->page);
 275         ni->page = NULL;
 276         ni->page_ofs = 0;
 277         return;
 278 }
 279
 280 /**
 281  * unmap_mft_record - release a mapped mft record
 282  * @ni:         ntfs inode whose MFT record to unmap
 283  *
 284  * We release the page mapping and the mrec_lock mutex which unmaps the mft
 285  * record and releases it for others to get hold of. We also release the ntfs
 286  * inode by decrementing the ntfs inode reference count.
 287  *
 288  * NOTE: If caller has modified the mft record, it is imperative to set the mft
 289  * record dirty BEFORE calling unmap_mft_record().
 290  */
 291 void unmap_mft_record(ntfs_inode *ni)
 292 {
 293         struct page *page = ni->page;
 294
 295         BUG_ON(!page);
 296
 297         ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
 298
 299         unmap_mft_record_page(ni);
 300         up(&ni->mrec_lock);
 301         atomic_dec(&ni->count);
 302         /*
 303          * If pure ntfs_inode, i.e. no vfs inode attached, we leave it to
 304          * ntfs_clear_extent_inode() in the extent inode case, and to the
 305          * caller in the non-extent, yet pure ntfs inode case, to do the actual
 306          * tear down of all structures and freeing of all allocated memory.
 307          */
 308         return;
 309 }
 310
  311 /**
  312  * map_extent_mft_record - load an extent inode and attach it to its base
  313  * @base_ni:    base ntfs inode
  314  * @mref:       mft reference of the extent inode to load (in little endian)
  315  * @ntfs_ino:   on successful return, pointer to the ntfs_inode structure
  316  *
  317  * Load the extent mft record @mref and attach it to its base inode @base_ni.
  318  * Return the mapped extent mft record if IS_ERR(result) is false. Otherwise
  319  * PTR_ERR(result) gives the negative error code.
  320  *
  321  * On successful return, @ntfs_ino contains a pointer to the ntfs_inode
  322  * structure of the mapped extent inode.
       *
       * Error codes produced here are -ENOMEM (no memory for the new extent inode
       * or for the enlarged array of extent inode pointers) and -EIO (the sequence
       * number in the mapped record differs from the one in @mref).  An error
       * from map_mft_record() is passed through unchanged.  On error @ntfs_ino is
       * not written to.
       *
       * The base inode's extent_lock is held while the list of attached extent
       * inodes is searched and, if the extent inode is new, until it has been
       * attached.  A reference on @base_ni is held for the same period.
  323  */
  324 MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
  325                 ntfs_inode **ntfs_ino)
  326 {
  327         MFT_RECORD *m;
  328         ntfs_inode *ni = NULL;
  329         ntfs_inode **extent_nis = NULL;
  330         int i;
  331         unsigned long mft_no = MREF_LE(mref);
  332         u16 seq_no = MSEQNO_LE(mref);
  333         BOOL destroy_ni = FALSE;
  334
  335         ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).",
  336                         mft_no, base_ni->mft_no);
  337         /* Make sure the base ntfs inode doesn't go away. */
  338         atomic_inc(&base_ni->count);
  339         /*
  340          * Check if this extent inode has already been added to the base inode,
  341          * in which case just return it. If not found, add it to the base
  342          * inode before returning it.
  343          */
  344         down(&base_ni->extent_lock);
  345         if (base_ni->nr_extents > 0) {
  346                 extent_nis = base_ni->ext.extent_ntfs_inos;
  347                 for (i = 0; i < base_ni->nr_extents; i++) {
  348                         if (mft_no != extent_nis[i]->mft_no)
  349                                 continue;
  350                         ni = extent_nis[i];
  351                         /* Make sure the ntfs inode doesn't go away. */
  352                         atomic_inc(&ni->count);
  353                         break;
  354                 }
  355         }
  356         if (likely(ni != NULL)) {
                      /* Already attached: the base inode is no longer needed. */
  357                 up(&base_ni->extent_lock);
  358                 atomic_dec(&base_ni->count);
  359                 /* We found the record; just have to map and return it. */
  360                 m = map_mft_record(ni);
  361                 /* map_mft_record() has incremented this on success. */
                      /*
                       * So the reference taken in the search loop above can go
                       * in either case: on success the mapping holds its own
                       * reference, on failure map_mft_record() has already
                       * dropped the one it took.
                       */
  362                 atomic_dec(&ni->count);
  363                 if (likely(!IS_ERR(m))) {
  364                         /* Verify the sequence number. */
  365                         if (likely(le16_to_cpu(m->sequence_number) == seq_no)) {
  366                                 ntfs_debug("Done 1.");
  367                                 *ntfs_ino = ni;
  368                                 return m;
  369                         }
  370                         unmap_mft_record(ni);
  371                         ntfs_error(base_ni->vol->sb, "Found stale extent mft "
  372                                         "reference! Corrupt file system. "
  373                                         "Run chkdsk.");
  374                         return ERR_PTR(-EIO);
  375                 }
                      /*
                       * Shared error exit for a failed map_mft_record().  It is
                       * also entered by goto from the new extent inode path
                       * below, after that path has released the extent_lock,
                       * dropped the base inode reference and disposed of the new
                       * inode.
                       */
  376 map_err_out:
  377                 ntfs_error(base_ni->vol->sb, "Failed to map extent "
  378                                 "mft record, error code %ld.", -PTR_ERR(m));
  379                 return m;
  380         }
  381         /* Record wasn't there. Get a new ntfs inode and initialize it. */
  382         ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no);
  383         if (unlikely(!ni)) {
  384                 up(&base_ni->extent_lock);
  385                 atomic_dec(&base_ni->count);
  386                 return ERR_PTR(-ENOMEM);
  387         }
  388         ni->vol = base_ni->vol;
  389         ni->seq_no = seq_no;
              /*
               * A negative nr_extents marks an extent inode; its ext union then
               * holds the pointer to the base inode (this is what
               * __mark_mft_record_dirty() tests to find the base inode).
               */
  390         ni->nr_extents = -1;
  391         ni->ext.base_ntfs_ino = base_ni;
  392         /* Now map the record. */
              /* Note the base inode's extent_lock is still held across this. */
  393         m = map_mft_record(ni);
  394         if (unlikely(IS_ERR(m))) {
  395                 up(&base_ni->extent_lock);
  396                 atomic_dec(&base_ni->count);
  397                 ntfs_clear_extent_inode(ni);
  398                 goto map_err_out;
  399         }
  400         /* Verify the sequence number. */
  401         if (unlikely(le16_to_cpu(m->sequence_number) != seq_no)) {
  402                 ntfs_error(base_ni->vol->sb, "Found stale extent mft "
  403                                 "reference! Corrupt file system. Run chkdsk.");
  404                 destroy_ni = TRUE;
  405                 m = ERR_PTR(-EIO);
  406                 goto unm_err_out;
  407         }
  408         /* Attach extent inode to base inode, reallocating memory if needed. */
              /*
               * The pointer array grows four slots at a time, so a new array is
               * needed whenever the current number of extents is a multiple of
               * four (which includes zero, i.e. no array yet).
               */
  409         if (!(base_ni->nr_extents & 3)) {
  410                 ntfs_inode **tmp;
  411                 int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *);
  412
  413                 tmp = (ntfs_inode **)kmalloc(new_size, GFP_NOFS);
  414                 if (unlikely(!tmp)) {
  415                         ntfs_error(base_ni->vol->sb, "Failed to allocate "
  416                                         "internal buffer.");
  417                         destroy_ni = TRUE;
  418                         m = ERR_PTR(-ENOMEM);
  419                         goto unm_err_out;
  420                 }
  421                 if (base_ni->ext.extent_ntfs_inos) {
                              /*
                               * Carry over the existing pointers: new_size less
                               * the four new slots is exactly nr_extents
                               * pointers.
                               */
  422                         memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size -
  423                                         4 * sizeof(ntfs_inode *));
  424                         kfree(base_ni->ext.extent_ntfs_inos);
  425                 }
  426                 base_ni->ext.extent_ntfs_inos = tmp;
  427         }
  428         base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni;
  429         up(&base_ni->extent_lock);
  430         atomic_dec(&base_ni->count);
  431         ntfs_debug("Done 2.");
  432         *ntfs_ino = ni;
  433         return m;
              /*
               * Error exit for the new extent inode path once the record has been
               * mapped; m already holds the error to return.
               */
  434 unm_err_out:
  435         unmap_mft_record(ni);
  436         up(&base_ni->extent_lock);
  437         atomic_dec(&base_ni->count);
  438         /*
  439          * If the extent inode was not attached to the base inode we need to
  440          * release it or we will leak memory.
               * (Every jump to this label currently sets destroy_ni first.)
  441          */
  442         if (destroy_ni)
  443                 ntfs_clear_extent_inode(ni);
  444         return m;
  445 }
 446
 447 #ifdef NTFS_RW
 448
 449 /**
 450  * __mark_mft_record_dirty - set the mft record and the page containing it dirty
 451  * @ni:         ntfs inode describing the mapped mft record
 452  *
 453  * Internal function.  Users should call mark_mft_record_dirty() instead.
 454  *
 455  * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni,
 456  * as well as the page containing the mft record, dirty.  Also, mark the base
 457  * vfs inode dirty.  This ensures that any changes to the mft record are
 458  * written out to disk.
 459  *
 460  * NOTE:  We only set I_DIRTY_SYNC and I_DIRTY_DATASYNC (and not I_DIRTY_PAGES)
 461  * on the base vfs inode, because even though file data may have been modified,
 462  * it is dirty in the inode meta data rather than the data page cache of the
 463  * inode, and thus there are no data pages that need writing out.  Therefore, a
 464  * full mark_inode_dirty() is overkill.  A mark_inode_dirty_sync(), on the
 465  * other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to
 466  * ensure ->write_inode is called from generic_osync_inode() and this needs to
 467  * happen or the file data would not necessarily hit the device synchronously,
 468  * even though the vfs inode has the O_SYNC flag set.  Also, I_DIRTY_DATASYNC
 469  * simply "feels" better than just I_DIRTY_SYNC, since the file data has not
 470  * actually hit the block device yet, which is not what I_DIRTY_SYNC on its own
 471  * would suggest.
 472  */
 473 void __mark_mft_record_dirty(ntfs_inode *ni)
 474 {
 475         struct page *page = ni->page;
 476         ntfs_inode *base_ni;
 477
 478         ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
 479         BUG_ON(!page);
 480         BUG_ON(NInoAttr(ni));
 481
 482         /*
 483          * Set the page containing the mft record dirty.  This also marks the
 484          * $MFT inode dirty (I_DIRTY_PAGES).
 485          */
 486         __set_page_dirty_nobuffers(page);
 487
 488         /* Determine the base vfs inode and mark it dirty, too. */
 489         down(&ni->extent_lock);
 490         if (likely(ni->nr_extents >= 0))
 491                 base_ni = ni;
 492         else
 493                 base_ni = ni->ext.base_ntfs_ino;
 494         up(&ni->extent_lock);
 495         __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_SYNC | I_DIRTY_DATASYNC);
 496 }
 497
  /*
   * Request to report the occurrence to the developers.  It is appended, via a
   * trailing "%s", to the "not implemented yet" errors and the unexpected dirty
   * buffer warning emitted by the mft mirror synchronisation code below.
   */
  498 static const char *ntfs_please_email = "Please email "
  499                 "linux-ntfs-dev@lists.sourceforge.net and say that you saw "
  500                 "this message.  Thank you.";
 501
 502 /**
 503  * sync_mft_mirror_umount - synchronise an mft record to the mft mirror
 504  * @ni:         ntfs inode whose mft record to synchronize
 505  * @m:          mapped, mst protected (extent) mft record to synchronize
 506  *
 507  * Write the mapped, mst protected (extent) mft record @m described by the
 508  * (regular or extent) ntfs inode @ni to the mft mirror ($MFTMirr) bypassing
 509  * the page cache and the $MFTMirr inode itself.
 510  *
 511  * This function is only for use at umount time when the mft mirror inode has
 512  * already been disposed off.  We BUG() if we are called while the mft mirror
 513  * inode is still attached to the volume.
 514  *
 515  * On success return 0.  On error return -errno.
 516  *
 517  * NOTE:  This function is not implemented yet as I am not convinced it can
 518  * actually be triggered considering the sequence of commits we do in super.c::
 519  * ntfs_put_super().  But just in case we provide this place holder as the
 520  * alternative would be either to BUG() or to get a NULL pointer dereference
 521  * and Oops.
 522  */
 523 static int sync_mft_mirror_umount(ntfs_inode *ni, MFT_RECORD *m)
 524 {
 525         ntfs_volume *vol = ni->vol;
 526
 527         BUG_ON(vol->mftmirr_ino);
 528         ntfs_error(vol->sb, "Umount time mft mirror syncing is not "
 529                         "implemented yet.  %s", ntfs_please_email);
 530         return -EOPNOTSUPP;
 531 }
 532
  533 /**
  534  * sync_mft_mirror - synchronize an mft record to the mft mirror
  535  * @ni:         ntfs inode whose mft record to synchronize
  536  * @m:          mapped, mst protected (extent) mft record to synchronize
  537  * @sync:       if true, wait for i/o completion
  538  *
  539  * Write the mapped, mst protected (extent) mft record @m described by the
  540  * (regular or extent) ntfs inode @ni to the mft mirror ($MFTMirr).
  541  *
  542  * On success return 0.  On error return -errno and set the volume errors flag
  543  * in the ntfs_volume to which @ni belongs.
       *
       * Errors generated here are -EOPNOTSUPP (the mirror page has no buffers, or
       * a buffer covering the record is not mapped or not uptodate) and -EIO (a
       * buffer is not uptodate after the write).  Errors from
       * sync_mft_mirror_umount() and ntfs_map_page() are passed through.
       *
       * The record is copied into the page cache page of $MFTMirr and the
       * buffers covering it are then written with submit_bh() and waited for.
       * Afterwards the mst fixups are removed from the copy in the page again.
  544  *
  545  * NOTE:  We always perform synchronous i/o and ignore the @sync parameter.
  546  *
  547  * TODO:  If @sync is false, want to do truly asynchronous i/o, i.e. just
  548  * schedule i/o via ->writepage or do it via kntfsd or whatever.
  549  */
  550 static int sync_mft_mirror(ntfs_inode *ni, MFT_RECORD *m, int sync)
  551 {
  552         ntfs_volume *vol = ni->vol;
  553         struct page *page;
  554         unsigned int blocksize = vol->sb->s_blocksize;
              /* Number of device blocks that make up one mft record. */
  555         int max_bhs = vol->mft_record_size / blocksize;
              /*
               * NOTE(review): variable length array; its size is evaluated here,
               * before the BUG_ON(!max_bhs) sanity check below gets to run.
               */
  556         struct buffer_head *bhs[max_bhs];
  557         struct buffer_head *bh, *head;
  558         u8 *kmirr;
  559         unsigned int block_start, block_end, m_start, m_end;
  560         int i_bhs, nr_bhs, err = 0;
  561
  562         ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
  563         BUG_ON(!max_bhs);
  564         if (unlikely(!vol->mftmirr_ino)) {
  565                 /* This could happen during umount... */
  566                 err = sync_mft_mirror_umount(ni, m);
  567                 if (likely(!err))
  568                         return err;
  569                 goto err_out;
  570         }
  571         /* Get the page containing the mirror copy of the mft record @m. */
  572         page = ntfs_map_page(vol->mftmirr_ino->i_mapping, ni->mft_no >>
  573                         (PAGE_CACHE_SHIFT - vol->mft_record_size_bits));
  574         if (unlikely(IS_ERR(page))) {
  575                 ntfs_error(vol->sb, "Failed to map mft mirror page.");
  576                 err = PTR_ERR(page);
  577                 goto err_out;
  578         }
  579         /*
  580          * Exclusion against other writers.   This should never be a problem
  581          * since the page in which the mft record @m resides is also locked and
  582          * hence any other writers would be held up there but it is better to
  583          * make sure no one is writing from elsewhere.
  584          */
  585         lock_page(page);
  586         /* The address in the page of the mirror copy of the mft record @m. */
  587         kmirr = page_address(page) + ((ni->mft_no << vol->mft_record_size_bits)
  588                         & ~PAGE_CACHE_MASK);
  589         /* Copy the mst protected mft record to the mirror. */
  590         memcpy(kmirr, m, vol->mft_record_size);
  591         /* Make sure we have mapped buffers. */
  592         if (!page_has_buffers(page)) {
                      /*
                       * This label is also the target of the goto further down
                       * that handles page_buffers() returning NULL.
                       */
  593 no_buffers_err_out:
  594                 ntfs_error(vol->sb, "Writing mft mirror records without "
  595                                 "existing buffers is not implemented yet.  %s",
  596                                 ntfs_please_email);
  597                 err = -EOPNOTSUPP;
  598                 goto unlock_err_out;
  599         }
  600         bh = head = page_buffers(page);
  601         if (!bh)
  602                 goto no_buffers_err_out;
              /*
               * Walk all buffers of the page and collect in bhs[] those that
               * overlap the byte range [m_start, m_end) of the record copy.
               * block_start and block_end track the byte range within the page
               * of the buffer currently looked at.
               */
  603         nr_bhs = 0;
  604         block_start = 0;
  605         m_start = kmirr - (u8*)page_address(page);
  606         m_end = m_start + vol->mft_record_size;
  607         do {
  608                 block_end = block_start + blocksize;
  609                 /*
  610                  * If the buffer is outside the mft record, just skip it,
  611                  * clearing it if it is dirty to make sure it is not written
  612                  * out.  It should never be marked dirty but better be safe.
  613                  */
  614                 if ((block_end <= m_start) || (block_start >= m_end)) {
  615                         if (buffer_dirty(bh)) {
  616                                 ntfs_warning(vol->sb, "Clearing dirty mft "
  617                                                 "record page buffer.  %s",
  618                                                 ntfs_please_email);
  619                                 clear_buffer_dirty(bh);
  620                         }
                              /*
                               * A continue inside do-while jumps to the loop
                               * condition, so block_start is still advanced.
                               */
  621                         continue;
  622                 }
                      /*
                       * From here on the buffer belongs to the record.  A buffer
                       * that is not mapped or not uptodate makes the whole
                       * operation fail, but the walk goes on so that the
                       * remaining buffers are still examined.
                       */
  623                 if (!buffer_mapped(bh)) {
  624                         ntfs_error(vol->sb, "Writing mft mirror records "
  625                                         "without existing mapped buffers is "
  626                                         "not implemented yet.  %s",
  627                                         ntfs_please_email);
  628                         err = -EOPNOTSUPP;
  629                         continue;
  630                 }
  631                 if (!buffer_uptodate(bh)) {
  632                         ntfs_error(vol->sb, "Writing mft mirror records "
  633                                         "without existing uptodate buffers is "
  634                                         "not implemented yet.  %s",
  635                                         ntfs_please_email);
  636                         err = -EOPNOTSUPP;
  637                         continue;
  638                 }
                      /*
                       * Sanity checks: the first collected buffer must start
                       * exactly at the record, there must be room left in bhs[],
                       * and the buffer filling the last slot must end exactly at
                       * the end of the record.
                       */
  639                 BUG_ON(!nr_bhs && (m_start != block_start));
  640                 BUG_ON(nr_bhs >= max_bhs);
  641                 bhs[nr_bhs++] = bh;
  642                 BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
  643         } while (block_start = block_end, (bh = bh->b_this_page) != head);
  644         if (likely(!err)) {
  645                 /* Lock buffers and start synchronous write i/o on them. */
  646                 for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
  647                         struct buffer_head *tbh = bhs[i_bhs];
  648
                              /* Finding a buffer already locked is treated as a bug. */
  649                         if (unlikely(test_set_buffer_locked(tbh)))
  650                                 BUG();
  651                         BUG_ON(!buffer_uptodate(tbh));
  652                         if (buffer_dirty(tbh))
  653                                 clear_buffer_dirty(tbh);
                              /*
                               * Take a reference for the i/o; presumably it is
                               * dropped by end_buffer_write_sync() on completion
                               * - TODO confirm.
                               */
  654                         get_bh(tbh);
  655                         tbh->b_end_io = end_buffer_write_sync;
  656                         submit_bh(WRITE, tbh);
  657                 }
  658                 /* Wait on i/o completion of buffers. */
  659                 for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
  660                         struct buffer_head *tbh = bhs[i_bhs];
  661
  662                         wait_on_buffer(tbh);
                              /* A buffer that lost its uptodate state failed to write. */
  663                         if (unlikely(!buffer_uptodate(tbh))) {
  664                                 err = -EIO;
  665                                 /*
  666                                  * Set the buffer uptodate so the page & buffer
  667                                  * states don't become out of sync.
  668                                  */
  669                                 if (PageUptodate(page))
  670                                         set_buffer_uptodate(tbh);
  671                         }
  672                 }
  673         } else /* if (unlikely(err)) */ {
  674                 /* Clean the buffers. */
  675                 for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
  676                         clear_buffer_dirty(bhs[i_bhs]);
  677         }
  678 unlock_err_out:
  679         /* Current state: all buffers are clean, unlocked, and uptodate. */
  680         /* Remove the mst protection fixups again. */
  681         post_write_mst_fixup((NTFS_RECORD*)kmirr);
  682         flush_dcache_page(page);
  683         unlock_page(page);
  684         ntfs_unmap_page(page);
  685         if (unlikely(err)) {
  686                 /* I/O error during writing.  This is really bad! */
                      /*
                       * NOTE(review): this message is printed for any error that
                       * reaches this point, i.e. also for the -EOPNOTSUPP cases
                       * above in which no i/o was attempted.
                       */
  687                 ntfs_error(vol->sb, "I/O error while writing mft mirror "
  688                                 "record 0x%lx!  You should unmount the volume "
  689                                 "and run chkdsk or ntfsfix.", ni->mft_no);
  690                 goto err_out;
  691         }
  692         ntfs_debug("Done.");
  693         return 0;
              /* Common error exit; reached with or without the mirror page mapped. */
  694 err_out:
  695         ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error code %i).  "
  696                         "Volume will be left marked dirty on umount.  Run "
  697                         "ntfsfix on the partition after umounting to correct "
  698                         "this.", -err);
  699         /* We don't want to clear the dirty bit on umount. */
  700         NVolSetErrors(vol);
  701         return err;
  702 }
 703
 704 /**
 705  * write_mft_record_nolock - write out a mapped (extent) mft record
 706  * @ni:         ntfs inode describing the mapped (extent) mft record
 707  * @m:          mapped (extent) mft record to write
 708  * @sync:       if true, wait for i/o completion
 709  *
 710  * Write the mapped (extent) mft record @m described by the (regular or extent)
 711  * ntfs inode @ni to backing store.  If the mft record @m has a counterpart in
 712  * the mft mirror, that is also updated.
 713  *
 714  * On success, clean the mft record and return 0.  On error, leave the mft
 715  * record dirty and return -errno.  The caller should call make_bad_inode() on
 716  * the base inode to ensure no more access happens to this inode.  We do not do
 717  * it here as the caller may want to finish writing other extent mft records
 718  * first to minimize on-disk metadata inconsistencies.
 719  *
 720  * NOTE:  We always perform synchronous i/o and ignore the @sync parameter.
 721  * However, if the mft record has a counterpart in the mft mirror and @sync is
 722  * true, we write the mft record, wait for i/o completion, and only then write
 723  * the mft mirror copy.  This ensures that if the system crashes either the mft
 724  * or the mft mirror will contain a self-consistent mft record @m.  If @sync is
 725  * false on the other hand, we start i/o on both and then wait for completion
 726  * on them.  This provides a speedup but no longer guarantees that you will end
 727  * up with a self-consistent mft record in the case of a crash but if you asked
 728  * for asynchronous writing you probably do not care about that anyway.
 729  *
 730  * TODO:  If @sync is false, want to do truly asynchronous i/o, i.e. just
 731  * schedule i/o via ->writepage or do it via kntfsd or whatever.
 732  */
 733 int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
 734 {
 735         ntfs_volume *vol = ni->vol;
 736         struct page *page = ni->page;
 737         unsigned int blocksize = vol->sb->s_blocksize;
 738         int max_bhs = vol->mft_record_size / blocksize;
 739         struct buffer_head *bhs[max_bhs];
 740         struct buffer_head *bh, *head;
 741         unsigned int block_start, block_end, m_start, m_end;
 742         int i_bhs, nr_bhs, err = 0;
 743
 744         ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
 745         BUG_ON(NInoAttr(ni));
 746         BUG_ON(!max_bhs);
 747         BUG_ON(!PageLocked(page));
 748         /*
 749          * If the ntfs_inode is clean no need to do anything.  If it is dirty,
 750          * mark it as clean now so that it can be redirtied later on if needed.
 751          * There is no danger of races since the caller is holding the locks
 752          * for the mft record @m and the page it is in.
 753          */
 754         if (!NInoTestClearDirty(ni))
 755                 goto done;
 756         /* Make sure we have mapped buffers. */
 757         if (!page_has_buffers(page)) {
 758 no_buffers_err_out:
 759                 ntfs_error(vol->sb, "Writing mft records without existing "
 760                                 "buffers is not implemented yet.  %s",
 761                                 ntfs_please_email);
 762                 err = -EOPNOTSUPP;
 763                 goto err_out;
 764         }
 765         bh = head = page_buffers(page);
 766         if (!bh)
 767                 goto no_buffers_err_out;
 768         nr_bhs = 0;
 769         block_start = 0;
 770         m_start = ni->page_ofs;
 771         m_end = m_start + vol->mft_record_size;
 772         do {
 773                 block_end = block_start + blocksize;
 774                 /*
 775                  * If the buffer is outside the mft record, just skip it,
 776                  * clearing it if it is dirty to make sure it is not written
 777                  * out.  It should never be marked dirty but better be safe.
 778                  */
 779                 if ((block_end <= m_start) || (block_start >= m_end)) {
 780                         if (buffer_dirty(bh)) {
 781                                 ntfs_warning(vol->sb, "Clearing dirty mft "
 782                                                 "record page buffer.  %s",
 783                                                 ntfs_please_email);
 784                                 clear_buffer_dirty(bh);
 785                         }
 786                         continue;
 787                 }
 788                 if (!buffer_mapped(bh)) {
 789                         ntfs_error(vol->sb, "Writing mft records without "
 790                                         "existing mapped buffers is not "
 791                                         "implemented yet.  %s",
 792                                         ntfs_please_email);
 793                         err = -EOPNOTSUPP;
 794                         continue;
 795                 }
 796                 if (!buffer_uptodate(bh)) {
 797                         ntfs_error(vol->sb, "Writing mft records without "
 798                                         "existing uptodate buffers is not "
 799                                         "implemented yet.  %s",
 800                                         ntfs_please_email);
 801                         err = -EOPNOTSUPP;
 802                         continue;
 803                 }
 804                 BUG_ON(!nr_bhs && (m_start != block_start));
 805                 BUG_ON(nr_bhs >= max_bhs);
 806                 bhs[nr_bhs++] = bh;
 807                 BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
 808         } while (block_start = block_end, (bh = bh->b_this_page) != head);
 809         if (unlikely(err))
 810                 goto cleanup_out;
 811         /* Apply the mst protection fixups. */
 812         err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size);
 813         if (err) {
 814                 ntfs_error(vol->sb, "Failed to apply mst fixups!");
 815                 goto cleanup_out;
 816         }
 817         flush_dcache_mft_record_page(ni);
 818         /* Lock buffers and start synchronous write i/o on them. */
 819         for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
 820                 struct buffer_head *tbh = bhs[i_bhs];
 821
 822                 if (unlikely(test_set_buffer_locked(tbh)))
 823                         BUG();
 824                 BUG_ON(!buffer_uptodate(tbh));
 825                 if (buffer_dirty(tbh))
 826                         clear_buffer_dirty(tbh);
 827                 get_bh(tbh);
 828                 tbh->b_end_io = end_buffer_write_sync;
 829                 submit_bh(WRITE, tbh);
 830         }
 831         /* Synchronize the mft mirror now if not @sync. */
 832         if (!sync && ni->mft_no < vol->mftmirr_size)
 833                 sync_mft_mirror(ni, m, sync);
 834         /* Wait on i/o completion of buffers. */
 835         for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
 836                 struct buffer_head *tbh = bhs[i_bhs];
 837
 838                 wait_on_buffer(tbh);
 839                 if (unlikely(!buffer_uptodate(tbh))) {
 840                         err = -EIO;
 841                         /*
 842                          * Set the buffer uptodate so the page & buffer states
 843                          * don't become out of sync.
 844                          */
 845                         if (PageUptodate(page))
 846                                 set_buffer_uptodate(tbh);
 847                 }
 848         }
 849         /* If @sync, now synchronize the mft mirror. */
 850         if (sync && ni->mft_no < vol->mftmirr_size)
 851                 sync_mft_mirror(ni, m, sync);
 852         /* Remove the mst protection fixups again. */
 853         post_write_mst_fixup((NTFS_RECORD*)m);
 854         flush_dcache_mft_record_page(ni);
 855         if (unlikely(err)) {
 856                 /* I/O error during writing.  This is really bad! */
 857                 ntfs_error(vol->sb, "I/O error while writing mft record "
 858                                 "0x%lx!  Marking base inode as bad.  You "
 859                                 "should unmount the volume and run chkdsk.",
 860                                 ni->mft_no);
 861                 goto err_out;
 862         }
 863 done:
 864         ntfs_debug("Done.");
 865         return 0;
 866 cleanup_out:
 867         /* Clean the buffers. */
 868         for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
 869                 clear_buffer_dirty(bhs[i_bhs]);
 870 err_out:
 871         /*
 872          * Current state: all buffers are clean, unlocked, and uptodate.
 873          * The caller should mark the base inode as bad so that no more i/o
 874          * happens.  ->clear_inode() will still be invoked so all extent inodes
 875          * and other allocated memory will be freed.
 876          */
 877         if (err == -ENOMEM) {
 878                 ntfs_error(vol->sb, "Not enough memory to write mft record.  "
 879                                 "Redirtying so the write is retried later.");
 880                 mark_mft_record_dirty(ni);
 881                 err = 0;
 882         }
 883         return err;
 884 }
 885
 886 /**
 887  * ntfs_mft_writepage - check if a metadata page contains dirty mft records
 888  * @page:       metadata page possibly containing dirty mft records
 889  * @wbc:        writeback control structure
 890  *
 891  * This is called from the VM when it wants to have a dirty $MFT/$DATA metadata
 892  * page cache page cleaned.  The VM has already locked the page and marked it
 893  * clean.  Instead of writing the page as a conventional ->writepage function
 894  * would do, we check if the page still contains any dirty mft records (it must
 895  * have done at some point in the past since the page was marked dirty) and if
 896  * none are found, i.e. all mft records are clean, we unlock the page and
 897  * return.  The VM is then free to do with the page as it pleases.  If on the
 898  * other hand we do find any dirty mft records in the page, we redirty the page
 899  * before unlocking it and returning so the VM knows that the page is still
 900  * busy and cannot be thrown out.
 901  *
 902  * Note, we do not actually write any dirty mft records here because they are
 903  * dirty inodes and hence will be written by the VFS inode dirty code paths.
 904  * There is no need to write them from the VM page dirty code paths, too and in
 905  * fact once we implement journalling it would be a complete nightmare having
 906  * two code paths leading to mft record writeout.
 907  */
 908 static int ntfs_mft_writepage(struct page *page, struct writeback_control *wbc)
 909 {
 910         struct inode *mft_vi = page->mapping->host;
 911         struct super_block *sb = mft_vi->i_sb;
 912         ntfs_volume *vol = NTFS_SB(sb);
 913         u8 *maddr;
 914         MFT_RECORD *m;
 915         ntfs_inode **extent_nis;
 916         unsigned long mft_no;
 917         int nr, i, j;
 918         BOOL is_dirty = FALSE;
 919
 920         BUG_ON(!PageLocked(page));
 921         BUG_ON(PageWriteback(page));
 922         BUG_ON(mft_vi != vol->mft_ino);
 923         /* The first mft record number in the page. */
 924         mft_no = page->index << (PAGE_CACHE_SHIFT - vol->mft_record_size_bits);
 925         /* Number of mft records in the page. */
 926         nr = PAGE_CACHE_SIZE >> vol->mft_record_size_bits;
 927         BUG_ON(!nr);
 928         ntfs_debug("Entering for %i inodes starting at 0x%lx.", nr, mft_no);
 929         /* Iterate over the mft records in the page looking for a dirty one. */
 930         maddr = (u8*)kmap(page);
 931         for (i = 0; i < nr; ++i, ++mft_no, maddr += vol->mft_record_size) {
 932                 struct inode *vi;
 933                 ntfs_inode *ni, *eni;
 934                 ntfs_attr na;
 935
 936                 na.mft_no = mft_no;
 937                 na.name = NULL;
 938                 na.name_len = 0;
 939                 na.type = AT_UNUSED;
 940                 /*
 941                  * Check if the inode corresponding to this mft record is in
 942                  * the VFS inode cache and obtain a reference to it if it is.
 943                  */
 944                 ntfs_debug("Looking for inode 0x%lx in icache.", mft_no);
 945                 /*
 946                  * For inode 0, i.e. $MFT itself, we cannot use ilookup5() from
 947                  * here or we deadlock because the inode is already locked by
 948                  * the kernel (fs/fs-writeback.c::__sync_single_inode()) and
 949                  * ilookup5() waits until the inode is unlocked before
 950                  * returning it and it never gets unlocked because
 951                  * ntfs_mft_writepage() never returns.  )-:  Fortunately, we
 952                  * have inode 0 pinned in icache for the duration of the mount
 953                  * so we can access it directly.
 954                  */
 955                 if (!mft_no) {
 956                         /* Balance the below iput(). */
 957                         vi = igrab(mft_vi);
 958                         BUG_ON(vi != mft_vi);
 959                 } else
 960                         vi = ilookup5(sb, mft_no, (test_t)ntfs_test_inode, &na);
 961                 if (vi) {
 962                         ntfs_debug("Inode 0x%lx is in icache.", mft_no);
 963                         /* The inode is in icache.  Check if it is dirty. */
 964                         ni = NTFS_I(vi);
 965                         if (!NInoDirty(ni)) {
 966                                 /* The inode is not dirty, skip this record. */
 967                                 ntfs_debug("Inode 0x%lx is not dirty, "
 968                                                 "continuing search.", mft_no);
 969                                 iput(vi);
 970                                 continue;
 971                         }
 972                         ntfs_debug("Inode 0x%lx is dirty, aborting search.",
 973                                         mft_no);
 974                         /* The inode is dirty, no need to search further. */
 975                         iput(vi);
 976                         is_dirty = TRUE;
 977                         break;
 978                 }
 979                 ntfs_debug("Inode 0x%lx is not in icache.", mft_no);
 980                 /* The inode is not in icache. */
 981                 /* Skip the record if it is not a mft record (type "FILE"). */
 982                 if (!ntfs_is_mft_recordp(maddr)) {
 983                         ntfs_debug("Mft record 0x%lx is not a FILE record, "
 984                                         "continuing search.", mft_no);
 985                         continue;
 986                 }
 987                 m = (MFT_RECORD*)maddr;
 988                 /*
 989                  * Skip the mft record if it is not in use.  FIXME:  What about
 990                  * deleted/deallocated (extent) inodes?  (AIA)
 991                  */
 992                 if (!(m->flags & MFT_RECORD_IN_USE)) {
 993                         ntfs_debug("Mft record 0x%lx is not in use, "
 994                                         "continuing search.", mft_no);
 995                         continue;
 996                 }
 997                 /* Skip the mft record if it is a base inode. */
 998                 if (!m->base_mft_record) {
 999                         ntfs_debug("Mft record 0x%lx is a base record, "
1000                                         "continuing search.", mft_no);
1001                         continue;
1002                 }
1003                 /*
1004                  * This is an extent mft record.  Check if the inode
1005                  * corresponding to its base mft record is in icache.
1006                  */
1007                 na.mft_no = MREF_LE(m->base_mft_record);
1008                 ntfs_debug("Mft record 0x%lx is an extent record.  Looking "
1009                                 "for base inode 0x%lx in icache.", mft_no,
1010                                 na.mft_no);
1011                 vi = ilookup5(sb, na.mft_no, (test_t)ntfs_test_inode,
1012                                 &na);
1013                 if (!vi) {
1014                         /*
1015                          * The base inode is not in icache.  Skip this extent
1016                          * mft record.
1017                          */
1018                         ntfs_debug("Base inode 0x%lx is not in icache, "
1019                                         "continuing search.", na.mft_no);
1020                         continue;
1021                 }
1022                 ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no);
1023                 /*
1024                  * The base inode is in icache.  Check if it has the extent
1025                  * inode corresponding to this extent mft record attached.
1026                  */
1027                 ni = NTFS_I(vi);
1028                 down(&ni->extent_lock);
1029                 if (ni->nr_extents <= 0) {
1030                         /*
1031                          * The base inode has no attached extent inodes.  Skip
1032                          * this extent mft record.
1033                          */
1034                         up(&ni->extent_lock);
1035                         iput(vi);
1036                         continue;
1037                 }
1038                 /* Iterate over the attached extent inodes. */
1039                 extent_nis = ni->ext.extent_ntfs_inos;
1040                 for (eni = NULL, j = 0; j < ni->nr_extents; ++j) {
1041                         if (mft_no == extent_nis[j]->mft_no) {
1042                                 /*
1043                                  * Found the extent inode corresponding to this
1044                                  * extent mft record.
1045                                  */
1046                                 eni = extent_nis[j];
1047                                 break;
1048                         }
1049                 }
1050                 /*
1051                  * If the extent inode was not attached to the base inode, skip
1052                  * this extent mft record.
1053                  */
1054                 if (!eni) {
1055                         up(&ni->extent_lock);
1056                         iput(vi);
1057                         continue;
1058                 }
1059                 /*
1060                  * Found the extent inode corrsponding to this extent mft
1061                  * record.  If it is dirty, no need to search further.
1062                  */
1063                 if (NInoDirty(eni)) {
1064                         up(&ni->extent_lock);
1065                         iput(vi);
1066                         is_dirty = TRUE;
1067                         break;
1068                 }
1069                 /* The extent inode is not dirty, so do the next record. */
1070                 up(&ni->extent_lock);
1071                 iput(vi);
1072         }
1073         kunmap(page);
1074         /* If a dirty mft record was found, redirty the page. */
1075         if (is_dirty) {
1076                 ntfs_debug("Inode 0x%lx is dirty.  Redirtying the page "
1077                                 "starting at inode 0x%lx.", mft_no,
1078                                 page->index << (PAGE_CACHE_SHIFT -
1079                                 vol->mft_record_size_bits));
1080                 redirty_page_for_writepage(wbc, page);
1081                 unlock_page(page);
1082         } else {
1083                 /*
1084                  * Keep the VM happy.  This must be done otherwise the
1085                  * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though
1086                  * the page is clean.
1087                  */
1088                 BUG_ON(PageWriteback(page));
1089                 set_page_writeback(page);
1090                 unlock_page(page);
1091                 end_page_writeback(page);
1092         }
1093         ntfs_debug("Done.");
1094         return 0;
1095 }
1096
1097 #endif /* NTFS_RW */