+/**
+ * ntfs_write_mst_block - write a @page to the backing store
+ * @page:	page cache page to write out
+ * @wbc:	writeback control structure
+ *
+ * This function is for writing pages belonging to non-resident, mst protected
+ * attributes to their backing store.  The only supported attributes are index
+ * allocation and $MFT/$DATA.  Both directory inodes and index inodes are
+ * supported for the index allocation case.
+ *
+ * The page must remain locked for the duration of the write because we apply
+ * the mst fixups, write, and then undo the fixups, so if we were to unlock the
+ * page before undoing the fixups, any other user of the page will see the
+ * page contents as corrupt.  The page is unlocked on every return path.
+ *
+ * We clear the page uptodate flag for the duration of the function to ensure
+ * exclusion for the $MFT/$DATA case against someone mapping an mft record we
+ * are about to apply the mst fixups to.
+ *
+ * The function works in four phases:
+ *	1. Walk the page's buffers, mapping them if needed, and collect the
+ *	   buffers of all dirty records into a local array.
+ *	2. For each collected record, decide whether it may be written (mft
+ *	   case only) and apply the mst fixups.
+ *	3. Submit the buffers, wait for them, and for $MFT synchronize the mft
+ *	   mirror (before waiting if not called for sync, after otherwise).
+ *	4. Undo the mst fixups, drop any mft record locks taken in phase 2, and
+ *	   either redirty the page or cycle it through writeback.
+ *
+ * If more than one error occurs, the first error other than -ENOMEM is the
+ * one returned; -ENOMEM is only returned if nothing else went wrong.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Based on ntfs_write_block(), ntfs_mft_writepage(), and
+ * write_mft_record_nolock().
+ */
+static int ntfs_write_mst_block(struct page *page,
+		struct writeback_control *wbc)
+{
+	sector_t block, dblock, rec_block;
+	struct inode *vi = page->mapping->host;
+	ntfs_inode *ni = NTFS_I(vi);
+	ntfs_volume *vol = ni->vol;
+	u8 *kaddr;
+	unsigned int rec_size = ni->itype.index.block_size;
+	ntfs_inode *locked_nis[PAGE_CACHE_SIZE / rec_size];
+	struct buffer_head *bh, *head, *tbh, *rec_start_bh;
+	struct buffer_head *bhs[MAX_BUF_PER_PAGE];
+	runlist_element *rl;
+	int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2;
+	unsigned bh_size, rec_size_bits;
+	BOOL sync, is_mft, page_is_dirty, rec_is_dirty;
+	unsigned char bh_size_bits;
+
+	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
+			"0x%lx.", vi->i_ino, ni->type, page->index);
+	BUG_ON(!NInoNonResident(ni));
+	BUG_ON(!NInoMstProtected(ni));
+	is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino);
+	/*
+	 * NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page
+	 * in its page cache were to be marked dirty.  However this should
+	 * never happen with the current driver and considering we do not
+	 * handle this case here we do want to BUG(), at least for now.
+	 */
+	BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) ||
+			(NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
+	bh_size = vol->sb->s_blocksize;
+	bh_size_bits = vol->sb->s_blocksize_bits;
+	max_bhs = PAGE_CACHE_SIZE / bh_size;
+	BUG_ON(!max_bhs);
+	BUG_ON(max_bhs > MAX_BUF_PER_PAGE);
+
+	/* Were we called for sync purposes? */
+	sync = (wbc->sync_mode == WB_SYNC_ALL);
+
+	/* Make sure we have mapped buffers. */
+	bh = head = page_buffers(page);
+	BUG_ON(!bh);
+
+	rec_size_bits = ni->itype.index.block_size_bits;
+	BUG_ON(!(PAGE_CACHE_SIZE >> rec_size_bits));
+	bhs_per_rec = rec_size >> bh_size_bits;
+	BUG_ON(!bhs_per_rec);
+
+	/* The first block in the page. */
+	rec_block = block = (sector_t)page->index <<
+			(PAGE_CACHE_SHIFT - bh_size_bits);
+
+	/* The first out of bounds block for the data size. */
+	dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits;
+
+	rl = NULL;
+	err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0;
+	page_is_dirty = rec_is_dirty = FALSE;
+	rec_start_bh = NULL;
+	/*
+	 * Phase 1: collect the buffers of all dirty records into @bhs.
+	 *
+	 * Invariant: a non-NULL @rl means ni->runlist.lock is held for
+	 * reading; it is dropped after the loop.
+	 */
+	do {
+		BOOL is_retry = FALSE;
+
+		if (likely(block < rec_block)) {
+			if (unlikely(block >= dblock)) {
+				clear_buffer_dirty(bh);
+				set_buffer_uptodate(bh);
+				continue;
+			}
+			/*
+			 * This block is not the first one in the record.  We
+			 * ignore the buffer's dirty state because we could
+			 * have raced with a parallel mark_ntfs_record_dirty().
+			 */
+			if (!rec_is_dirty)
+				continue;
+			/*
+			 * An earlier buffer of this record failed to map, so
+			 * the record is not being written.
+			 */
+			if (unlikely(err2)) {
+				if (err2 != -ENOMEM)
+					clear_buffer_dirty(bh);
+				continue;
+			}
+		} else /* if (block == rec_block) */ {
+			BUG_ON(block > rec_block);
+			/* This block is the first one in the record. */
+			rec_block += bhs_per_rec;
+			err2 = 0;
+			if (unlikely(block >= dblock)) {
+				clear_buffer_dirty(bh);
+				continue;
+			}
+			if (!buffer_dirty(bh)) {
+				/* Clean records are not written out. */
+				rec_is_dirty = FALSE;
+				continue;
+			}
+			rec_is_dirty = TRUE;
+			rec_start_bh = bh;
+		}
+		/* Need to map the buffer if it is not mapped already. */
+		if (unlikely(!buffer_mapped(bh))) {
+			VCN vcn;
+			LCN lcn;
+			unsigned int vcn_ofs;
+
+			bh->b_bdev = vol->sb->s_bdev;
+			/* Obtain the vcn and offset of the current block. */
+			vcn = (VCN)block << bh_size_bits;
+			vcn_ofs = vcn & vol->cluster_size_mask;
+			vcn >>= vol->cluster_size_bits;
+			if (!rl) {
+lock_retry_remap:
+				down_read(&ni->runlist.lock);
+				rl = ni->runlist.rl;
+			}
+			if (likely(rl != NULL)) {
+				/* Seek to element containing target vcn. */
+				while (rl->length && rl[1].vcn <= vcn)
+					rl++;
+				lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
+			} else
+				lcn = LCN_RL_NOT_MAPPED;
+			/* Successful remap. */
+			if (likely(lcn >= 0)) {
+				/* Setup buffer head to correct block. */
+				bh->b_blocknr = ((lcn <<
+						vol->cluster_size_bits) +
+						vcn_ofs) >> bh_size_bits;
+				set_buffer_mapped(bh);
+			} else {
+				/*
+				 * Remap failed.  Retry to map the runlist once
+				 * unless we are working on $MFT which always
+				 * has the whole of its runlist in memory.
+				 */
+				if (!is_mft && !is_retry &&
+						lcn == LCN_RL_NOT_MAPPED) {
+					is_retry = TRUE;
+					/*
+					 * Attempt to map runlist, dropping
+					 * lock for the duration.
+					 */
+					up_read(&ni->runlist.lock);
+					err2 = ntfs_map_runlist(ni, vcn);
+					if (likely(!err2))
+						goto lock_retry_remap;
+					/*
+					 * The lock was dropped above and has
+					 * not been retaken, so forget the now
+					 * stale runlist pointer.  Otherwise
+					 * later buffers would walk the runlist
+					 * unlocked and the up_read() after the
+					 * loop would be unbalanced.
+					 */
+					rl = NULL;
+					if (err2 == -ENOMEM)
+						page_is_dirty = TRUE;
+					lcn = err2;
+				} else {
+					err2 = -EIO;
+					/*
+					 * The lock is held but there is no
+					 * runlist, so release it here as the
+					 * code after the loop only does so for
+					 * a non-NULL @rl.
+					 */
+					if (!rl)
+						up_read(&ni->runlist.lock);
+				}
+				/* Hard error.  Abort writing this record. */
+				if (!err || err == -ENOMEM)
+					err = err2;
+				bh->b_blocknr = -1;
+				/*
+				 * Report the record number in units of this
+				 * inode's record size (the same units used to
+				 * derive mft record numbers below).
+				 */
+				ntfs_error(vol->sb, "Cannot write ntfs record "
+						"0x%llx (inode 0x%lx, "
+						"attribute type 0x%x) because "
+						"its location on disk could "
+						"not be determined (error "
+						"code %lli).",
+						(long long)block <<
+						bh_size_bits >>
+						rec_size_bits,
+						ni->mft_no, ni->type,
+						(long long)lcn);
+				/*
+				 * If this is not the first buffer, remove the
+				 * buffers in this record from the list of
+				 * buffers to write and clear their dirty bit
+				 * if not error -ENOMEM.
+				 */
+				if (rec_start_bh != bh) {
+					while (bhs[--nr_bhs] != rec_start_bh)
+						;
+					if (err2 != -ENOMEM) {
+						do {
+							clear_buffer_dirty(
+								rec_start_bh);
+						} while ((rec_start_bh =
+								rec_start_bh->
+								b_this_page) !=
+								bh);
+					}
+				}
+				continue;
+			}
+		}
+		BUG_ON(!buffer_uptodate(bh));
+		BUG_ON(nr_bhs >= max_bhs);
+		bhs[nr_bhs++] = bh;
+	} while (block++, (bh = bh->b_this_page) != head);
+	if (unlikely(rl))
+		up_read(&ni->runlist.lock);
+	/* If there were no dirty buffers, we are done. */
+	if (!nr_bhs)
+		goto done;
+	/* Map the page so we can access its contents. */
+	kaddr = kmap(page);
+	/* Clear the page uptodate flag whilst the mst fixups are applied. */
+	BUG_ON(!PageUptodate(page));
+	ClearPageUptodate(page);
+	/* Phase 2: vet each collected record and apply the mst fixups. */
+	for (i = 0; i < nr_bhs; i++) {
+		unsigned int ofs;
+		int j;
+
+		/* Skip buffers which are not at the beginning of records. */
+		if (i % bhs_per_rec)
+			continue;
+		tbh = bhs[i];
+		ofs = bh_offset(tbh);
+		if (is_mft) {
+			ntfs_inode *tni;
+			unsigned long mft_no;
+
+			/* Get the mft record number. */
+			mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs)
+					>> rec_size_bits;
+			/* Check whether to write this mft record. */
+			tni = NULL;
+			if (!ntfs_may_write_mft_record(vol, mft_no,
+					(MFT_RECORD*)(kaddr + ofs), &tni)) {
+				/*
+				 * The record should not be written.  This
+				 * means we need to redirty the page before
+				 * returning.
+				 */
+				page_is_dirty = TRUE;
+				/*
+				 * Remove the buffers in this mft record from
+				 * the list of buffers to write.  A separate
+				 * index is used so that @i still refers to
+				 * this record when the for loop increments
+				 * it; advancing @i itself would make the loop
+				 * step over the start of the next record.
+				 */
+				j = i;
+				do {
+					bhs[j] = NULL;
+				} while (++j % bhs_per_rec && j < nr_bhs);
+				continue;
+			}
+			/*
+			 * The record should be written.  If a locked ntfs
+			 * inode was returned, add it to the array of locked
+			 * ntfs inodes.
+			 */
+			if (tni)
+				locked_nis[nr_locked_nis++] = tni;
+		}
+		/* Apply the mst protection fixups. */
+		err2 = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs),
+				rec_size);
+		if (unlikely(err2)) {
+			if (!err || err == -ENOMEM)
+				err = -EIO;
+			ntfs_error(vol->sb, "Failed to apply mst fixups "
+					"(inode 0x%lx, attribute type 0x%x, "
+					"page index 0x%lx, page offset 0x%x)!"
+					" Unmount and run chkdsk.", vi->i_ino,
+					ni->type, page->index, ofs);
+			/*
+			 * Mark all the buffers in this record clean as we do
+			 * not want to write corrupt data to disk.  As above,
+			 * @i is left alone so the next record is not skipped.
+			 */
+			j = i;
+			do {
+				clear_buffer_dirty(bhs[j]);
+				bhs[j] = NULL;
+			} while (++j % bhs_per_rec && j < nr_bhs);
+			continue;
+		}
+		nr_recs++;
+	}
+	/* If no records are to be written out, we are done. */
+	if (!nr_recs)
+		goto unm_done;
+	flush_dcache_page(page);
+	/* Phase 3: lock buffers and start synchronous write i/o on them. */
+	for (i = 0; i < nr_bhs; i++) {
+		tbh = bhs[i];
+		if (!tbh)
+			continue;
+		if (unlikely(test_set_buffer_locked(tbh)))
+			BUG();
+		/* The buffer dirty state is now irrelevant, just clean it. */
+		clear_buffer_dirty(tbh);
+		BUG_ON(!buffer_uptodate(tbh));
+		BUG_ON(!buffer_mapped(tbh));
+		get_bh(tbh);
+		tbh->b_end_io = end_buffer_write_sync;
+		submit_bh(WRITE, tbh);
+	}
+	/* Synchronize the mft mirror now if not @sync. */
+	if (is_mft && !sync)
+		goto do_mirror;
+do_wait:
+	/* Wait on i/o completion of buffers. */
+	for (i = 0; i < nr_bhs; i++) {
+		tbh = bhs[i];
+		if (!tbh)
+			continue;
+		wait_on_buffer(tbh);
+		if (unlikely(!buffer_uptodate(tbh))) {
+			ntfs_error(vol->sb, "I/O error while writing ntfs "
+					"record buffer (inode 0x%lx, "
+					"attribute type 0x%x, page index "
+					"0x%lx, page offset 0x%lx)! Unmount "
+					"and run chkdsk.", vi->i_ino, ni->type,
+					page->index, bh_offset(tbh));
+			if (!err || err == -ENOMEM)
+				err = -EIO;
+			/*
+			 * Set the buffer uptodate so the page and buffer
+			 * states do not become out of sync.
+			 */
+			set_buffer_uptodate(tbh);
+		}
+	}
+	/* If @sync, now synchronize the mft mirror. */
+	if (is_mft && sync) {
+do_mirror:
+		for (i = 0; i < nr_bhs; i++) {
+			unsigned long mft_no;
+			unsigned int ofs;
+
+			/*
+			 * Skip buffers which are not at the beginning of
+			 * records.
+			 */
+			if (i % bhs_per_rec)
+				continue;
+			tbh = bhs[i];
+			/* Skip removed buffers (and hence records). */
+			if (!tbh)
+				continue;
+			ofs = bh_offset(tbh);
+			/* Get the mft record number. */
+			mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs)
+					>> rec_size_bits;
+			if (mft_no < vol->mftmirr_size)
+				ntfs_sync_mft_mirror(vol, mft_no,
+						(MFT_RECORD*)(kaddr + ofs),
+						sync);
+		}
+		/* Not @sync: the buffers have not been waited on yet. */
+		if (!sync)
+			goto do_wait;
+	}
+	/* Phase 4: remove the mst protection fixups again. */
+	for (i = 0; i < nr_bhs; i++) {
+		if (!(i % bhs_per_rec)) {
+			tbh = bhs[i];
+			if (!tbh)
+				continue;
+			post_write_mst_fixup((NTFS_RECORD*)(kaddr +
+					bh_offset(tbh)));
+		}
+	}
+	flush_dcache_page(page);
+unm_done:
+	/* Unlock any locked inodes. */
+	while (nr_locked_nis-- > 0) {
+		ntfs_inode *tni, *base_tni;
+
+		tni = locked_nis[nr_locked_nis];
+		/* Get the base inode. */
+		mutex_lock(&tni->extent_lock);
+		if (tni->nr_extents >= 0)
+			base_tni = tni;
+		else {
+			base_tni = tni->ext.base_ntfs_ino;
+			BUG_ON(!base_tni);
+		}
+		mutex_unlock(&tni->extent_lock);
+		ntfs_debug("Unlocking %s inode 0x%lx.",
+				tni == base_tni ? "base" : "extent",
+				tni->mft_no);
+		mutex_unlock(&tni->mrec_lock);
+		atomic_dec(&tni->count);
+		iput(VFS_I(base_tni));
+	}
+	SetPageUptodate(page);
+	kunmap(page);
+done:
+	if (unlikely(err && err != -ENOMEM)) {
+		/*
+		 * Set page error if there is only one ntfs record in the page.
+		 * Otherwise we would lose per-record granularity.
+		 */
+		if (ni->itype.index.block_size == PAGE_CACHE_SIZE)
+			SetPageError(page);
+		NVolSetErrors(vol);
+	}
+	if (page_is_dirty) {
+		ntfs_debug("Page still contains one or more dirty ntfs "
+				"records. Redirtying the page starting at "
+				"record 0x%lx.", page->index <<
+				(PAGE_CACHE_SHIFT - rec_size_bits));
+		redirty_page_for_writepage(wbc, page);
+		unlock_page(page);
+	} else {
+		/*
+		 * Keep the VM happy.  This must be done otherwise the
+		 * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though
+		 * the page is clean.
+		 */
+		BUG_ON(PageWriteback(page));
+		set_page_writeback(page);
+		unlock_page(page);
+		end_page_writeback(page);
+	}
+	if (likely(!err))
+		ntfs_debug("Done.");
+	return err;
+}
+