2 * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
4 * Copyright (c) 2001-2004 Anton Altaparmakov
5 * Copyright (c) 2002 Richard Russon
7 * This program/include file is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as published
9 * by the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program/include file is distributed in the hope that it will be
13 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program (in the main directory of the Linux-NTFS
19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 #include <linux/swap.h>
28 * __format_mft_record - initialize an empty mft record
29 * @m: mapped, pinned and locked for writing mft record
30 * @size: size of the mft record
31 * @rec_no: mft record number / inode number
33 * Private function to initialize an empty mft record. Use one of the two
34 * provided format_mft_record() functions instead.
36 static void __format_mft_record(MFT_RECORD *m, const int size,
37 const unsigned long rec_no)
/*
 * NOTE(review): this chunk is a lossy extract -- original line numbers are
 * fused into each line and several statements are missing from view (e.g.
 * the declaration of the ATTR_RECORD pointer 'a' used below, the opening/
 * closing braces and any initial clearing of the record buffer).  Do not
 * treat it as compilable as-is.
 */
42 m->magic = magic_FILE;
43 /* Aligned to 2-byte boundary. */
44 m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1);
/* One usa slot per NTFS block covered by the record, plus one for the usn. */
45 m->usa_count = cpu_to_le16(size / NTFS_BLOCK_SIZE + 1);
46 /* Set the update sequence number to 1. */
47 *(u16*)((char*)m + ((sizeof(MFT_RECORD) + 1) & ~1)) = cpu_to_le16(1);
48 m->lsn = cpu_to_le64(0LL);
49 m->sequence_number = cpu_to_le16(1);
/* No directory entries reference this record yet. */
50 m->link_count = cpu_to_le16(0);
51 /* Aligned to 8-byte boundary. */
52 m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) +
53 (le16_to_cpu(m->usa_count) << 1) + 7) & ~7);
54 m->flags = cpu_to_le16(0);
56 * Using attrs_offset plus eight bytes (for the termination attribute),
57 * aligned to 8-byte boundary.
59 m->bytes_in_use = cpu_to_le32((le16_to_cpu(m->attrs_offset) + 8 + 7) &
61 m->bytes_allocated = cpu_to_le32(size);
/* Zero base reference: this is formatted as a base (non-extent) record. */
62 m->base_mft_record = cpu_to_le64((MFT_REF)0);
63 m->next_attr_instance = cpu_to_le16(0);
/*
 * Place the termination attribute at the attributes offset.  The type
 * assignment (presumably AT_END) is on a line missing from this extract.
 */
64 a = (ATTR_RECORD*)((char*)m + le16_to_cpu(m->attrs_offset));
66 a->length = cpu_to_le32(0);
70 * format_mft_record - initialize an empty mft record
71 * @ni: ntfs inode of mft record
72 * @mft_rec: mapped, pinned and locked mft record (optional)
74 * Initialize an empty mft record. This is used when extending the MFT.
76 * If @mft_rec is NULL, we call map_mft_record() to obtain the
77 * record and we unmap it again when finished.
79 * We return 0 on success or -errno on error.
81 int format_mft_record(ntfs_inode *ni, MFT_RECORD *mft_rec)
/*
 * NOTE(review): partial extract.  Only the @mft_rec == NULL path is visible;
 * the IS_ERR() check on map_mft_record(), the caller-supplied @mft_rec
 * branch, the unmap and the final return are missing from view.
 */
88 m = map_mft_record(ni);
92 __format_mft_record(m, ni->vol->mft_record_size, ni->mft_no);
94 // FIXME: Need to set the mft record dirty!
101 * ntfs_readpage - external declaration, function is in fs/ntfs/aops.c
103 extern int ntfs_readpage(struct file *, struct page *);
107 * ntfs_mft_writepage - forward declaration, function is further below
109 static int ntfs_mft_writepage(struct page *page, struct writeback_control *wbc);
/* Both declarations exist so the functions can be referenced from the
 * ntfs_mft_aops initializer below. */
113 * ntfs_mft_aops - address space operations for access to $MFT
115 * Address space operations for access to $MFT. This allows us to simply use
116 * ntfs_map_page() in map_mft_record_page().
118 struct address_space_operations ntfs_mft_aops = {
119 .readpage = ntfs_readpage, /* Fill page with data. */
120 .sync_page = block_sync_page, /* Currently, just unplugs the
121 disk request queue. */
123 .writepage = ntfs_mft_writepage, /* Write out the dirty mft
124 records in a page. */
/* NOTE(review): the closing "};" of this initializer is missing from this
 * extract. */
129 * map_mft_record_page - map the page in which a specific mft record resides
130 * @ni: ntfs inode whose mft record page to map
132 * This maps the page in which the mft record of the ntfs inode @ni is situated
133 * and returns a pointer to the mft record within the mapped page.
135 * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR()
136 * contains the negative error code returned.
138 static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
/*
 * NOTE(review): partial extract -- the declaration of 'page', the success
 * bookkeeping (presumably storing page/offset into @ni) and the error-path
 * labels/return are missing from view.
 */
140 ntfs_volume *vol = ni->vol;
141 struct inode *mft_vi = vol->mft_ino;
143 unsigned long index, ofs, end_index;
147 * The index into the page cache and the offset within the page cache
148 * page of the wanted mft record. FIXME: We need to check for
149 * overflowing the unsigned long, but I don't think we would ever get
150 * here if the volume was that big...
152 index = ni->mft_no << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
153 ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
155 /* The maximum valid index into the page cache for $MFT's data. */
156 end_index = mft_vi->i_size >> PAGE_CACHE_SHIFT;
158 /* If the wanted index is out of bounds the mft record doesn't exist. */
159 if (unlikely(index >= end_index)) {
/* For a partial last page, the whole record must still fit inside i_size. */
160 if (index > end_index || (mft_vi->i_size & ~PAGE_CACHE_MASK) <
161 ofs + vol->mft_record_size) {
162 page = ERR_PTR(-ENOENT);
166 /* Read, map, and pin the page. */
167 page = ntfs_map_page(mft_vi->i_mapping, index);
168 if (likely(!IS_ERR(page))) {
171 return page_address(page) + ofs;
176 ntfs_error(vol->sb, "Failed with error code %lu.", -PTR_ERR(page));
181 * map_mft_record - map, pin and lock an mft record
182 * @ni: ntfs inode whose MFT record to map
184 * First, take the mrec_lock semaphore. We might now be sleeping, while waiting
185 * for the semaphore if it was already locked by someone else.
187 * The page of the record is mapped using map_mft_record_page() before being
188 * returned to the caller.
190 * This in turn uses ntfs_map_page() to get the page containing the wanted mft
191 * record (it in turn calls read_cache_page() which reads it in from disk if
192 * necessary, increments the use count on the page so that it cannot disappear
193 * under us and returns a reference to the page cache page).
195 * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it
196 * sets PG_locked and clears PG_uptodate on the page. Once I/O has completed
197 * and the post-read mst fixups on each mft record in the page have been
198 * performed, the page gets PG_uptodate set and PG_locked cleared (this is done
199 * in our asynchronous I/O completion handler end_buffer_read_mft_async()).
200 * ntfs_map_page() waits for PG_locked to become clear and checks if
201 * PG_uptodate is set and returns an error code if not. This provides
202 * sufficient protection against races when reading/using the page.
204 * However there is the write mapping to think about. Doing the above described
205 * checking here will be fine, because when initiating the write we will set
206 * PG_locked and clear PG_uptodate making sure nobody is touching the page
207 * contents. Doing the locking this way means that the commit to disk code in
208 * the page cache code paths is automatically sufficiently locked with us as
209 * we will not touch a page that has been locked or is not uptodate. The only
210 * locking problem then is them locking the page while we are accessing it.
212 * So that code will end up having to own the mrec_lock of all mft
213 * records/inodes present in the page before I/O can proceed. In that case we
214 * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be
215 * accessing anything without owning the mrec_lock semaphore. But we do need
216 * to use them because of the read_cache_page() invocation and the code becomes
217 * so much simpler this way that it is well worth it.
219 * The mft record is now ours and we return a pointer to it. You need to check
220 * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return
223 * NOTE: Caller is responsible for setting the mft record dirty before calling
224 * unmap_mft_record(). This is obviously only necessary if the caller really
225 * modified the mft record...
226 * Q: Do we want to recycle one of the VFS inode state bits instead?
227 * A: No, the inode ones mean we want to change the mft record, not we want to
230 MFT_RECORD *map_mft_record(ntfs_inode *ni)
/*
 * NOTE(review): partial extract -- the declaration of 'm', the success
 * return, and the error path's release of mrec_lock (presumably an up())
 * are missing from view.
 */
234 ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
236 /* Make sure the ntfs inode doesn't go away. */
237 atomic_inc(&ni->count);
239 /* Serialize access to this mft record. */
240 down(&ni->mrec_lock);
242 m = map_mft_record_page(ni);
243 if (likely(!IS_ERR(m)))
/* Error path: drop the reference taken above before reporting failure. */
247 atomic_dec(&ni->count);
248 ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m));
253 * unmap_mft_record_page - unmap the page in which a specific mft record resides
254 * @ni: ntfs inode whose mft record page to unmap
256 * This unmaps the page in which the mft record of the ntfs inode @ni is
257 * situated and returns. This is a NOOP if highmem is not configured.
259 * The unmap happens via ntfs_unmap_page() which in turn decrements the use
260 * count on the page thus releasing it from the pinned state.
262 * We do not actually unmap the page from memory of course, as that will be
263 * done by the page cache code itself when memory pressure increases or
266 static inline void unmap_mft_record_page(ntfs_inode *ni)
/* NOTE(review): partial extract -- any sanity checks and the clearing of
 * ni->page after the unmap are missing from view. */
270 // TODO: If dirty, blah...
271 ntfs_unmap_page(ni->page);
278 * unmap_mft_record - release a mapped mft record
279 * @ni: ntfs inode whose MFT record to unmap
281 * We release the page mapping and the mrec_lock mutex which unmaps the mft
282 * record and releases it for others to get hold of. We also release the ntfs
283 * inode by decrementing the ntfs inode reference count.
285 * NOTE: If caller has modified the mft record, it is imperative to set the mft
286 * record dirty BEFORE calling unmap_mft_record().
288 void unmap_mft_record(ntfs_inode *ni)
/* NOTE(review): partial extract -- the BUG_ON(!page) check, the up() on
 * mrec_lock between unmap and the reference drop, and the closing brace are
 * missing from view. */
290 struct page *page = ni->page;
294 ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
296 unmap_mft_record_page(ni);
298 atomic_dec(&ni->count);
300 * If pure ntfs_inode, i.e. no vfs inode attached, we leave it to
301 * ntfs_clear_extent_inode() in the extent inode case, and to the
302 * caller in the non-extent, yet pure ntfs inode case, to do the actual
303 * tear down of all structures and freeing of all allocated memory.
309 * map_extent_mft_record - load an extent inode and attach it to its base
310 * @base_ni: base ntfs inode
311 * @mref: mft reference of the extent inode to load (in little endian)
312 * @ntfs_ino: on successful return, pointer to the ntfs_inode structure
314 * Load the extent mft record @mref and attach it to its base inode @base_ni.
315 * Return the mapped extent mft record if IS_ERR(result) is false. Otherwise
316 * PTR_ERR(result) gives the negative error code.
318 * On successful return, @ntfs_ino contains a pointer to the ntfs_inode
319 * structure of the mapped extent inode.
321 MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
322 ntfs_inode **ntfs_ino)
/*
 * NOTE(review): partial extract -- the declarations of 'm', 'i' and 'tmp',
 * several closing braces, the assignments to *ntfs_ino and the error-path
 * labels (including where destroy_ni gets set to TRUE) are missing from
 * view.
 */
325 ntfs_inode *ni = NULL;
326 ntfs_inode **extent_nis = NULL;
328 unsigned long mft_no = MREF_LE(mref);
329 u16 seq_no = MSEQNO_LE(mref);
330 BOOL destroy_ni = FALSE;
332 ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).",
333 mft_no, base_ni->mft_no);
334 /* Make sure the base ntfs inode doesn't go away. */
335 atomic_inc(&base_ni->count);
337 * Check if this extent inode has already been added to the base inode,
338 * in which case just return it. If not found, add it to the base
339 * inode before returning it.
341 down(&base_ni->extent_lock);
342 if (base_ni->nr_extents > 0) {
343 extent_nis = base_ni->ext.extent_ntfs_inos;
344 for (i = 0; i < base_ni->nr_extents; i++) {
345 if (mft_no != extent_nis[i]->mft_no)
348 /* Make sure the ntfs inode doesn't go away. */
349 atomic_inc(&ni->count);
353 if (likely(ni != NULL)) {
354 up(&base_ni->extent_lock);
355 atomic_dec(&base_ni->count);
356 /* We found the record; just have to map and return it. */
357 m = map_mft_record(ni);
358 /* map_mft_record() has incremented this on success. */
359 atomic_dec(&ni->count);
360 if (likely(!IS_ERR(m))) {
361 /* Verify the sequence number. */
362 if (likely(le16_to_cpu(m->sequence_number) == seq_no)) {
363 ntfs_debug("Done 1.");
/* Sequence number mismatch: caller holds a stale MFT_REF. */
367 unmap_mft_record(ni);
368 ntfs_error(base_ni->vol->sb, "Found stale extent mft "
369 "reference! Corrupt file system. "
371 return ERR_PTR(-EIO);
374 ntfs_error(base_ni->vol->sb, "Failed to map extent "
375 "mft record, error code %ld.", -PTR_ERR(m));
378 /* Record wasn't there. Get a new ntfs inode and initialize it. */
379 ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no);
381 up(&base_ni->extent_lock);
382 atomic_dec(&base_ni->count);
383 return ERR_PTR(-ENOMEM);
385 ni->vol = base_ni->vol;
388 ni->ext.base_ntfs_ino = base_ni;
389 /* Now map the record. */
390 m = map_mft_record(ni);
391 if (unlikely(IS_ERR(m))) {
392 up(&base_ni->extent_lock);
393 atomic_dec(&base_ni->count);
394 ntfs_clear_extent_inode(ni);
397 /* Verify the sequence number. */
398 if (unlikely(le16_to_cpu(m->sequence_number) != seq_no)) {
399 ntfs_error(base_ni->vol->sb, "Found stale extent mft "
400 "reference! Corrupt file system. Run chkdsk.");
405 /* Attach extent inode to base inode, reallocating memory if needed. */
406 if (!(base_ni->nr_extents & 3)) {
/* Extent array grows in chunks of four pointers. */
408 int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *);
410 tmp = (ntfs_inode **)kmalloc(new_size, GFP_NOFS);
411 if (unlikely(!tmp)) {
412 ntfs_error(base_ni->vol->sb, "Failed to allocate "
415 m = ERR_PTR(-ENOMEM);
418 if (base_ni->ext.extent_ntfs_inos) {
419 memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size -
420 4 * sizeof(ntfs_inode *));
421 kfree(base_ni->ext.extent_ntfs_inos);
423 base_ni->ext.extent_ntfs_inos = tmp;
425 base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni;
426 up(&base_ni->extent_lock);
427 atomic_dec(&base_ni->count);
428 ntfs_debug("Done 2.");
/* Shared error-path tail: unmap and drop the locks/references. */
432 unmap_mft_record(ni);
433 up(&base_ni->extent_lock);
434 atomic_dec(&base_ni->count);
436 * If the extent inode was not attached to the base inode we need to
437 * release it or we will leak memory.
440 ntfs_clear_extent_inode(ni);
447 * __mark_mft_record_dirty - set the mft record and the page containing it dirty
448 * @ni: ntfs inode describing the mapped mft record
450 * Internal function. Users should call mark_mft_record_dirty() instead.
452 * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni,
453 * as well as the page containing the mft record, dirty. Also, mark the base
454 * vfs inode dirty. This ensures that any changes to the mft record are
455 * written out to disk.
457 * NOTE: We only set I_DIRTY_SYNC and I_DIRTY_DATASYNC (and not I_DIRTY_PAGES)
458 * on the base vfs inode, because even though file data may have been modified,
459 * it is dirty in the inode meta data rather than the data page cache of the
460 * inode, and thus there are no data pages that need writing out. Therefore, a
461 * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the
462 * other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to
463 * ensure ->write_inode is called from generic_osync_inode() and this needs to
464 * happen or the file data would not necessarily hit the device synchronously,
465 * even though the vfs inode has the O_SYNC flag set. Also, I_DIRTY_DATASYNC
466 * simply "feels" better than just I_DIRTY_SYNC, since the file data has not
467 * actually hit the block device yet, which is not what I_DIRTY_SYNC on its own
470 void __mark_mft_record_dirty(ntfs_inode *ni)
/*
 * NOTE(review): partial extract -- the declaration of base_ni, the BUG_ON
 * for a NULL page, and the branch assigning base_ni = ni for a base inode
 * (nr_extents >= 0) are missing from view.
 */
472 struct page *page = ni->page;
475 ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
477 BUG_ON(NInoAttr(ni));
480 * Set the page containing the mft record dirty. This also marks the
481 * $MFT inode dirty (I_DIRTY_PAGES).
483 __set_page_dirty_nobuffers(page);
485 /* Determine the base vfs inode and mark it dirty, too. */
486 down(&ni->extent_lock);
487 if (likely(ni->nr_extents >= 0))
/* Extent inode: follow the back-pointer to its base ntfs inode. */
490 base_ni = ni->ext.base_ntfs_ino;
491 up(&ni->extent_lock);
492 __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_SYNC | I_DIRTY_DATASYNC);
/* Shared suffix appended to the "not implemented yet" diagnostics below. */
495 static const char *ntfs_please_email = "Please email "
496 "linux-ntfs-dev@lists.sourceforge.net and say that you saw "
497 "this message. Thank you.";
500 * sync_mft_mirror_umount - synchronise an mft record to the mft mirror
501 * @ni: ntfs inode whose mft record to synchronize
502 * @m: mapped, mst protected (extent) mft record to synchronize
504 * Write the mapped, mst protected (extent) mft record @m described by the
505 * (regular or extent) ntfs inode @ni to the mft mirror ($MFTMirr) bypassing
506 * the page cache and the $MFTMirr inode itself.
508 * This function is only for use at umount time when the mft mirror inode has
509 * already been disposed off. We BUG() if we are called while the mft mirror
510 * inode is still attached to the volume.
512 * On success return 0. On error return -errno.
514 * NOTE: This function is not implemented yet as I am not convinced it can
515 * actually be triggered considering the sequence of commits we do in super.c::
516 * ntfs_put_super(). But just in case we provide this place holder as the
517 * alternative would be either to BUG() or to get a NULL pointer dereference
520 static int sync_mft_mirror_umount(ntfs_inode *ni, MFT_RECORD *m)
/* NOTE(review): partial extract -- the error return statement is missing
 * from view; the visible body only logs that this path is unimplemented. */
522 ntfs_volume *vol = ni->vol;
524 BUG_ON(vol->mftmirr_ino);
525 ntfs_error(vol->sb, "Umount time mft mirror syncing is not "
526 "implemented yet. %s", ntfs_please_email);
531 * sync_mft_mirror - synchronize an mft record to the mft mirror
532 * @ni: ntfs inode whose mft record to synchronize
533 * @m: mapped, mst protected (extent) mft record to synchronize
534 * @sync: if true, wait for i/o completion
536 * Write the mapped, mst protected (extent) mft record @m described by the
537 * (regular or extent) ntfs inode @ni to the mft mirror ($MFTMirr).
539 * On success return 0. On error return -errno and set the volume errors flag
540 * in the ntfs_volume to which @ni belongs.
542 * NOTE: We always perform synchronous i/o and ignore the @sync parameter.
544 * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just
545 * schedule i/o via ->writepage or do it via kntfsd or whatever.
547 static int sync_mft_mirror(ntfs_inode *ni, MFT_RECORD *m, int sync)
/*
 * NOTE(review): partial extract -- the declarations of 'page' and 'kmirr',
 * the buffer-collection statement filling bhs[], wait_on_buffer() in the
 * completion loop, the page lock/unlock pair, several goto labels and the
 * final returns are all missing from view.
 */
549 ntfs_volume *vol = ni->vol;
551 unsigned int blocksize = vol->sb->s_blocksize;
552 int max_bhs = vol->mft_record_size / blocksize;
553 struct buffer_head *bhs[max_bhs];
554 struct buffer_head *bh, *head;
556 unsigned int block_start, block_end, m_start, m_end;
557 int i_bhs, nr_bhs, err = 0;
559 ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
561 if (unlikely(!vol->mftmirr_ino)) {
562 /* This could happen during umount... */
563 err = sync_mft_mirror_umount(ni, m);
568 /* Get the page containing the mirror copy of the mft record @m. */
569 page = ntfs_map_page(vol->mftmirr_ino->i_mapping, ni->mft_no >>
570 (PAGE_CACHE_SHIFT - vol->mft_record_size_bits));
571 if (unlikely(IS_ERR(page))) {
572 ntfs_error(vol->sb, "Failed to map mft mirror page.");
577 * Exclusion against other writers. This should never be a problem
578 * since the page in which the mft record @m resides is also locked and
579 * hence any other writers would be held up there but it is better to
580 * make sure no one is writing from elsewhere.
583 /* The address in the page of the mirror copy of the mft record @m. */
584 kmirr = page_address(page) + ((ni->mft_no << vol->mft_record_size_bits)
586 /* Copy the mst protected mft record to the mirror. */
587 memcpy(kmirr, m, vol->mft_record_size);
588 /* Make sure we have mapped buffers. */
589 if (!page_has_buffers(page)) {
591 ntfs_error(vol->sb, "Writing mft mirror records without "
592 "existing buffers is not implemented yet. %s",
597 bh = head = page_buffers(page);
599 goto no_buffers_err_out;
602 m_start = kmirr - (u8*)page_address(page);
603 m_end = m_start + vol->mft_record_size;
605 block_end = block_start + blocksize;
607 * If the buffer is outside the mft record, just skip it,
608 * clearing it if it is dirty to make sure it is not written
609 * out. It should never be marked dirty but better be safe.
611 if ((block_end <= m_start) || (block_start >= m_end)) {
612 if (buffer_dirty(bh)) {
613 ntfs_warning(vol->sb, "Clearing dirty mft "
614 "record page buffer. %s",
616 clear_buffer_dirty(bh);
620 if (!buffer_mapped(bh)) {
621 ntfs_error(vol->sb, "Writing mft mirror records "
622 "without existing mapped buffers is "
623 "not implemented yet. %s",
628 if (!buffer_uptodate(bh)) {
629 ntfs_error(vol->sb, "Writing mft mirror records "
630 "without existing uptodate buffers is "
631 "not implemented yet. %s",
636 BUG_ON(!nr_bhs && (m_start != block_start));
637 BUG_ON(nr_bhs >= max_bhs);
639 BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
640 } while (block_start = block_end, (bh = bh->b_this_page) != head);
642 /* Lock buffers and start synchronous write i/o on them. */
643 for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
644 struct buffer_head *tbh = bhs[i_bhs];
646 if (unlikely(test_set_buffer_locked(tbh)))
648 BUG_ON(!buffer_uptodate(tbh));
649 if (buffer_dirty(tbh))
650 clear_buffer_dirty(tbh);
652 tbh->b_end_io = end_buffer_write_sync;
653 submit_bh(WRITE, tbh);
655 /* Wait on i/o completion of buffers. */
656 for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
657 struct buffer_head *tbh = bhs[i_bhs];
660 if (unlikely(!buffer_uptodate(tbh))) {
663 * Set the buffer uptodate so the page & buffer
664 * states don't become out of sync.
666 if (PageUptodate(page))
667 set_buffer_uptodate(tbh);
670 } else /* if (unlikely(err)) */ {
671 /* Clean the buffers. */
672 for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
673 clear_buffer_dirty(bhs[i_bhs]);
676 /* Current state: all buffers are clean, unlocked, and uptodate. */
677 /* Remove the mst protection fixups again. */
678 post_write_mst_fixup((NTFS_RECORD*)kmirr);
679 flush_dcache_page(page);
681 ntfs_unmap_page(page);
683 /* I/O error during writing. This is really bad! */
684 ntfs_error(vol->sb, "I/O error while writing mft mirror "
685 "record 0x%lx! You should unmount the volume "
686 "and run chkdsk or ntfsfix.", ni->mft_no);
692 ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error code %i). "
693 "Volume will be left marked dirty on umount. Run "
694 "ntfsfix on the partition after umounting to correct "
696 /* We don't want to clear the dirty bit on umount. */
702 * write_mft_record_nolock - write out a mapped (extent) mft record
703 * @ni: ntfs inode describing the mapped (extent) mft record
704 * @m: mapped (extent) mft record to write
705 * @sync: if true, wait for i/o completion
707 * Write the mapped (extent) mft record @m described by the (regular or extent)
708 * ntfs inode @ni to backing store. If the mft record @m has a counterpart in
709 * the mft mirror, that is also updated.
711 * On success, clean the mft record and return 0. On error, leave the mft
712 * record dirty and return -errno. The caller should call make_bad_inode() on
713 * the base inode to ensure no more access happens to this inode. We do not do
714 * it here as the caller may want to finish writing other extent mft records
715 * first to minimize on-disk metadata inconsistencies.
717 * NOTE: We always perform synchronous i/o and ignore the @sync parameter.
718 * However, if the mft record has a counterpart in the mft mirror and @sync is
719 * true, we write the mft record, wait for i/o completion, and only then write
720 * the mft mirror copy. This ensures that if the system crashes either the mft
721 * or the mft mirror will contain a self-consistent mft record @m. If @sync is
722 * false on the other hand, we start i/o on both and then wait for completion
723 * on them. This provides a speedup but no longer guarantees that you will end
724 * up with a self-consistent mft record in the case of a crash but if you asked
725 * for asynchronous writing you probably do not care about that anyway.
727 * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just
728 * schedule i/o via ->writepage or do it via kntfsd or whatever.
730 int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
/*
 * NOTE(review): partial extract -- the buffer-collection statement filling
 * bhs[], wait_on_buffer() in the completion loop, the goto labels, the
 * success return and the final error return are missing from view.
 */
732 ntfs_volume *vol = ni->vol;
733 struct page *page = ni->page;
734 unsigned int blocksize = vol->sb->s_blocksize;
735 int max_bhs = vol->mft_record_size / blocksize;
736 struct buffer_head *bhs[max_bhs];
737 struct buffer_head *bh, *head;
738 unsigned int block_start, block_end, m_start, m_end;
739 int i_bhs, nr_bhs, err = 0;
741 ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
742 BUG_ON(NInoAttr(ni));
745 BUG_ON(!PageLocked(page));
747 * If the ntfs_inode is clean no need to do anything. If it is dirty,
748 * mark it as clean now so that it can be redirtied later on if needed.
749 * There is no danger of races as as long as the caller is holding the
750 * locks for the mft record @m and the page it is in.
752 if (!NInoTestClearDirty(ni))
754 /* Make sure we have mapped buffers. */
755 if (!page_has_buffers(page)) {
757 ntfs_error(vol->sb, "Writing mft records without existing "
758 "buffers is not implemented yet. %s",
763 bh = head = page_buffers(page);
765 goto no_buffers_err_out;
768 m_start = ni->page_ofs;
769 m_end = m_start + vol->mft_record_size;
771 block_end = block_start + blocksize;
773 * If the buffer is outside the mft record, just skip it,
774 * clearing it if it is dirty to make sure it is not written
775 * out. It should never be marked dirty but better be safe.
777 if ((block_end <= m_start) || (block_start >= m_end)) {
778 if (buffer_dirty(bh)) {
779 ntfs_warning(vol->sb, "Clearing dirty mft "
780 "record page buffer. %s",
782 clear_buffer_dirty(bh);
786 if (!buffer_mapped(bh)) {
787 ntfs_error(vol->sb, "Writing mft records without "
788 "existing mapped buffers is not "
789 "implemented yet. %s",
794 if (!buffer_uptodate(bh)) {
795 ntfs_error(vol->sb, "Writing mft records without "
796 "existing uptodate buffers is not "
797 "implemented yet. %s",
802 BUG_ON(!nr_bhs && (m_start != block_start));
803 BUG_ON(nr_bhs >= max_bhs);
805 BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
806 } while (block_start = block_end, (bh = bh->b_this_page) != head);
809 /* Apply the mst protection fixups. */
810 err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size);
812 ntfs_error(vol->sb, "Failed to apply mst fixups!");
815 flush_dcache_mft_record_page(ni);
816 /* Lock buffers and start synchronous write i/o on them. */
817 for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
818 struct buffer_head *tbh = bhs[i_bhs];
820 if (unlikely(test_set_buffer_locked(tbh)))
822 BUG_ON(!buffer_uptodate(tbh));
823 if (buffer_dirty(tbh))
824 clear_buffer_dirty(tbh);
826 tbh->b_end_io = end_buffer_write_sync;
827 submit_bh(WRITE, tbh);
829 /* Synchronize the mft mirror now if not @sync. */
830 if (!sync && ni->mft_no < vol->mftmirr_size)
831 sync_mft_mirror(ni, m, sync);
832 /* Wait on i/o completion of buffers. */
833 for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
834 struct buffer_head *tbh = bhs[i_bhs];
837 if (unlikely(!buffer_uptodate(tbh))) {
840 * Set the buffer uptodate so the page & buffer states
841 * don't become out of sync.
843 if (PageUptodate(page))
844 set_buffer_uptodate(tbh);
847 /* If @sync, now synchronize the mft mirror. */
848 if (sync && ni->mft_no < vol->mftmirr_size)
849 sync_mft_mirror(ni, m, sync);
850 /* Remove the mst protection fixups again. */
851 post_write_mst_fixup((NTFS_RECORD*)m);
852 flush_dcache_mft_record_page(ni);
854 /* I/O error during writing. This is really bad! */
855 ntfs_error(vol->sb, "I/O error while writing mft record "
856 "0x%lx! Marking base inode as bad. You "
857 "should unmount the volume and run chkdsk.",
865 /* Clean the buffers. */
866 for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
867 clear_buffer_dirty(bhs[i_bhs]);
870 * Current state: all buffers are clean, unlocked, and uptodate.
871 * The caller should mark the base inode as bad so that no more i/o
872 * happens. ->clear_inode() will still be invoked so all extent inodes
873 * and other allocated memory will be freed.
875 if (err == -ENOMEM) {
876 ntfs_error(vol->sb, "Not enough memory to write mft record. "
877 "Redirtying so the write is retried later.");
878 mark_mft_record_dirty(ni);
885 * ntfs_mft_writepage - check if a metadata page contains dirty mft records
886 * @page: metadata page possibly containing dirty mft records
887 * @wbc: writeback control structure
889 * This is called from the VM when it wants to have a dirty $MFT/$DATA metadata
890 * page cache page cleaned. The VM has already locked the page and marked it
891 * clean. Instead of writing the page as a conventional ->writepage function
892 * would do, we check if the page still contains any dirty mft records (it must
893 * have done at some point in the past since the page was marked dirty) and if
894 * none are found, i.e. all mft records are clean, we unlock the page and
895 * return. The VM is then free to do with the page as it pleases. If on the
896 * other hand we do find any dirty mft records in the page, we redirty the page
897 * before unlocking it and returning so the VM knows that the page is still
898 * busy and cannot be thrown out.
900 * Note, we do not actually write any dirty mft records here because they are
901 * dirty inodes and hence will be written by the VFS inode dirty code paths.
902 * There is no need to write them from the VM page dirty code paths, too and in
903 * fact once we implement journalling it would be a complete nightmare having
904 * two code paths leading to mft record writeout.
906 static int ntfs_mft_writepage(struct page *page, struct writeback_control *wbc)
/*
 * NOTE(review): partial extract -- the declarations of 'maddr', 'm', 'vi',
 * 'na', 'nr', 'i' and 'j', the NTFS_I()/igrab()/iput() calls, the kunmap(),
 * the 'continue' statements at the end of each skip branch, the unlock of
 * the page and the return statement are all missing from view.
 */
908 struct inode *mft_vi = page->mapping->host;
909 struct super_block *sb = mft_vi->i_sb;
910 ntfs_volume *vol = NTFS_SB(sb);
913 ntfs_inode **extent_nis;
914 unsigned long mft_no;
916 BOOL is_dirty = FALSE;
918 BUG_ON(mft_vi != vol->mft_ino);
919 /* The first mft record number in the page. */
920 mft_no = page->index << (PAGE_CACHE_SHIFT - vol->mft_record_size_bits);
921 /* Number of mft records in the page. */
922 nr = PAGE_CACHE_SIZE >> vol->mft_record_size_bits;
924 ntfs_debug("Entering for %i inodes starting at 0x%lx.", nr, mft_no);
925 /* Iterate over the mft records in the page looking for a dirty one. */
926 maddr = (u8*)kmap(page);
927 for (i = 0; i < nr; ++i, ++mft_no, maddr += vol->mft_record_size) {
929 ntfs_inode *ni, *eni;
937 * Check if the inode corresponding to this mft record is in
938 * the VFS inode cache and obtain a reference to it if it is.
940 ntfs_debug("Looking for inode 0x%lx in icache.", mft_no);
942 * For inode 0, i.e. $MFT itself, we cannot use ilookup5() from
943 * here or we deadlock because the inode is already locked by
944 * the kernel (fs/fs-writeback.c::__sync_single_inode()) and
945 * ilookup5() waits until the inode is unlocked before
946 * returning it and it never gets unlocked because
947 * ntfs_mft_writepage() never returns. )-: Fortunately, we
948 * have inode 0 pinned in icache for the duration of the mount
949 * so we can access it directly.
952 /* Balance the below iput(). */
954 BUG_ON(vi != mft_vi);
956 vi = ilookup5(sb, mft_no, (test_t)ntfs_test_inode, &na);
958 ntfs_debug("Inode 0x%lx is in icache.", mft_no);
959 /* The inode is in icache. Check if it is dirty. */
961 if (!NInoDirty(ni)) {
962 /* The inode is not dirty, skip this record. */
963 ntfs_debug("Inode 0x%lx is not dirty, "
964 "continuing search.", mft_no);
968 ntfs_debug("Inode 0x%lx is dirty, aborting search.",
970 /* The inode is dirty, no need to search further. */
975 ntfs_debug("Inode 0x%lx is not in icache.", mft_no);
976 /* The inode is not in icache. */
977 /* Skip the record if it is not a mft record (type "FILE"). */
978 if (!ntfs_is_mft_recordp(maddr)) {
979 ntfs_debug("Mft record 0x%lx is not a FILE record, "
980 "continuing search.", mft_no);
983 m = (MFT_RECORD*)maddr;
985 * Skip the mft record if it is not in use. FIXME: What about
986 * deleted/deallocated (extent) inodes? (AIA)
988 if (!(m->flags & MFT_RECORD_IN_USE)) {
989 ntfs_debug("Mft record 0x%lx is not in use, "
990 "continuing search.", mft_no);
993 /* Skip the mft record if it is a base inode. */
994 if (!m->base_mft_record) {
995 ntfs_debug("Mft record 0x%lx is a base record, "
996 "continuing search.", mft_no);
1000 * This is an extent mft record. Check if the inode
1001 * corresponding to its base mft record is in icache.
1003 na.mft_no = MREF_LE(m->base_mft_record);
1004 ntfs_debug("Mft record 0x%lx is an extent record. Looking "
1005 "for base inode 0x%lx in icache.", mft_no,
1007 vi = ilookup5(sb, na.mft_no, (test_t)ntfs_test_inode,
1011 * The base inode is not in icache. Skip this extent
1014 ntfs_debug("Base inode 0x%lx is not in icache, "
1015 "continuing search.", na.mft_no);
1018 ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no);
1020 * The base inode is in icache. Check if it has the extent
1021 * inode corresponding to this extent mft record attached.
1024 down(&ni->extent_lock);
1025 if (ni->nr_extents <= 0) {
1027 * The base inode has no attached extent inodes. Skip
1028 * this extent mft record.
1030 up(&ni->extent_lock);
1034 /* Iterate over the attached extent inodes. */
1035 extent_nis = ni->ext.extent_ntfs_inos;
1036 for (eni = NULL, j = 0; j < ni->nr_extents; ++j) {
1037 if (mft_no == extent_nis[j]->mft_no) {
1039 * Found the extent inode corresponding to this
1040 * extent mft record.
1042 eni = extent_nis[j];
1047 * If the extent inode was not attached to the base inode, skip
1048 * this extent mft record.
1051 up(&ni->extent_lock);
1056 * Found the extent inode corrsponding to this extent mft
1057 * record. If it is dirty, no need to search further.
1059 if (NInoDirty(eni)) {
1060 up(&ni->extent_lock);
1065 /* The extent inode is not dirty, so do the next record. */
1066 up(&ni->extent_lock);
1070 /* If a dirty mft record was found, redirty the page. */
1072 ntfs_debug("Inode 0x%lx is dirty. Redirtying the page "
1073 "starting at inode 0x%lx.", mft_no,
1074 page->index << (PAGE_CACHE_SHIFT -
1075 vol->mft_record_size_bits));
1076 redirty_page_for_writepage(wbc, page);
1080 * Keep the VM happy. This must be done otherwise the
1081 * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though
1082 * the page is clean.
1084 BUG_ON(PageWriteback(page));
1085 set_page_writeback(page);
1087 end_page_writeback(page);
1089 ntfs_debug("Done.");
1093 #endif /* NTFS_RW */