vserver 1.9.5.x5
[linux-2.6.git] / fs / ntfs / dir.c
index 1f0de90..9357756 100644 (file)
  */
 
 #include <linux/smp_lock.h>
-#include "ntfs.h"
+#include <linux/buffer_head.h>
+
 #include "dir.h"
+#include "aops.h"
+#include "attrib.h"
+#include "mft.h"
+#include "debug.h"
+#include "ntfs.h"
 
 /**
  * The little endian Unicode string $I30 as a global constant.
  */
-uchar_t I30[5] = { const_cpu_to_le16('$'), const_cpu_to_le16('I'),
-               const_cpu_to_le16('3'), const_cpu_to_le16('0'),
-               const_cpu_to_le16(0) };
+ntfschar I30[5] = { const_cpu_to_le16('$'), const_cpu_to_le16('I'),
+               const_cpu_to_le16('3'), const_cpu_to_le16('0'), 0 };
 
 /**
  * ntfs_lookup_inode_by_name - find an inode in a directory given its name
@@ -63,8 +68,16 @@ uchar_t I30[5] = { const_cpu_to_le16('$'), const_cpu_to_le16('I'),
  * This is to avoid polluting the dcache with short file names. We want them to
  * work but we don't care for how quickly one can access them. This also fixes
  * the dcache aliasing issues.
+ *
+ * Locking:  - Caller must hold i_sem on the directory.
+ *          - Each page cache page in the index allocation mapping must be
+ *            locked whilst being accessed.  Otherwise we may find a corrupt
+ *            page because it is under ->writepage, which applies the mst
+ *            protection fixups before writing the page out, removes them
+ *            again after the write is complete, and only then unlocks
+ *            the page.
  */
-MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const uchar_t *uname,
+MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname,
                const int uname_len, ntfs_name **res)
 {
        ntfs_volume *vol = dir_ni->vol;
@@ -75,7 +88,7 @@ MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const uchar_t *uname,
        INDEX_ALLOCATION *ia;
        u8 *index_end;
        u64 mref;
-       attr_search_context *ctx;
+       ntfs_attr_search_ctx *ctx;
        int err, rc;
        VCN vcn, old_vcn;
        struct address_space *ia_mapping;
@@ -83,24 +96,30 @@ MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const uchar_t *uname,
        u8 *kaddr;
        ntfs_name *name = NULL;
 
+       BUG_ON(!S_ISDIR(VFS_I(dir_ni)->i_mode));
+       BUG_ON(NInoAttr(dir_ni));
        /* Get hold of the mft record for the directory. */
        m = map_mft_record(dir_ni);
-       if (unlikely(IS_ERR(m))) {
+       if (IS_ERR(m)) {
                ntfs_error(sb, "map_mft_record() failed with error code %ld.",
                                -PTR_ERR(m));
                return ERR_MREF(PTR_ERR(m));
        }
-       ctx = get_attr_search_ctx(dir_ni, m);
+       ctx = ntfs_attr_get_search_ctx(dir_ni, m);
        if (unlikely(!ctx)) {
                err = -ENOMEM;
                goto err_out;
        }
        /* Find the index root attribute in the mft record. */
-       if (!lookup_attr(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, 0,
-                       ctx)) {
-               ntfs_error(sb, "Index root attribute missing in directory "
-                               "inode 0x%lx.", dir_ni->mft_no);
-               err = -EIO;
+       err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
+                       0, ctx);
+       if (unlikely(err)) {
+               if (err == -ENOENT) {
+                       ntfs_error(sb, "Index root attribute missing in "
+                                       "directory inode 0x%lx.",
+                                       dir_ni->mft_no);
+                       err = -EIO;
+               }
                goto err_out;
        }
        /* Get to the index root value (it's been verified in read_inode). */
@@ -135,7 +154,7 @@ MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const uchar_t *uname,
                 * returning.
                 */
                if (ntfs_are_names_equal(uname, uname_len,
-                               (uchar_t*)&ie->key.file_name.file_name,
+                               (ntfschar*)&ie->key.file_name.file_name,
                                ie->key.file_name.file_name_length,
                                CASE_SENSITIVE, vol->upcase, vol->upcase_len)) {
 found_it:
@@ -169,7 +188,7 @@ found_it:
                                *res = NULL;
                        }
                        mref = le64_to_cpu(ie->data.dir.indexed_file);
-                       put_attr_search_ctx(ctx);
+                       ntfs_attr_put_search_ctx(ctx);
                        unmap_mft_record(dir_ni);
                        return mref;
                }
@@ -186,7 +205,7 @@ found_it:
                if (!NVolCaseSensitive(vol) &&
                                ie->key.file_name.file_name_type &&
                                ntfs_are_names_equal(uname, uname_len,
-                               (uchar_t*)&ie->key.file_name.file_name,
+                               (ntfschar*)&ie->key.file_name.file_name,
                                ie->key.file_name.file_name_length,
                                IGNORE_CASE, vol->upcase, vol->upcase_len)) {
                        int name_size = sizeof(ntfs_name);
@@ -206,7 +225,7 @@ found_it:
                        }
 
                        if (type != FILE_NAME_DOS)
-                               name_size += len * sizeof(uchar_t);
+                               name_size += len * sizeof(ntfschar);
                        name = kmalloc(name_size, GFP_NOFS);
                        if (!name) {
                                err = -ENOMEM;
@@ -217,7 +236,7 @@ found_it:
                        if (type != FILE_NAME_DOS) {
                                name->len = len;
                                memcpy(name->name, ie->key.file_name.file_name,
-                                               len * sizeof(uchar_t));
+                                               len * sizeof(ntfschar));
                        } else
                                name->len = 0;
                        *res = name;
@@ -227,7 +246,7 @@ found_it:
                 * know which way in the B+tree we have to go.
                 */
                rc = ntfs_collate_names(uname, uname_len,
-                               (uchar_t*)&ie->key.file_name.file_name,
+                               (ntfschar*)&ie->key.file_name.file_name,
                                ie->key.file_name.file_name_length, 1,
                                IGNORE_CASE, vol->upcase, vol->upcase_len);
                /*
@@ -246,7 +265,7 @@ found_it:
                 * collation.
                 */
                rc = ntfs_collate_names(uname, uname_len,
-                               (uchar_t*)&ie->key.file_name.file_name,
+                               (ntfschar*)&ie->key.file_name.file_name,
                                ie->key.file_name.file_name_length, 1,
                                CASE_SENSITIVE, vol->upcase, vol->upcase_len);
                if (rc == -1)
@@ -268,7 +287,7 @@ found_it:
         */
        if (!(ie->flags & INDEX_ENTRY_NODE)) {
                if (name) {
-                       put_attr_search_ctx(ctx);
+                       ntfs_attr_put_search_ctx(ctx);
                        unmap_mft_record(dir_ni);
                        return name->mref;
                }
@@ -281,17 +300,16 @@ found_it:
                ntfs_error(sb, "No index allocation attribute but index entry "
                                "requires one. Directory inode 0x%lx is "
                                "corrupt or driver bug.", dir_ni->mft_no);
-               err = -EIO;
                goto err_out;
        }
        /* Get the starting vcn of the index_block holding the child node. */
-       vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8);
+       vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
        ia_mapping = VFS_I(dir_ni)->i_mapping;
        /*
         * We are done with the index root and the mft record. Release them,
         * otherwise we deadlock with ntfs_map_page().
         */
-       put_attr_search_ctx(ctx);
+       ntfs_attr_put_search_ctx(ctx);
        unmap_mft_record(dir_ni);
        m = NULL;
        ctx = NULL;
@@ -309,6 +327,7 @@ descend_into_child_node:
                err = PTR_ERR(page);
                goto err_out;
        }
+       lock_page(page);
        kaddr = (u8*)page_address(page);
 fast_descend_into_child_node:
        /* Get to the index allocation block. */
@@ -318,7 +337,13 @@ fast_descend_into_child_node:
        if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) {
                ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
                                "inode 0x%lx or driver bug.", dir_ni->mft_no);
-               err = -EIO;
+               goto unm_err_out;
+       }
+       /* Catch multi sector transfer fixup errors. */
+       if (unlikely(!ntfs_is_indx_record(ia->magic))) {
+               ntfs_error(sb, "Directory index record with vcn 0x%llx is "
+                               "corrupt.  Corrupt inode 0x%lx.  Run chkdsk.",
+                               (unsigned long long)vcn, dir_ni->mft_no);
                goto unm_err_out;
        }
        if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
@@ -328,7 +353,6 @@ fast_descend_into_child_node:
                                "bug.", (unsigned long long)
                                sle64_to_cpu(ia->index_block_vcn),
                                (unsigned long long)vcn, dir_ni->mft_no);
-               err = -EIO;
                goto unm_err_out;
        }
        if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
@@ -340,7 +364,6 @@ fast_descend_into_child_node:
                                (unsigned long long)vcn, dir_ni->mft_no,
                                le32_to_cpu(ia->index.allocated_size) + 0x18,
                                dir_ni->itype.index.block_size);
-               err = -EIO;
                goto unm_err_out;
        }
        index_end = (u8*)ia + dir_ni->itype.index.block_size;
@@ -350,7 +373,6 @@ fast_descend_into_child_node:
                                "Cannot access! This is probably a bug in the "
                                "driver.", (unsigned long long)vcn,
                                dir_ni->mft_no);
-               err = -EIO;
                goto unm_err_out;
        }
        index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
@@ -358,7 +380,6 @@ fast_descend_into_child_node:
                ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
                                "inode 0x%lx exceeds maximum size.",
                                (unsigned long long)vcn, dir_ni->mft_no);
-               err = -EIO;
                goto unm_err_out;
        }
        /* The first index entry. */
@@ -378,7 +399,6 @@ fast_descend_into_child_node:
                        ntfs_error(sb, "Index entry out of bounds in "
                                        "directory inode 0x%lx.",
                                        dir_ni->mft_no);
-                       err = -EIO;
                        goto unm_err_out;
                }
                /*
@@ -395,7 +415,7 @@ fast_descend_into_child_node:
                 * returning.
                 */
                if (ntfs_are_names_equal(uname, uname_len,
-                               (uchar_t*)&ie->key.file_name.file_name,
+                               (ntfschar*)&ie->key.file_name.file_name,
                                ie->key.file_name.file_name_length,
                                CASE_SENSITIVE, vol->upcase, vol->upcase_len)) {
 found_it2:
@@ -429,6 +449,7 @@ found_it2:
                                *res = NULL;
                        }
                        mref = le64_to_cpu(ie->data.dir.indexed_file);
+                       unlock_page(page);
                        ntfs_unmap_page(page);
                        return mref;
                }
@@ -445,7 +466,7 @@ found_it2:
                if (!NVolCaseSensitive(vol) &&
                                ie->key.file_name.file_name_type &&
                                ntfs_are_names_equal(uname, uname_len,
-                               (uchar_t*)&ie->key.file_name.file_name,
+                               (ntfschar*)&ie->key.file_name.file_name,
                                ie->key.file_name.file_name_length,
                                IGNORE_CASE, vol->upcase, vol->upcase_len)) {
                        int name_size = sizeof(ntfs_name);
@@ -461,12 +482,13 @@ found_it2:
                                                "this message to "
                                                "linux-ntfs-dev@lists."
                                                "sourceforge.net.");
+                               unlock_page(page);
                                ntfs_unmap_page(page);
                                goto dir_err_out;
                        }
 
                        if (type != FILE_NAME_DOS)
-                               name_size += len * sizeof(uchar_t);
+                               name_size += len * sizeof(ntfschar);
                        name = kmalloc(name_size, GFP_NOFS);
                        if (!name) {
                                err = -ENOMEM;
@@ -477,7 +499,7 @@ found_it2:
                        if (type != FILE_NAME_DOS) {
                                name->len = len;
                                memcpy(name->name, ie->key.file_name.file_name,
-                                               len * sizeof(uchar_t));
+                                               len * sizeof(ntfschar));
                        } else
                                name->len = 0;
                        *res = name;
@@ -487,7 +509,7 @@ found_it2:
                 * know which way in the B+tree we have to go.
                 */
                rc = ntfs_collate_names(uname, uname_len,
-                               (uchar_t*)&ie->key.file_name.file_name,
+                               (ntfschar*)&ie->key.file_name.file_name,
                                ie->key.file_name.file_name_length, 1,
                                IGNORE_CASE, vol->upcase, vol->upcase_len);
                /*
@@ -506,7 +528,7 @@ found_it2:
                 * collation.
                 */
                rc = ntfs_collate_names(uname, uname_len,
-                               (uchar_t*)&ie->key.file_name.file_name,
+                               (ntfschar*)&ie->key.file_name.file_name,
                                ie->key.file_name.file_name_length, 1,
                                CASE_SENSITIVE, vol->upcase, vol->upcase_len);
                if (rc == -1)
@@ -529,12 +551,12 @@ found_it2:
                        ntfs_error(sb, "Index entry with child node found in "
                                        "a leaf node in directory inode 0x%lx.",
                                        dir_ni->mft_no);
-                       err = -EIO;
                        goto unm_err_out;
                }
                /* Child node present, descend into it. */
                old_vcn = vcn;
-               vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8);
+               vcn = sle64_to_cpup((sle64*)((u8*)ie +
+                               le16_to_cpu(ie->length) - 8));
                if (vcn >= 0) {
                        /* If vcn is in the same page cache page as old_vcn we
                         * recycle the mapped page. */
@@ -543,12 +565,12 @@ found_it2:
                                        vol->cluster_size_bits >>
                                        PAGE_CACHE_SHIFT)
                                goto fast_descend_into_child_node;
+                       unlock_page(page);
                        ntfs_unmap_page(page);
                        goto descend_into_child_node;
                }
                ntfs_error(sb, "Negative child node vcn in directory inode "
                                "0x%lx.", dir_ni->mft_no);
-               err = -EIO;
                goto unm_err_out;
        }
        /*
@@ -557,16 +579,20 @@ found_it2:
         * associated with it.
         */
        if (name) {
+               unlock_page(page);
                ntfs_unmap_page(page);
                return name->mref;
        }
        ntfs_debug("Entry not found.");
        err = -ENOENT;
 unm_err_out:
+       unlock_page(page);
        ntfs_unmap_page(page);
 err_out:
+       if (!err)
+               err = -EIO;
        if (ctx)
-               put_attr_search_ctx(ctx);
+               ntfs_attr_put_search_ctx(ctx);
        if (m)
                unmap_mft_record(dir_ni);
        if (name) {
@@ -575,8 +601,7 @@ err_out:
        }
        return ERR_MREF(err);
 dir_err_out:
-       ntfs_error(sb, "Corrupt directory. Aborting lookup.");
-       err = -EIO;
+       ntfs_error(sb, "Corrupt directory.  Aborting lookup.");
        goto err_out;
 }
 
@@ -607,7 +632,7 @@ dir_err_out:
  *
  * Note, @uname_len does not include the (optional) terminating NULL character.
  */
-u64 ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const uchar_t *uname,
+u64 ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname,
                const int uname_len)
 {
        ntfs_volume *vol = dir_ni->vol;
@@ -618,7 +643,7 @@ u64 ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const uchar_t *uname,
        INDEX_ALLOCATION *ia;
        u8 *index_end;
        u64 mref;
-       attr_search_context *ctx;
+       ntfs_attr_search_ctx *ctx;
        int err, rc;
        IGNORE_CASE_BOOL ic;
        VCN vcn, old_vcn;
@@ -633,17 +658,21 @@ u64 ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const uchar_t *uname,
                                -PTR_ERR(m));
                return ERR_MREF(PTR_ERR(m));
        }
-       ctx = get_attr_search_ctx(dir_ni, m);
+       ctx = ntfs_attr_get_search_ctx(dir_ni, m);
        if (!ctx) {
                err = -ENOMEM;
                goto err_out;
        }
        /* Find the index root attribute in the mft record. */
-       if (!lookup_attr(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, 0,
-                       ctx)) {
-               ntfs_error(sb, "Index root attribute missing in directory "
-                               "inode 0x%lx.", dir_ni->mft_no);
-               err = -EIO;
+       err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
+                       0, ctx);
+       if (unlikely(err)) {
+               if (err == -ENOENT) {
+                       ntfs_error(sb, "Index root attribute missing in "
+                                       "directory inode 0x%lx.",
+                                       dir_ni->mft_no);
+                       err = -EIO;
+               }
                goto err_out;
        }
        /* Get to the index root value (it's been verified in read_inode). */
@@ -689,12 +718,12 @@ u64 ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const uchar_t *uname,
                 * convert it to cpu format before returning.
                 */
                if (ntfs_are_names_equal(uname, uname_len,
-                               (uchar_t*)&ie->key.file_name.file_name,
+                               (ntfschar*)&ie->key.file_name.file_name,
                                ie->key.file_name.file_name_length, ic,
                                vol->upcase, vol->upcase_len)) {
 found_it:
                        mref = le64_to_cpu(ie->data.dir.indexed_file);
-                       put_attr_search_ctx(ctx);
+                       ntfs_attr_put_search_ctx(ctx);
                        unmap_mft_record(dir_ni);
                        return mref;
                }
@@ -703,7 +732,7 @@ found_it:
                 * know which way in the B+tree we have to go.
                 */
                rc = ntfs_collate_names(uname, uname_len,
-                               (uchar_t*)&ie->key.file_name.file_name,
+                               (ntfschar*)&ie->key.file_name.file_name,
                                ie->key.file_name.file_name_length, 1,
                                IGNORE_CASE, vol->upcase, vol->upcase_len);
                /*
@@ -722,7 +751,7 @@ found_it:
                 * collation.
                 */
                rc = ntfs_collate_names(uname, uname_len,
-                               (uchar_t*)&ie->key.file_name.file_name,
+                               (ntfschar*)&ie->key.file_name.file_name,
                                ie->key.file_name.file_name_length, 1,
                                CASE_SENSITIVE, vol->upcase, vol->upcase_len);
                if (rc == -1)
@@ -750,7 +779,6 @@ found_it:
                ntfs_error(sb, "No index allocation attribute but index entry "
                                "requires one. Directory inode 0x%lx is "
                                "corrupt or driver bug.", dir_ni->mft_no);
-               err = -EIO;
                goto err_out;
        }
        /* Get the starting vcn of the index_block holding the child node. */
@@ -760,7 +788,7 @@ found_it:
         * We are done with the index root and the mft record. Release them,
         * otherwise we deadlock with ntfs_map_page().
         */
-       put_attr_search_ctx(ctx);
+       ntfs_attr_put_search_ctx(ctx);
        unmap_mft_record(dir_ni);
        m = NULL;
        ctx = NULL;
@@ -778,6 +806,7 @@ descend_into_child_node:
                err = PTR_ERR(page);
                goto err_out;
        }
+       lock_page(page);
        kaddr = (u8*)page_address(page);
 fast_descend_into_child_node:
        /* Get to the index allocation block. */
@@ -787,7 +816,13 @@ fast_descend_into_child_node:
        if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) {
                ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
                                "inode 0x%lx or driver bug.", dir_ni->mft_no);
-               err = -EIO;
+               goto unm_err_out;
+       }
+       /* Catch multi sector transfer fixup errors. */
+       if (unlikely(!ntfs_is_indx_record(ia->magic))) {
+               ntfs_error(sb, "Directory index record with vcn 0x%llx is "
+                               "corrupt.  Corrupt inode 0x%lx.  Run chkdsk.",
+                               (unsigned long long)vcn, dir_ni->mft_no);
                goto unm_err_out;
        }
        if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
@@ -797,7 +832,6 @@ fast_descend_into_child_node:
                                "bug.", (unsigned long long)
                                sle64_to_cpu(ia->index_block_vcn),
                                (unsigned long long)vcn, dir_ni->mft_no);
-               err = -EIO;
                goto unm_err_out;
        }
        if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
@@ -809,7 +843,6 @@ fast_descend_into_child_node:
                                (unsigned long long)vcn, dir_ni->mft_no,
                                le32_to_cpu(ia->index.allocated_size) + 0x18,
                                dir_ni->itype.index.block_size);
-               err = -EIO;
                goto unm_err_out;
        }
        index_end = (u8*)ia + dir_ni->itype.index.block_size;
@@ -819,7 +852,6 @@ fast_descend_into_child_node:
                                "Cannot access! This is probably a bug in the "
                                "driver.", (unsigned long long)vcn,
                                dir_ni->mft_no);
-               err = -EIO;
                goto unm_err_out;
        }
        index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
@@ -827,7 +859,6 @@ fast_descend_into_child_node:
                ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
                                "inode 0x%lx exceeds maximum size.",
                                (unsigned long long)vcn, dir_ni->mft_no);
-               err = -EIO;
                goto unm_err_out;
        }
        /* The first index entry. */
@@ -847,7 +878,6 @@ fast_descend_into_child_node:
                        ntfs_error(sb, "Index entry out of bounds in "
                                        "directory inode 0x%lx.",
                                        dir_ni->mft_no);
-                       err = -EIO;
                        goto unm_err_out;
                }
                /*
@@ -875,11 +905,12 @@ fast_descend_into_child_node:
                 * convert it to cpu format before returning.
                 */
                if (ntfs_are_names_equal(uname, uname_len,
-                               (uchar_t*)&ie->key.file_name.file_name,
+                               (ntfschar*)&ie->key.file_name.file_name,
                                ie->key.file_name.file_name_length, ic,
                                vol->upcase, vol->upcase_len)) {
 found_it2:
                        mref = le64_to_cpu(ie->data.dir.indexed_file);
+                       unlock_page(page);
                        ntfs_unmap_page(page);
                        return mref;
                }
@@ -888,7 +919,7 @@ found_it2:
                 * know which way in the B+tree we have to go.
                 */
                rc = ntfs_collate_names(uname, uname_len,
-                               (uchar_t*)&ie->key.file_name.file_name,
+                               (ntfschar*)&ie->key.file_name.file_name,
                                ie->key.file_name.file_name_length, 1,
                                IGNORE_CASE, vol->upcase, vol->upcase_len);
                /*
@@ -907,7 +938,7 @@ found_it2:
                 * collation.
                 */
                rc = ntfs_collate_names(uname, uname_len,
-                               (uchar_t*)&ie->key.file_name.file_name,
+                               (ntfschar*)&ie->key.file_name.file_name,
                                ie->key.file_name.file_name_length, 1,
                                CASE_SENSITIVE, vol->upcase, vol->upcase_len);
                if (rc == -1)
@@ -930,7 +961,6 @@ found_it2:
                        ntfs_error(sb, "Index entry with child node found in "
                                        "a leaf node in directory inode 0x%lx.",
                                        dir_ni->mft_no);
-                       err = -EIO;
                        goto unm_err_out;
                }
                /* Child node present, descend into it. */
@@ -944,50 +974,41 @@ found_it2:
                                        vol->cluster_size_bits >>
                                        PAGE_CACHE_SHIFT)
                                goto fast_descend_into_child_node;
+                       unlock_page(page);
                        ntfs_unmap_page(page);
                        goto descend_into_child_node;
                }
                ntfs_error(sb, "Negative child node vcn in directory inode "
                                "0x%lx.", dir_ni->mft_no);
-               err = -EIO;
                goto unm_err_out;
        }
        /* No child node, return -ENOENT. */
        ntfs_debug("Entry not found.");
        err = -ENOENT;
 unm_err_out:
+       unlock_page(page);
        ntfs_unmap_page(page);
 err_out:
+       if (!err)
+               err = -EIO;
        if (ctx)
-               put_attr_search_ctx(ctx);
+               ntfs_attr_put_search_ctx(ctx);
        if (m)
                unmap_mft_record(dir_ni);
        return ERR_MREF(err);
 dir_err_out:
        ntfs_error(sb, "Corrupt directory. Aborting lookup.");
-       err = -EIO;
        goto err_out;
 }
 
 #endif
 
-typedef union {
-       INDEX_ROOT *ir;
-       INDEX_ALLOCATION *ia;
-} index_union __attribute__ ((__transparent_union__));
-
-typedef enum {
-       INDEX_TYPE_ROOT,        /* index root */
-       INDEX_TYPE_ALLOCATION,  /* index allocation */
-} INDEX_TYPE;
-
 /**
  * ntfs_filldir - ntfs specific filldir method
  * @vol:       current ntfs volume
  * @fpos:      position in the directory
  * @ndir:      ntfs inode of current directory
- * @index_type:        specifies whether @iu is an index root or an index allocation
- * @iu:                index root or index allocation attribute to which @ie belongs
+ * @ia_page:   page containing the index allocation block holding @ie, or NULL
  * @ie:                current index entry
  * @name:      buffer to use for the converted name
  * @dirent:    vfs filldir callback context
@@ -995,24 +1016,25 @@ typedef enum {
  *
  * Convert the Unicode @name to the loaded NLS and pass it to the @filldir
  * callback.
+ *
+ * If @ia_page is not NULL it is the locked page containing the index
+ * allocation block containing the index entry @ie.
+ *
+ * Note, we drop (and then reacquire) the page lock on @ia_page across the
+ * @filldir() call otherwise we would deadlock with NFSd when it calls ->lookup
+ * since ntfs_lookup() will lock the same page.  As an optimization, we do not
+ * retake the lock if we are returning a non-zero value as ntfs_readdir()
+ * would need to drop the lock immediately anyway.
  */
-static inline int ntfs_filldir(ntfs_volume *vol, loff_t *fpos,
-               ntfs_inode *ndir, const INDEX_TYPE index_type,
-               index_union iu, INDEX_ENTRY *ie, u8 *name,
-               void *dirent, filldir_t filldir)
+static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
+               ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie,
+               u8 *name, void *dirent, filldir_t filldir)
 {
-       int name_len;
+       unsigned long mref;
+       int name_len, rc;
        unsigned dt_type;
        FILE_NAME_TYPE_FLAGS name_type;
 
-       /* Advance the position even if going to skip the entry. */
-       if (index_type == INDEX_TYPE_ALLOCATION)
-               *fpos = (u8*)ie - (u8*)iu.ia +
-                               (sle64_to_cpu(iu.ia->index_block_vcn) <<
-                               ndir->itype.index.vcn_size_bits) +
-                               vol->mft_record_size;
-       else /* if (index_type == INDEX_TYPE_ROOT) */
-               *fpos = (u8*)ie - (u8*)iu.ir;
        name_type = ie->key.file_name.file_name_type;
        if (name_type == FILE_NAME_DOS) {
                ntfs_debug("Skipping DOS name space entry.");
@@ -1027,7 +1049,7 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t *fpos,
                ntfs_debug("Skipping system file.");
                return 0;
        }
-       name_len = ntfs_ucstonls(vol, (uchar_t*)&ie->key.file_name.file_name,
+       name_len = ntfs_ucstonls(vol, (ntfschar*)&ie->key.file_name.file_name,
                        ie->key.file_name.file_name_length, &name,
                        NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1);
        if (name_len <= 0) {
@@ -1039,24 +1061,42 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t *fpos,
                dt_type = DT_DIR;
        else
                dt_type = DT_REG;
+       mref = MREF_LE(ie->data.dir.indexed_file);
+       /*
+        * Drop the page lock otherwise we deadlock with NFS when it calls
+        * ->lookup since ntfs_lookup() will lock the same page.
+        */
+       if (ia_page)
+               unlock_page(ia_page);
        ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode "
-                       "0x%lx, DT_%s.", name, name_len, *fpos,
-                       MREF_LE(ie->data.dir.indexed_file),
+                       "0x%lx, DT_%s.", name, name_len, fpos, mref,
                        dt_type == DT_DIR ? "DIR" : "REG");
-       return filldir(dirent, name, name_len, *fpos,
-                       MREF_LE(ie->data.dir.indexed_file), dt_type);
+       rc = filldir(dirent, name, name_len, fpos, mref, dt_type);
+       /* Relock the page but not if we are aborting ->readdir. */
+       if (!rc && ia_page)
+               lock_page(ia_page);
+       return rc;
 }
 
 /*
- * VFS calls readdir without BKL but with i_sem held. This protects the VFS
- * parts (e.g. ->f_pos and ->i_size, and it also protects against directory
- * modifications).
- *
  * We use the same basic approach as the old NTFS driver, i.e. we parse the
  * index root entries and then the index allocation entries that are marked
  * as in use in the index bitmap.
+ *
  * While this will return the names in random order this doesn't matter for
- * readdir but OTOH results in a faster readdir.
+ * ->readdir but OTOH results in a faster ->readdir.
+ *
+ * VFS calls ->readdir without BKL but with i_sem held. This protects the VFS
+ * parts (e.g. ->f_pos and ->i_size, and it also protects against directory
+ * modifications).
+ *
+ * Locking:  - Caller must hold i_sem on the directory.
+ *          - Each page cache page in the index allocation mapping must be
+ *            locked whilst being accessed.  Otherwise we may find a corrupt
+ *            page because it is under ->writepage, which applies the mst
+ *            protection fixups before writing the page out, removes them
+ *            again after the write is complete, and only then unlocks
+ *            the page.
  */
 static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
@@ -1067,7 +1107,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
        ntfs_inode *ndir = NTFS_I(vdir);
        ntfs_volume *vol = NTFS_SB(sb);
        MFT_RECORD *m;
-       INDEX_ROOT *ir;
+       INDEX_ROOT *ir = NULL;
        INDEX_ENTRY *ie;
        INDEX_ALLOCATION *ia;
        u8 *name = NULL;
@@ -1075,7 +1115,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
        struct address_space *ia_mapping, *bmp_mapping;
        struct page *bmp_page = NULL, *ia_page = NULL;
        u8 *kaddr, *bmp, *index_end;
-       attr_search_context *ctx;
+       ntfs_attr_search_ctx *ctx;
 
        fpos = filp->f_pos;
        ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.",
@@ -1120,12 +1160,12 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
                goto skip_index_root;
        /* Get hold of the mft record for the directory. */
        m = map_mft_record(ndir);
-       if (unlikely(IS_ERR(m))) {
+       if (IS_ERR(m)) {
                err = PTR_ERR(m);
                m = NULL;
                goto err_out;
        }
-       ctx = get_attr_search_ctx(ndir, m);
+       ctx = ntfs_attr_get_search_ctx(ndir, m);
        if (unlikely(!ctx)) {
                err = -ENOMEM;
                goto err_out;
@@ -1133,15 +1173,36 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
        /* Get the offset into the index root attribute. */
        ir_pos = (s64)fpos;
        /* Find the index root attribute in the mft record. */
-       if (unlikely(!lookup_attr(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0,
-                       NULL, 0, ctx))) {
+       err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
+                       0, ctx);
+       if (unlikely(err)) {
                ntfs_error(sb, "Index root attribute missing in directory "
                                "inode 0x%lx.", vdir->i_ino);
                goto err_out;
        }
-       /* Get to the index root value (it's been verified in read_inode). */
-       ir = (INDEX_ROOT*)((u8*)ctx->attr +
-                       le16_to_cpu(ctx->attr->data.resident.value_offset));
+       /*
+        * Copy the index root attribute value to a buffer so that we can put
+        * the search context and unmap the mft record before calling the
+        * filldir() callback.  We need to do this because of NFSd which calls
+        * ->lookup() from its filldir() callback and this causes NTFS to
+        * deadlock as ntfs_lookup() maps the mft record of the directory and
+        * we have got it mapped here already.  The only solution is for us to
+        * unmap the mft record here so that a call to ntfs_lookup() is able to
+        * map the mft record without deadlocking.
+        */
+       rc = le32_to_cpu(ctx->attr->data.resident.value_length);
+       ir = (INDEX_ROOT*)kmalloc(rc, GFP_NOFS);
+       if (unlikely(!ir)) {
+               err = -ENOMEM;
+               goto err_out;
+       }
+       /* Copy the index root value (it has been verified in read_inode). */
+       memcpy(ir, (u8*)ctx->attr +
+                       le16_to_cpu(ctx->attr->data.resident.value_offset), rc);
+       ntfs_attr_put_search_ctx(ctx);
+       unmap_mft_record(ndir);
+       ctx = NULL;
+       m = NULL;
        index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
        /* The first index entry. */
        ie = (INDEX_ENTRY*)((u8*)&ir->index +
@@ -1152,9 +1213,9 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
         * or signals an error (both covered by the rc test).
         */
        for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
-               ntfs_debug("In index root, offset 0x%x.", (u8*)ie - (u8*)ir);
+               ntfs_debug("In index root, offset 0x%zx.", (u8*)ie - (u8*)ir);
                /* Bounds checks. */
-               if (unlikely((u8*)ie < (u8*)ctx->mrec || (u8*)ie +
+               if (unlikely((u8*)ie < (u8*)ir || (u8*)ie +
                                sizeof(INDEX_ENTRY_HEADER) > index_end ||
                                (u8*)ie + le16_to_cpu(ie->key_length) >
                                index_end))
@@ -1165,24 +1226,19 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
                /* Skip index root entry if continuing previous readdir. */
                if (ir_pos > (u8*)ie - (u8*)ir)
                        continue;
+               /* Advance the position even if going to skip the entry. */
+               fpos = (u8*)ie - (u8*)ir;
                /* Submit the name to the filldir callback. */
-               rc = ntfs_filldir(vol, &fpos, ndir, INDEX_TYPE_ROOT, ir, ie,
-                               name, dirent, filldir);
+               rc = ntfs_filldir(vol, fpos, ndir, NULL, ie, name, dirent,
+                               filldir);
                if (rc) {
-                       put_attr_search_ctx(ctx);
-                       unmap_mft_record(ndir);
+                       kfree(ir);
                        goto abort;
                }
        }
-       /*
-        * We are done with the index root and the mft record for that matter.
-        * We need to release it, otherwise we deadlock on ntfs_attr_iget()
-        * and/or ntfs_read_page().
-        */
-       put_attr_search_ctx(ctx);
-       unmap_mft_record(ndir);
-       m = NULL;
-       ctx = NULL;
+       /* We are done with the index root and can free the buffer. */
+       kfree(ir);
+       ir = NULL;
        /* If there is no index allocation attribute we are finished. */
        if (!NInoIndexAllocPresent(ndir))
                goto EOD;
@@ -1196,9 +1252,9 @@ skip_index_root:
        ia_mapping = vdir->i_mapping;
        bmp_vi = ndir->itype.index.bmp_ino;
        if (unlikely(!bmp_vi)) {
-               ntfs_debug("Inode %lu, regetting index bitmap.", vdir->i_ino);
+               ntfs_debug("Inode 0x%lx, regetting index bitmap.", vdir->i_ino);
                bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4);
-               if (unlikely(IS_ERR(bmp_vi))) {
+               if (IS_ERR(bmp_vi)) {
                        ntfs_error(sb, "Failed to get bitmap attribute.");
                        err = PTR_ERR(bmp_vi);
                        goto err_out;
@@ -1220,10 +1276,10 @@ get_next_bmp_page:
        ntfs_debug("Reading bitmap with page index 0x%llx, bit ofs 0x%llx",
                        (unsigned long long)bmp_pos >> (3 + PAGE_CACHE_SHIFT),
                        (unsigned long long)bmp_pos &
-                       ((PAGE_CACHE_SIZE * 8) - 1));
+                       (unsigned long long)((PAGE_CACHE_SIZE * 8) - 1));
        bmp_page = ntfs_map_page(bmp_mapping,
                        bmp_pos >> (3 + PAGE_CACHE_SHIFT));
-       if (unlikely(IS_ERR(bmp_page))) {
+       if (IS_ERR(bmp_page)) {
                ntfs_error(sb, "Reading index bitmap failed.");
                err = PTR_ERR(bmp_page);
                bmp_page = NULL;
@@ -1255,19 +1311,22 @@ find_next_index_buffer:
        /* If the current index buffer is in the same page we reuse the page. */
        if ((prev_ia_pos & PAGE_CACHE_MASK) != (ia_pos & PAGE_CACHE_MASK)) {
                prev_ia_pos = ia_pos;
-               if (likely(ia_page != NULL))
+               if (likely(ia_page != NULL)) {
+                       unlock_page(ia_page);
                        ntfs_unmap_page(ia_page);
+               }
                /*
                 * Map the page cache page containing the current ia_pos,
                 * reading it from disk if necessary.
                 */
                ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_CACHE_SHIFT);
-               if (unlikely(IS_ERR(ia_page))) {
+               if (IS_ERR(ia_page)) {
                        ntfs_error(sb, "Reading index allocation data failed.");
                        err = PTR_ERR(ia_page);
                        ia_page = NULL;
                        goto err_out;
                }
+               lock_page(ia_page);
                kaddr = (u8*)page_address(ia_page);
        }
        /* Get the current index buffer. */
@@ -1279,6 +1338,14 @@ find_next_index_buffer:
                                "inode 0x%lx or driver bug.", vdir->i_ino);
                goto err_out;
        }
+       /* Catch multi sector transfer fixup errors. */
+       if (unlikely(!ntfs_is_indx_record(ia->magic))) {
+               ntfs_error(sb, "Directory index record with vcn 0x%llx is "
+                               "corrupt.  Corrupt inode 0x%lx.  Run chkdsk.",
+                               (unsigned long long)ia_pos >>
+                               ndir->itype.index.vcn_size_bits, vdir->i_ino);
+               goto err_out;
+       }
        if (unlikely(sle64_to_cpu(ia->index_block_vcn) != (ia_pos &
                        ~(s64)(ndir->itype.index.block_size - 1)) >>
                        ndir->itype.index.vcn_size_bits)) {
@@ -1331,8 +1398,8 @@ find_next_index_buffer:
         */
        for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
                ntfs_debug("In index allocation, offset 0x%llx.",
-                               (unsigned long long)ia_start + ((u8*)ie -
-                               (u8*)ia));
+                               (unsigned long long)ia_start +
+                               (unsigned long long)((u8*)ie - (u8*)ia));
                /* Bounds checks. */
                if (unlikely((u8*)ie < (u8*)ia || (u8*)ie +
                                sizeof(INDEX_ENTRY_HEADER) > index_end ||
@@ -1345,10 +1412,21 @@ find_next_index_buffer:
                /* Skip index block entry if continuing previous readdir. */
                if (ia_pos - ia_start > (u8*)ie - (u8*)ia)
                        continue;
-               /* Submit the name to the filldir callback. */
-               rc = ntfs_filldir(vol, &fpos, ndir, INDEX_TYPE_ALLOCATION, ia,
-                               ie, name, dirent, filldir);
+               /* Advance the position even if going to skip the entry. */
+               fpos = (u8*)ie - (u8*)ia +
+                               (sle64_to_cpu(ia->index_block_vcn) <<
+                               ndir->itype.index.vcn_size_bits) +
+                               vol->mft_record_size;
+               /*
+                * Submit the name to the @filldir callback.  Note,
+                * ntfs_filldir() drops the lock on @ia_page but it retakes it
+                * before returning, unless a non-zero value is returned in
+                * which case the page is left unlocked.
+                */
+               rc = ntfs_filldir(vol, fpos, ndir, ia_page, ie, name, dirent,
+                               filldir);
                if (rc) {
+                       /* @ia_page is already unlocked in this case. */
                        ntfs_unmap_page(ia_page);
                        ntfs_unmap_page(bmp_page);
                        goto abort;
@@ -1356,8 +1434,10 @@ find_next_index_buffer:
        }
        goto find_next_index_buffer;
 unm_EOD:
-       if (ia_page)
+       if (ia_page) {
+               unlock_page(ia_page);
                ntfs_unmap_page(ia_page);
+       }
        ntfs_unmap_page(bmp_page);
 EOD:
        /* We are finished, set fpos to EOD. */
@@ -1377,12 +1457,16 @@ done:
 err_out:
        if (bmp_page)
                ntfs_unmap_page(bmp_page);
-       if (ia_page)
+       if (ia_page) {
+               unlock_page(ia_page);
                ntfs_unmap_page(ia_page);
+       }
+       if (ir)
+               kfree(ir);
        if (name)
                kfree(name);
        if (ctx)
-               put_attr_search_ctx(ctx);
+               ntfs_attr_put_search_ctx(ctx);
        if (m)
                unmap_mft_record(ndir);
        if (!err)
@@ -1417,10 +1501,69 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp)
        return 0;
 }
 
+#ifdef NTFS_RW
+
+/**
+ * ntfs_dir_fsync - sync a directory to disk
+ * @filp:      directory to be synced (unused; in the past it could be NULL)
+ * @dentry:    dentry describing the directory to sync
+ * @datasync:  if non-zero only flush user data and not metadata
+ *
+ * Data integrity sync of a directory to disk.  Used for fsync, fdatasync, and
+ * msync system calls.  This function is based on file.c::ntfs_file_fsync().
+ *
+ * Write the mft record and all associated extent mft records as well as the
+ * $INDEX_ALLOCATION and $BITMAP attributes and then sync the block device.
+ *
+ * If @datasync is true, we do not wait on the inode(s) to be written out
+ * but we always wait on the page cache pages to be written out.
+ *
+ * Return 0 on success.  On error, return the error code of ntfs_write_inode()
+ * or, if that returned 0, the error code of sync_blockdev().
+ *
+ * Locking: Caller must hold i_sem on the inode.
+ *
+ * TODO: We should probably also write all attribute/index inodes associated
+ * with this inode but since we have no simple way of getting to them we ignore
+ * this problem for now.  We do write the $BITMAP attribute if it is present
+ * which is the important one for a directory so things are not too bad.
+ */
+static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry,
+               int datasync)
+{
+       struct inode *vi = dentry->d_inode;
+       ntfs_inode *ni = NTFS_I(vi);
+       int err, ret;
+
+       ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
+       BUG_ON(!S_ISDIR(vi->i_mode));   /* Only valid for directories. */
+       if (NInoIndexAllocPresent(ni) && ni->itype.index.bmp_ino)
+               write_inode_now(ni->itype.index.bmp_ino, !datasync);
+       ret = ntfs_write_inode(vi, 1);
+       write_inode_now(vi, !datasync);
+       err = sync_blockdev(vi->i_sb->s_bdev);
+       if (unlikely(err && !ret))      /* Do not overwrite an earlier error. */
+               ret = err;
+       if (likely(!ret))
+               ntfs_debug("Done.");
+       else
+               ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx.  Error "
+                               "%u.", datasync ? "data" : "", vi->i_ino, -ret);
+       return ret;
+}
+
+#endif /* NTFS_RW */
+
 struct file_operations ntfs_dir_ops = {
        .llseek         = generic_file_llseek,  /* Seek inside directory. */
        .read           = generic_read_dir,     /* Return -EISDIR. */
        .readdir        = ntfs_readdir,         /* Read directory contents. */
+#ifdef NTFS_RW /* ntfs_dir_fsync() is only defined under NTFS_RW. */
+       .fsync          = ntfs_dir_fsync,       /* Sync a directory to disk. */
+       /*.aio_fsync    = ,*/                   /* Sync all outstanding async
+                                                  i/o operations on a kiocb. */
+#endif /* NTFS_RW */
+       /*.ioctl        = ,*/                   /* Perform function on the
+                                                  mounted filesystem. */
        .open           = ntfs_dir_open,        /* Open directory. */
 };
-