fs/ntfs/aops.c

/**
 * aops.c - NTFS kernel address space operations and page cache handling.
 *          Part of the Linux-NTFS project.
 *
 * Copyright (c) 2001-2004 Anton Altaparmakov
 * Copyright (c) 2002 Richard Russon
 *
 * This program/include file is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program/include file is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program (in the main directory of the Linux-NTFS
 * distribution in the file COPYING); if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/buffer_head.h>

#include "ntfs.h"

/**
 * ntfs_end_buffer_async_read - async io completion for reading attributes
 * @bh:         buffer head on which io is completed
 * @uptodate:   whether @bh is now uptodate or not
 *
 * Asynchronous I/O completion handler for reading pages belonging to the
 * attribute address space of an inode. The inodes can either be files or
 * directories or they can be fake inodes describing some attribute.
 *
 * If NInoMstProtected(), perform the post read mst fixups when all IO on the
 * page has been completed and mark the page uptodate or set the error bit on
 * the page. To determine the size of the records that need fixing up, we
 * cheat a little bit by setting the index_block_size in ntfs_inode to the
 * ntfs record size, and index_block_size_bits to the log(base 2) of the ntfs
 * record size.
 */
static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
        static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
        unsigned long flags;
        struct buffer_head *tmp;
        struct page *page;
        ntfs_inode *ni;
        int page_uptodate = 1;

        page = bh->b_page;
        ni = NTFS_I(page->mapping->host);

        if (likely(uptodate)) {
                s64 file_ofs;

                set_buffer_uptodate(bh);

                file_ofs = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
                /* Check for the current buffer head overflowing. */
                if (file_ofs + bh->b_size > ni->initialized_size) {
                        char *addr;
                        int ofs = 0;

                        if (file_ofs < ni->initialized_size)
                                ofs = ni->initialized_size - file_ofs;
                        addr = kmap_atomic(page, KM_BIO_SRC_IRQ);
                        memset(addr + bh_offset(bh) + ofs, 0, bh->b_size - ofs);
                        flush_dcache_page(page);
                        kunmap_atomic(addr, KM_BIO_SRC_IRQ);
                }
        } else {
                clear_buffer_uptodate(bh);
                ntfs_error(ni->vol->sb, "Buffer I/O error, logical block %llu.",
                                (unsigned long long)bh->b_blocknr);
                SetPageError(page);
        }

        spin_lock_irqsave(&page_uptodate_lock, flags);
        clear_buffer_async_read(bh);
        unlock_buffer(bh);
        tmp = bh;
        do {
                if (!buffer_uptodate(tmp))
                        page_uptodate = 0;
                if (buffer_async_read(tmp)) {
                        if (likely(buffer_locked(tmp)))
                                goto still_busy;
                        /* Async buffers must be locked. */
                        BUG();
                }
                tmp = tmp->b_this_page;
        } while (tmp != bh);
        spin_unlock_irqrestore(&page_uptodate_lock, flags);
        /*
         * If none of the buffers had errors then we can set the page uptodate,
         * but we first have to perform the post read mst fixups, if the
         * attribute is mst protected, i.e. if NInoMstProtected(ni) is true.
         */
        if (!NInoMstProtected(ni)) {
                if (likely(page_uptodate && !PageError(page)))
                        SetPageUptodate(page);
        } else {
                char *addr;
                unsigned int i, recs, nr_err;
                u32 rec_size;

                rec_size = ni->itype.index.block_size;
                recs = PAGE_CACHE_SIZE / rec_size;
                addr = kmap_atomic(page, KM_BIO_SRC_IRQ);
                for (i = nr_err = 0; i < recs; i++) {
                        if (likely(!post_read_mst_fixup((NTFS_RECORD*)(addr +
                                        i * rec_size), rec_size)))
                                continue;
                        nr_err++;
                        ntfs_error(ni->vol->sb, "post_read_mst_fixup() failed, "
                                        "corrupt %s record 0x%llx. Run chkdsk.",
                                        ni->mft_no ? "index" : "mft",
                                        (unsigned long long)(((s64)page->index
                                        << PAGE_CACHE_SHIFT >>
                                        ni->itype.index.block_size_bits) + i));
                }
                flush_dcache_page(page);
                kunmap_atomic(addr, KM_BIO_SRC_IRQ);
                if (likely(!PageError(page))) {
                        if (likely(!nr_err && recs)) {
                                if (likely(page_uptodate))
                                        SetPageUptodate(page);
                        } else {
                                ntfs_error(ni->vol->sb, "Setting page error, "
                                                "index 0x%lx.", page->index);
                                SetPageError(page);
                        }
                }
        }
        unlock_page(page);
        return;
still_busy:
        spin_unlock_irqrestore(&page_uptodate_lock, flags);
        return;
}

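/*
 * Worked example for the mst fixup loop above (all sizes are assumed
 * values, picked only for illustration): with PAGE_CACHE_SIZE = 4096
 * (PAGE_CACHE_SHIFT = 12) and 1024-byte index records
 * (block_size_bits = 10), recs = 4096 / 1024 = 4 records are fixed up
 * per page, and a fixup failure in record i = 1 of the page with
 * page->index = 2 is reported as record (2 << 12 >> 10) + 1 = 0x9.
 */
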
/**
 * ntfs_read_block - fill a @page of an address space with data
 * @page:       page cache page to fill with data
 *
 * Fill the page @page of the address space belonging to the @page->host inode.
 * We read each buffer asynchronously and when all buffers are read in, our io
 * completion handler ntfs_end_buffer_async_read(), if required, automatically
 * applies the mst fixups to the page before finally marking it uptodate and
 * unlocking it.
 *
 * We only enforce the allocated_size limit because i_size is checked for in
 * generic_file_read().
 *
 * Return 0 on success and -errno on error.
 *
 * Contains an adapted version of fs/buffer.c::block_read_full_page().
 */
static int ntfs_read_block(struct page *page)
{
        VCN vcn;
        LCN lcn;
        ntfs_inode *ni;
        ntfs_volume *vol;
        run_list_element *rl;
        struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
        sector_t iblock, lblock, zblock;
        unsigned int blocksize, vcn_ofs;
        int i, nr;
        unsigned char blocksize_bits;

        ni = NTFS_I(page->mapping->host);
        vol = ni->vol;

        blocksize_bits = VFS_I(ni)->i_blkbits;
        blocksize = 1 << blocksize_bits;

        if (!page_has_buffers(page))
                create_empty_buffers(page, blocksize, 0);
        bh = head = page_buffers(page);
        if (unlikely(!bh)) {
                unlock_page(page);
                return -ENOMEM;
        }

        iblock = page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
        lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
        zblock = (ni->initialized_size + blocksize - 1) >> blocksize_bits;

#ifdef DEBUG
        if (unlikely(!ni->run_list.rl && !ni->mft_no && !NInoAttr(ni)))
                panic("NTFS: $MFT/$DATA run list has been unmapped! This is a "
                                "very serious bug! Cannot continue...");
#endif

        /* Loop through all the buffers in the page. */
        rl = NULL;
        nr = i = 0;
        do {
                if (unlikely(buffer_uptodate(bh)))
                        continue;
                if (unlikely(buffer_mapped(bh))) {
                        arr[nr++] = bh;
                        continue;
                }
                bh->b_bdev = vol->sb->s_bdev;
                /* Is the block within the allowed limits? */
                if (iblock < lblock) {
                        BOOL is_retry = FALSE;

                        /* Convert iblock into corresponding vcn and offset. */
                        vcn = (VCN)iblock << blocksize_bits >>
                                        vol->cluster_size_bits;
                        vcn_ofs = ((VCN)iblock << blocksize_bits) &
                                        vol->cluster_size_mask;
                        if (!rl) {
lock_retry_remap:
                                down_read(&ni->run_list.lock);
                                rl = ni->run_list.rl;
                        }
                        if (likely(rl != NULL)) {
                                /* Seek to element containing target vcn. */
                                while (rl->length && rl[1].vcn <= vcn)
                                        rl++;
                                lcn = vcn_to_lcn(rl, vcn);
                        } else
                                lcn = (LCN)LCN_RL_NOT_MAPPED;
                        /* Successful remap. */
                        if (lcn >= 0) {
                                /* Setup buffer head to correct block. */
                                bh->b_blocknr = ((lcn << vol->cluster_size_bits)
                                                + vcn_ofs) >> blocksize_bits;
                                set_buffer_mapped(bh);
                                /* Only read initialized data blocks. */
                                if (iblock < zblock) {
                                        arr[nr++] = bh;
                                        continue;
                                }
                                /* Fully non-initialized data block, zero it. */
                                goto handle_zblock;
                        }
                        /* It is a hole, need to zero it. */
                        if (lcn == LCN_HOLE)
                                goto handle_hole;
                        /* If first try and run list unmapped, map and retry. */
                        if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
                                is_retry = TRUE;
                                /*
                                 * Attempt to map run list, dropping lock for
                                 * the duration.
                                 */
                                up_read(&ni->run_list.lock);
                                if (!map_run_list(ni, vcn))
                                        goto lock_retry_remap;
                                rl = NULL;
                        }
                        /* Hard error, zero out region. */
                        SetPageError(page);
                        ntfs_error(vol->sb, "vcn_to_lcn(vcn = 0x%llx) failed "
                                        "with error code 0x%llx%s.",
                                        (unsigned long long)vcn,
                                        (unsigned long long)-lcn,
                                        is_retry ? " even after retrying" : "");
                        // FIXME: Depending on vol->on_errors, do something.
                }
                /*
                 * Either iblock was outside lblock limits or vcn_to_lcn()
                 * returned error. Just zero that portion of the page and set
                 * the buffer uptodate.
                 */
handle_hole:
                bh->b_blocknr = -1UL;
                clear_buffer_mapped(bh);
handle_zblock:
                memset(kmap(page) + i * blocksize, 0, blocksize);
                flush_dcache_page(page);
                kunmap(page);
                set_buffer_uptodate(bh);
        } while (i++, iblock++, (bh = bh->b_this_page) != head);

        /* Release the lock if we took it. */
        if (rl)
                up_read(&ni->run_list.lock);

        /* Check we have at least one buffer ready for i/o. */
        if (nr) {
                struct buffer_head *tbh;

                /* Lock the buffers. */
                for (i = 0; i < nr; i++) {
                        tbh = arr[i];
                        lock_buffer(tbh);
                        tbh->b_end_io = ntfs_end_buffer_async_read;
                        set_buffer_async_read(tbh);
                }
                /* Finally, start i/o on the buffers. */
                for (i = 0; i < nr; i++) {
                        tbh = arr[i];
                        if (likely(!buffer_uptodate(tbh)))
                                submit_bh(READ, tbh);
                        else
                                ntfs_end_buffer_async_read(tbh, 1);
                }
                return 0;
        }
        /* No i/o was scheduled on any of the buffers. */
        if (likely(!PageError(page)))
                SetPageUptodate(page);
        else /* Signal synchronous i/o error. */
                nr = -EIO;
        unlock_page(page);
        return nr;
}

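/*
 * The following is an illustrative sketch only (hence the #if 0): the
 * block-to-device-block arithmetic which ntfs_read_block() above (and
 * ntfs_write_block() below) performs inline, collected in one place.
 * The helper name is made up for the example and the run list lock is
 * assumed to be held by the caller.
 */
#if 0
static sector_t ntfs_example_block_to_dev_block(ntfs_volume *vol,
                const run_list_element *rl, const sector_t block,
                const unsigned char blocksize_bits)
{
        /* Byte offset of @block inside the attribute. */
        const s64 ofs = (s64)block << blocksize_bits;
        /* Split into virtual cluster number and offset within it. */
        const VCN vcn = ofs >> vol->cluster_size_bits;
        const unsigned int vcn_ofs = ofs & vol->cluster_size_mask;
        /* Resolve the vcn to a logical (device) cluster number. */
        const LCN lcn = vcn_to_lcn(rl, vcn);

        /*
         * E.g. with 512-byte blocks (blocksize_bits = 9) and 4kiB
         * clusters (cluster_size_bits = 12), block 100 is byte offset
         * 0xc800, i.e. vcn 12, vcn_ofs 0x800. If vcn 12 maps to lcn
         * 0x1f0, the device block is
         * ((0x1f0 << 12) + 0x800) >> 9 = 0xf84.
         */
        return ((lcn << vol->cluster_size_bits) + vcn_ofs) >>
                        blocksize_bits;
}
#endif
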
/**
 * ntfs_readpage - fill a @page of a @file with data from the device
 * @file:       open file to which the page @page belongs or NULL
 * @page:       page cache page to fill with data
 *
 * For non-resident attributes, ntfs_readpage() fills the @page of the open
 * file @file by calling the ntfs version of the generic block_read_full_page()
 * function, ntfs_read_block(), which in turn creates and reads in the buffers
 * associated with the page asynchronously.
 *
 * For resident attributes, OTOH, ntfs_readpage() fills @page by copying the
 * data from the mft record (which at this stage is most likely in memory) and
 * fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as
 * even if the mft record is not cached at this point in time, we need to wait
 * for it to be read in before we can do the copy.
 *
 * Return 0 on success and -errno on error.
 *
 * WARNING: Do not make this function static! It is used by mft.c!
 */
int ntfs_readpage(struct file *file, struct page *page)
{
        s64 attr_pos;
        ntfs_inode *ni, *base_ni;
        char *addr;
        attr_search_context *ctx;
        MFT_RECORD *mrec;
        u32 attr_len;
        int err = 0;

        BUG_ON(!PageLocked(page));

        /*
         * This can potentially happen because we clear PageUptodate() during
         * ntfs_writepage() of MstProtected() attributes.
         */
        if (PageUptodate(page)) {
                unlock_page(page);
                return 0;
        }

        ni = NTFS_I(page->mapping->host);

        if (NInoNonResident(ni)) {
                /*
                 * Only unnamed $DATA attributes can be compressed or
                 * encrypted.
                 */
                if (ni->type == AT_DATA && !ni->name_len) {
                        /* If file is encrypted, deny access, just like NT4. */
                        if (NInoEncrypted(ni)) {
                                err = -EACCES;
                                goto err_out;
                        }
                        /* Compressed data streams are handled in compress.c. */
                        if (NInoCompressed(ni))
                                return ntfs_read_compressed_block(page);
                }
                /* Normal data stream. */
                return ntfs_read_block(page);
        }
        /* Attribute is resident, implying it is not compressed or encrypted. */
        if (!NInoAttr(ni))
                base_ni = ni;
        else
                base_ni = ni->ext.base_ntfs_ino;

        /* Map, pin, and lock the mft record. */
        mrec = map_mft_record(base_ni);
        if (unlikely(IS_ERR(mrec))) {
                err = PTR_ERR(mrec);
                goto err_out;
        }
        ctx = get_attr_search_ctx(base_ni, mrec);
        if (unlikely(!ctx)) {
                err = -ENOMEM;
                goto unm_err_out;
        }
        if (unlikely(!lookup_attr(ni->type, ni->name, ni->name_len,
                        IGNORE_CASE, 0, NULL, 0, ctx))) {
                err = -ENOENT;
                goto put_unm_err_out;
        }

        /* Starting position of the page within the attribute value. */
        attr_pos = page->index << PAGE_CACHE_SHIFT;

        /* The total length of the attribute value. */
        attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);

        addr = kmap(page);
        /* Copy over in bounds data, zeroing the remainder of the page. */
        if (attr_pos < attr_len) {
                u32 bytes = attr_len - attr_pos;
                if (bytes > PAGE_CACHE_SIZE)
                        bytes = PAGE_CACHE_SIZE;
                else if (bytes < PAGE_CACHE_SIZE)
                        memset(addr + bytes, 0, PAGE_CACHE_SIZE - bytes);
                /* Copy the data to the page. */
                memcpy(addr, attr_pos + (char*)ctx->attr +
                                le16_to_cpu(
                                ctx->attr->data.resident.value_offset), bytes);
        } else
                memset(addr, 0, PAGE_CACHE_SIZE);
        flush_dcache_page(page);
        kunmap(page);

        SetPageUptodate(page);
put_unm_err_out:
        put_attr_search_ctx(ctx);
unm_err_out:
        unmap_mft_record(base_ni);
err_out:
        unlock_page(page);
        return err;
}

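/*
 * Worked example for the resident path of ntfs_readpage() above (all
 * values assumed for illustration): with PAGE_CACHE_SIZE = 4096, an
 * attribute value_length of 0x1200 bytes, and page->index = 1,
 * attr_pos = 0x1000 and bytes = 0x1200 - 0x1000 = 0x200, so the first
 * 0x200 bytes of the page are copied from the attribute value and the
 * remaining 0xe00 bytes are zeroed. For page->index >= 2, attr_pos is
 * >= attr_len and the whole page is zeroed instead.
 */
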
#ifdef NTFS_RW

/**
 * ntfs_write_block - write a @page to the backing store
 * @wbc:        writeback control structure describing the writeback request
 * @page:       page cache page to write out
 *
 * This function is for writing pages belonging to non-resident, non-mst
 * protected attributes to their backing store.
 *
 * For a page with buffers, map and write the dirty buffers asynchronously
 * under page writeback. For a page without buffers, create buffers for the
 * page, then proceed as above.
 *
 * If a page doesn't have buffers the page dirty state is definitive. If a page
 * does have buffers, the page dirty state is just a hint, and the buffer dirty
 * state is definitive. (A hint which has rules: dirty buffers against a clean
 * page are illegal; other combinations are legal and need to be handled, in
 * particular a dirty page containing clean buffers.)
 *
 * Return 0 on success and -errno on error.
 *
 * Based on ntfs_read_block() and __block_write_full_page().
 */
static int ntfs_write_block(struct writeback_control *wbc, struct page *page)
{
        VCN vcn;
        LCN lcn;
        sector_t block, dblock, iblock;
        struct inode *vi;
        ntfs_inode *ni;
        ntfs_volume *vol;
        run_list_element *rl;
        struct buffer_head *bh, *head;
        unsigned int blocksize, vcn_ofs;
        int err;
        BOOL need_end_writeback;
        unsigned char blocksize_bits;

        vi = page->mapping->host;
        ni = NTFS_I(vi);
        vol = ni->vol;

        ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
                        "0x%lx.", vi->i_ino, ni->type, page->index);

        BUG_ON(!NInoNonResident(ni));
        BUG_ON(NInoMstProtected(ni));

        blocksize_bits = vi->i_blkbits;
        blocksize = 1 << blocksize_bits;

        if (!page_has_buffers(page)) {
                BUG_ON(!PageUptodate(page));
                create_empty_buffers(page, blocksize,
                                (1 << BH_Uptodate) | (1 << BH_Dirty));
        }
        bh = head = page_buffers(page);
        if (unlikely(!bh)) {
                ntfs_warning(vol->sb, "Error allocating page buffers. "
                                "Redirtying page so we try again later.");
                /*
                 * Put the page back on mapping->dirty_pages, but leave its
                 * buffer's dirty state as-is.
                 */
                redirty_page_for_writepage(wbc, page);
                unlock_page(page);
                return 0;
        }

        /* NOTE: Different naming scheme to ntfs_read_block()! */

        /* The first block in the page. */
        block = page->index << (PAGE_CACHE_SHIFT - blocksize_bits);

        /* The first out of bounds block for the data size. */
        dblock = (vi->i_size + blocksize - 1) >> blocksize_bits;

        /* The last (fully or partially) initialized block. */
        iblock = ni->initialized_size >> blocksize_bits;

        /*
         * Be very careful.  We have no exclusion from __set_page_dirty_buffers
         * here, and the (potentially unmapped) buffers may become dirty at
         * any time.  If a buffer becomes dirty here after we've inspected it
         * then we just miss that fact, and the page stays dirty.
         *
         * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
         * handle that here by just cleaning them.
         */

        /*
         * Loop through all the buffers in the page, mapping all the dirty
         * buffers to disk addresses and handling any aliases from the
         * underlying block device's mapping.
         */
        rl = NULL;
        err = 0;
        do {
                BOOL is_retry = FALSE;

                if (unlikely(block >= dblock)) {
                        /*
                         * Mapped buffers outside i_size will occur, because
                         * this page can be outside i_size when there is a
                         * truncate in progress. The contents of such buffers
                         * were zeroed by ntfs_writepage().
                         *
                         * FIXME: What about the small race window where
                         * ntfs_writepage() has not done any clearing because
                         * the page was within i_size but before we get here,
                         * vmtruncate() modifies i_size?
                         */
                        clear_buffer_dirty(bh);
                        set_buffer_uptodate(bh);
                        continue;
                }

                /* Clean buffers are not written out, so no need to map them. */
                if (!buffer_dirty(bh))
                        continue;

                /* Make sure we have enough initialized size. */
                if (unlikely((block >= iblock) &&
                                (ni->initialized_size < vi->i_size))) {
                        /*
                         * If this page is fully outside initialized size, zero
                         * out all pages between the current initialized size
                         * and the current page. Just use ntfs_readpage() to do
                         * the zeroing transparently.
                         */
                        if (block > iblock) {
                                // TODO:
                                // For each page do:
                                // - read_cache_page()
                                // Again for each page do:
                                // - wait_on_page_locked()
                                // - Check (PageUptodate(page) &&
                                //                      !PageError(page))
                                // Update initialized size in the attribute and
                                // in the inode.
                                // Again, for each page do:
                                //      __set_page_dirty_buffers();
                                // page_cache_release()
                                // We don't need to wait on the writes.
                                // Update iblock.
                        }
                        /*
                         * The current page straddles initialized size. Zero
                         * all non-uptodate buffers and set them uptodate (and
                         * dirty?). Note, there aren't any non-uptodate buffers
                         * if the page is uptodate.
                         * FIXME: For an uptodate page, the buffers may need to
                         * be written out because they were not initialized on
                         * disk before.
                         */
                        if (!PageUptodate(page)) {
                                // TODO:
                                // Zero any non-uptodate buffers up to i_size.
                                // Set them uptodate and dirty.
                        }
                        // TODO:
                        // Update initialized size in the attribute and in the
                        // inode (up to i_size).
                        // Update iblock.
                        // FIXME: This is inefficient. Try to batch the two
                        // size changes to happen in one go.
                        ntfs_error(vol->sb, "Writing beyond initialized size "
                                        "is not supported yet. Sorry.");
                        err = -EOPNOTSUPP;
                        break;
                        // Do NOT set_buffer_new() BUT DO clear buffer range
                        // outside write request range.
                        // set_buffer_uptodate() on complete buffers as well as
                        // set_buffer_dirty().
                }

                /* No need to map buffers that are already mapped. */
                if (buffer_mapped(bh))
                        continue;

                /* Unmapped, dirty buffer. Need to map it. */
                bh->b_bdev = vol->sb->s_bdev;

                /* Convert block into corresponding vcn and offset. */
                vcn = (VCN)block << blocksize_bits >> vol->cluster_size_bits;
                vcn_ofs = ((VCN)block << blocksize_bits) &
                                vol->cluster_size_mask;
                if (!rl) {
lock_retry_remap:
                        down_read(&ni->run_list.lock);
                        rl = ni->run_list.rl;
                }
                if (likely(rl != NULL)) {
                        /* Seek to element containing target vcn. */
                        while (rl->length && rl[1].vcn <= vcn)
                                rl++;
                        lcn = vcn_to_lcn(rl, vcn);
                } else
                        lcn = (LCN)LCN_RL_NOT_MAPPED;
                /* Successful remap. */
                if (lcn >= 0) {
                        /* Setup buffer head to point to correct block. */
                        bh->b_blocknr = ((lcn << vol->cluster_size_bits) +
                                        vcn_ofs) >> blocksize_bits;
                        set_buffer_mapped(bh);
                        continue;
                }
                /* It is a hole, need to instantiate it. */
                if (lcn == LCN_HOLE) {
                        // TODO: Instantiate the hole.
                        // clear_buffer_new(bh);
                        // unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
                        ntfs_error(vol->sb, "Writing into sparse regions is "
                                        "not supported yet. Sorry.");
                        err = -EOPNOTSUPP;
                        break;
                }
                /* If first try and run list unmapped, map and retry. */
                if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
                        is_retry = TRUE;
                        /*
                         * Attempt to map run list, dropping lock for
                         * the duration.
                         */
                        up_read(&ni->run_list.lock);
                        err = map_run_list(ni, vcn);
                        if (likely(!err))
                                goto lock_retry_remap;
                        rl = NULL;
                }
                /* Failed to map the buffer, even after retrying. */
                bh->b_blocknr = -1UL;
                ntfs_error(vol->sb, "vcn_to_lcn(vcn = 0x%llx) failed "
                                "with error code 0x%llx%s.",
                                (unsigned long long)vcn,
                                (unsigned long long)-lcn,
                                is_retry ? " even after retrying" : "");
                // FIXME: Depending on vol->on_errors, do something.
                if (!err)
                        err = -EIO;
                break;
        } while (block++, (bh = bh->b_this_page) != head);

        /* Release the lock if we took it. */
        if (rl)
                up_read(&ni->run_list.lock);

        /* For the error case, need to reset bh to the beginning. */
        bh = head;

        /* Just an optimization, so ->readpage() isn't called later. */
        if (unlikely(!PageUptodate(page))) {
                int uptodate = 1;
                do {
                        if (!buffer_uptodate(bh)) {
                                uptodate = 0;
                                bh = head;
                                break;
                        }
                } while ((bh = bh->b_this_page) != head);
                if (uptodate)
                        SetPageUptodate(page);
        }

        /* Setup all mapped, dirty buffers for async write i/o. */
        do {
                get_bh(bh);
                if (buffer_mapped(bh) && buffer_dirty(bh)) {
                        lock_buffer(bh);
                        if (test_clear_buffer_dirty(bh)) {
                                BUG_ON(!buffer_uptodate(bh));
                                mark_buffer_async_write(bh);
                        } else
                                unlock_buffer(bh);
                } else if (unlikely(err)) {
                        /*
                         * For the error case. The buffer may have been set
                         * dirty during attachment to a dirty page.
                         */
                        if (err != -ENOMEM)
                                clear_buffer_dirty(bh);
                }
        } while ((bh = bh->b_this_page) != head);

        if (unlikely(err)) {
                // TODO: Remove the -EOPNOTSUPP check later on...
                if (unlikely(err == -EOPNOTSUPP))
                        err = 0;
                else if (err == -ENOMEM) {
                        ntfs_warning(vol->sb, "Error allocating memory. "
                                        "Redirtying page so we try again "
                                        "later.");
                        /*
                         * Put the page back on mapping->dirty_pages, but
                         * leave its buffer's dirty state as-is.
                         */
                        redirty_page_for_writepage(wbc, page);
                        err = 0;
                } else
                        SetPageError(page);
        }

        BUG_ON(PageWriteback(page));
        set_page_writeback(page);       /* Keeps try_to_free_buffers() away. */
        unlock_page(page);

        /*
         * Submit the prepared buffers for i/o. Note the page is unlocked,
         * and the async write i/o completion handler can end_page_writeback()
         * at any time after the *first* submit_bh(). So the buffers can then
         * disappear...
         */
        need_end_writeback = TRUE;
        do {
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
                        submit_bh(WRITE, bh);
                        need_end_writeback = FALSE;
                }
                put_bh(bh);
                bh = next;
        } while (bh != head);

        /* If no i/o was started, need to end_page_writeback(). */
        if (unlikely(need_end_writeback))
                end_page_writeback(page);

        ntfs_debug("Done.");
        return err;
}

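/*
 * Worked example for the block bounds used by ntfs_write_block() above
 * (values assumed for illustration): with 512-byte blocks
 * (blocksize_bits = 9), i_size = 0x10200 and initialized_size =
 * 0x10000, dblock = (0x10200 + 511) >> 9 = 0x81 and iblock =
 * 0x10000 >> 9 = 0x80. Blocks >= 0x81 are merely cleaned and marked
 * uptodate, while a dirty block 0x80 lies beyond the initialized size
 * and hence hits the unsupported -EOPNOTSUPP path above.
 */
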
/**
 * ntfs_writepage - write a @page to the backing store
 * @page:       page cache page to write out
 * @wbc:        writeback control structure describing the writeback request
 *
 * For non-resident attributes, ntfs_writepage() writes the @page by calling
 * the ntfs version of the generic block_write_full_page() function,
 * ntfs_write_block(), which in turn if necessary creates and writes the
 * buffers associated with the page asynchronously.
 *
 * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying
 * the data to the mft record (which at this stage is most likely in memory).
 * The mft record is then marked dirty and written out asynchronously via the
 * vfs inode dirty code path.
 *
 * Note the caller clears the page dirty flag before calling ntfs_writepage().
 *
 * Based on ntfs_readpage() and fs/buffer.c::block_write_full_page().
 *
 * Return 0 on success and -errno on error.
 */
static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
{
        s64 attr_pos;
        struct inode *vi;
        ntfs_inode *ni, *base_ni;
        char *kaddr;
        attr_search_context *ctx;
        MFT_RECORD *m;
        u32 attr_len, bytes;
        int err;

        BUG_ON(!PageLocked(page));

        vi = page->mapping->host;

        /* Is the page fully outside i_size? (truncate in progress) */
        if (unlikely(page->index >= (vi->i_size + PAGE_CACHE_SIZE - 1) >>
                        PAGE_CACHE_SHIFT)) {
                unlock_page(page);
                ntfs_debug("Write outside i_size - truncated?");
                return 0;
        }

        ni = NTFS_I(vi);

        if (NInoNonResident(ni)) {
                /*
                 * Only unnamed $DATA attributes can be compressed, encrypted,
                 * and/or sparse.
                 */
                if (ni->type == AT_DATA && !ni->name_len) {
                        /* If file is encrypted, deny access, just like NT4. */
                        if (NInoEncrypted(ni)) {
                                unlock_page(page);
                                ntfs_debug("Denying write access to encrypted "
                                                "file.");
                                return -EACCES;
                        }
                        /* Compressed data streams are handled in compress.c. */
                        if (NInoCompressed(ni)) {
                                // TODO: Implement and replace this check with
                                // return ntfs_write_compressed_block(page);
                                unlock_page(page);
                                ntfs_error(vi->i_sb, "Writing to compressed "
                                                "files is not supported yet. "
                                                "Sorry.");
                                return -EOPNOTSUPP;
                        }
                        // TODO: Implement and remove this check.
                        if (NInoSparse(ni)) {
                                unlock_page(page);
                                ntfs_error(vi->i_sb, "Writing to sparse files "
                                                "is not supported yet. Sorry.");
                                return -EOPNOTSUPP;
                        }
                }

                /* We have to zero every time due to mmap-at-end-of-file. */
                if (page->index >= (vi->i_size >> PAGE_CACHE_SHIFT)) {
                        /* The page straddles i_size. */
                        unsigned int ofs = vi->i_size & ~PAGE_CACHE_MASK;
                        kaddr = kmap_atomic(page, KM_USER0);
                        memset(kaddr + ofs, 0, PAGE_CACHE_SIZE - ofs);
                        flush_dcache_page(page);
                        kunmap_atomic(kaddr, KM_USER0);
                }

                // TODO: Implement and remove this check.
                if (NInoMstProtected(ni)) {
                        unlock_page(page);
                        ntfs_error(vi->i_sb, "Writing to MST protected "
                                        "attributes is not supported yet. "
                                        "Sorry.");
                        return -EOPNOTSUPP;
                }

                /* Normal data stream. */
                return ntfs_write_block(wbc, page);
        }

        /*
         * Attribute is resident, implying it is not compressed, encrypted, or
         * mst protected.
         */
        BUG_ON(page_has_buffers(page));
        BUG_ON(!PageUptodate(page));

        if (!NInoAttr(ni))
                base_ni = ni;
        else
                base_ni = ni->ext.base_ntfs_ino;

        /* Map, pin, and lock the mft record. */
        m = map_mft_record(base_ni);
        if (unlikely(IS_ERR(m))) {
                err = PTR_ERR(m);
                m = NULL;
                ctx = NULL;
                goto err_out;
        }
        ctx = get_attr_search_ctx(base_ni, m);
        if (unlikely(!ctx)) {
                err = -ENOMEM;
                goto err_out;
        }
        if (unlikely(!lookup_attr(ni->type, ni->name, ni->name_len,
                        IGNORE_CASE, 0, NULL, 0, ctx))) {
                err = -ENOENT;
                goto err_out;
        }

        /* Starting position of the page within the attribute value. */
        attr_pos = page->index << PAGE_CACHE_SHIFT;

        /* The total length of the attribute value. */
        attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);

        if (unlikely(vi->i_size != attr_len)) {
                ntfs_error(vi->i_sb, "BUG()! i_size (0x%llx) doesn't match "
                                "attr_len (0x%x). Aborting write.", vi->i_size,
                                attr_len);
                err = -EIO;
                goto err_out;
        }
        if (unlikely(attr_pos >= attr_len)) {
                ntfs_error(vi->i_sb, "BUG()! attr_pos (0x%llx) >= attr_len "
                                "(0x%x). Aborting write.",
                                (unsigned long long)attr_pos, attr_len);
                err = -EIO;
                goto err_out;
        }

        bytes = attr_len - attr_pos;
        if (unlikely(bytes > PAGE_CACHE_SIZE))
                bytes = PAGE_CACHE_SIZE;

        /*
         * Keep the VM happy.  This must be done otherwise the radix-tree tag
         * PAGECACHE_TAG_DIRTY remains set even though the page is clean.
         */
        BUG_ON(PageWriteback(page));
        set_page_writeback(page);
        unlock_page(page);

        /*
         * Here, we don't need to zero the out of bounds area every time
         * because the below memcpy() already takes care of the
         * mmap-at-end-of-file requirements. If the file is converted to a
         * non-resident one, then the code path used is switched to the
         * non-resident one where the zeroing happens on each ntfs_writepage()
         * invocation.
         *
         * The above also applies nicely when i_size is decreased.
         *
         * When i_size is increased, the memory between the old and new i_size
         * _must_ be zeroed (or overwritten with new data). Otherwise we will
         * expose data to userspace/disk which should never have been exposed.
         *
         * FIXME: Ensure that i_size increases do the zeroing/overwriting and
         * if we cannot guarantee that, then enable the zeroing below.  If the
         * zeroing below is enabled, we MUST move the unlock_page() from above
         * to after the kunmap_atomic(), i.e. just before the
         * end_page_writeback().
         */

        kaddr = kmap_atomic(page, KM_USER0);
        /* Copy the data from the page to the mft record. */
        memcpy((u8*)ctx->attr + le16_to_cpu(
                        ctx->attr->data.resident.value_offset) + attr_pos,
                        kaddr, bytes);
        flush_dcache_mft_record_page(ctx->ntfs_ino);
#if 0
        /* Zero out of bounds area. */
        if (likely(bytes < PAGE_CACHE_SIZE)) {
                memset(kaddr + bytes, 0, PAGE_CACHE_SIZE - bytes);
                flush_dcache_page(page);
        }
#endif
        kunmap_atomic(kaddr, KM_USER0);

        end_page_writeback(page);

        /* Mark the mft record dirty, so it gets written back. */
        mark_mft_record_dirty(ctx->ntfs_ino);

        put_attr_search_ctx(ctx);
        unmap_mft_record(base_ni);
        return 0;
err_out:
        if (err == -ENOMEM) {
                ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying "
                                "page so we try again later.");
                /*
                 * Put the page back on mapping->dirty_pages, but leave its
                 * buffer's dirty state as-is.
                 */
                redirty_page_for_writepage(wbc, page);
                err = 0;
        } else {
                ntfs_error(vi->i_sb, "Resident attribute write failed with "
                                "error %i. Setting page error flag.", -err);
                SetPageError(page);
        }
        unlock_page(page);
        if (ctx)
                put_attr_search_ctx(ctx);
        if (m)
                unmap_mft_record(base_ni);
        return err;
}

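/*
 * Worked example for the truncate check at the top of ntfs_writepage()
 * above (values assumed for illustration): with 4kiB pages
 * (PAGE_CACHE_SHIFT = 12) and i_size = 0x12345, the first out of
 * bounds page index is (0x12345 + 0xfff) >> 12 = 0x13, so writepage
 * requests for page->index >= 0x13 are discarded as racing with
 * truncate, while the straddling page 0x12 gets its tail from byte
 * offset 0x345 onwards zeroed in the non-resident case.
 */
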
/**
 * ntfs_prepare_nonresident_write - prepare part of a non-resident page for
 *                                  writing
 * @page:       page cache page containing the range to prepare for writing
 * @from:       offset in @page of the first byte to be written
 * @to:         offset in @page of the first byte not to be written
 *
 * Map the buffers of @page which are covered by the byte range @from to @to
 * to disk in preparation of the data being written to them.
 */
static int ntfs_prepare_nonresident_write(struct page *page,
                unsigned from, unsigned to)
{
        VCN vcn;
        LCN lcn;
        sector_t block, ablock, iblock;
        struct inode *vi;
        ntfs_inode *ni;
        ntfs_volume *vol;
        run_list_element *rl;
        struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
        unsigned int vcn_ofs, block_start, block_end, blocksize;
        int err;
        BOOL is_retry;
        unsigned char blocksize_bits;

        vi = page->mapping->host;
        ni = NTFS_I(vi);
        vol = ni->vol;

        ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
                        "0x%lx, from = %u, to = %u.", vi->i_ino, ni->type,
                        page->index, from, to);

        BUG_ON(!NInoNonResident(ni));
        BUG_ON(NInoMstProtected(ni));

        blocksize_bits = vi->i_blkbits;
        blocksize = 1 << blocksize_bits;

        /*
         * create_empty_buffers() will create uptodate/dirty buffers if the
         * page is uptodate/dirty.
         */
        if (!page_has_buffers(page))
                create_empty_buffers(page, blocksize, 0);
        bh = head = page_buffers(page);
        if (unlikely(!bh))
                return -ENOMEM;

        /* The first block in the page. */
        block = page->index << (PAGE_CACHE_SHIFT - blocksize_bits);

        /*
         * The first out of bounds block for the allocated size. No need to
         * round up as allocated_size is in multiples of cluster size and the
         * minimum cluster size is 512 bytes, which is equal to the smallest
         * blocksize.
         */
        ablock = ni->allocated_size >> blocksize_bits;

        /* The last (fully or partially) initialized block. */
        iblock = ni->initialized_size >> blocksize_bits;

        /* Loop through all the buffers in the page. */
        block_start = 0;
        rl = NULL;
        err = 0;
        do {
                block_end = block_start + blocksize;
                /*
                 * If buffer @bh is outside the write, just mark it uptodate
                 * if the page is uptodate and continue with the next buffer.
                 */
                if (block_end <= from || block_start >= to) {
                        if (PageUptodate(page)) {
                                if (!buffer_uptodate(bh))
                                        set_buffer_uptodate(bh);
                        }
                        continue;
                }
                /*
                 * @bh is at least partially being written to.
                 * Make sure it is not marked as new.
                 */
                //if (buffer_new(bh))
                //      clear_buffer_new(bh);

                if (block >= ablock) {
                        // TODO: block is above allocated_size, need to
                        // allocate it. Best done in one go to accommodate not
                        // only block but all above blocks up to and including:
                        // ((page->index << PAGE_CACHE_SHIFT) + to + blocksize
                        // - 1) >> blocksize_bits. Obviously will need to round
                        // up to next cluster boundary, too. This should be
                        // done with a helper function, so it can be reused.
                        ntfs_error(vol->sb, "Writing beyond allocated size "
                                        "is not supported yet. Sorry.");
                        err = -EOPNOTSUPP;
                        goto err_out;
                        // Need to update ablock.
                        // Need to set_buffer_new() on all block bhs that are
                        // newly allocated.
                }
                /*
                 * Now we have enough allocated size to fulfill the whole
                 * request, i.e. block < ablock is true.
                 */
                if (unlikely((block >= iblock) &&
                                (ni->initialized_size < vi->i_size))) {
                        /*
                         * If this page is fully outside initialized size, zero
                         * out all pages between the current initialized size
                         * and the current page. Just use ntfs_readpage() to do
                         * the zeroing transparently.
                         */
                        if (block > iblock) {
                                // TODO:
                                // For each page do:
                                // - read_cache_page()
                                // Again for each page do:
                                // - wait_on_page_locked()
                                // - Check (PageUptodate(page) &&
                                //                      !PageError(page))
                                // Update initialized size in the attribute and
                                // in the inode.
                                // Again, for each page do:
                                //      __set_page_dirty_buffers();
                                // page_cache_release()
                                // We don't need to wait on the writes.
                                // Update iblock.
                        }
                        /*
                         * The current page straddles initialized size. Zero
                         * all non-uptodate buffers and set them uptodate (and
                         * dirty?). Note, there aren't any non-uptodate buffers
                         * if the page is uptodate.
                         * FIXME: For an uptodate page, the buffers may need to
                         * be written out because they were not initialized on
                         * disk before.
                         */
                        if (!PageUptodate(page)) {
                                // TODO:
                                // Zero any non-uptodate buffers up to i_size.
                                // Set them uptodate and dirty.
                        }
                        // TODO:
                        // Update initialized size in the attribute and in the
                        // inode (up to i_size).
                        // Update iblock.
                        // FIXME: This is inefficient. Try to batch the two
                        // size changes to happen in one go.
                        ntfs_error(vol->sb, "Writing beyond initialized size "
                                        "is not supported yet. Sorry.");
                        err = -EOPNOTSUPP;
                        goto err_out;
                        // Do NOT set_buffer_new() BUT DO clear buffer range
                        // outside write request range.
                        // set_buffer_uptodate() on complete buffers as well as
                        // set_buffer_dirty().
                }

1156                 /* Need to map unmapped buffers. */
1157                 if (!buffer_mapped(bh)) {
1158                         /* Unmapped buffer. Need to map it. */
1159                         bh->b_bdev = vol->sb->s_bdev;
1160
1161                         /* Convert block into corresponding vcn and offset. */
1162                         vcn = (VCN)block << blocksize_bits >>
1163                                         vol->cluster_size_bits;
1164                         vcn_ofs = ((VCN)block << blocksize_bits) &
1165                                         vol->cluster_size_mask;
1166
1167                         is_retry = FALSE;
1168                         if (!rl) {
1169 lock_retry_remap:
1170                                 down_read(&ni->run_list.lock);
1171                                 rl = ni->run_list.rl;
1172                         }
1173                         if (likely(rl != NULL)) {
1174                                 /* Seek to element containing target vcn. */
1175                                 while (rl->length && rl[1].vcn <= vcn)
1176                                         rl++;
1177                                 lcn = vcn_to_lcn(rl, vcn);
1178                         } else
1179                                 lcn = (LCN)LCN_RL_NOT_MAPPED;
1180                         if (unlikely(lcn < 0)) {
1181                                 /*
1182                                  * We extended the attribute allocation above.
1183                                  * If we hit an ENOENT here it means that the
1184                                  * allocation was insufficient which is a bug.
1185                                  */
1186                                 BUG_ON(lcn == LCN_ENOENT);
1187
1188                                 /* It is a hole, need to instantiate it. */
1189                                 if (lcn == LCN_HOLE) {
1190                                         // TODO: Instantiate the hole.
1191                                         // clear_buffer_new(bh);
1192                                         // unmap_underlying_metadata(bh->b_bdev,
1193                                         //              bh->b_blocknr);
1194                                         // For non-uptodate buffers, need to
1195                                         // zero out the region outside the
1196                                         // request in this bh or all bhs,
1197                                         // depending on what we implemented
1198                                         // above.
1199                                         // Need to flush_dcache_page().
1200                                         // Or could use set_buffer_new()
1201                                         // instead?
1202                                         ntfs_error(vol->sb, "Writing into "
1203                                                         "sparse regions is "
1204                                                         "not supported yet. "
1205                                                         "Sorry.");
1206                                         err = -EOPNOTSUPP;
1207                                         goto err_out;
1208                                 } else if (!is_retry &&
1209                                                 lcn == LCN_RL_NOT_MAPPED) {
1210                                         is_retry = TRUE;
1211                                         /*
1212                                          * Attempt to map run list, dropping
1213                                          * lock for the duration.
1214                                          */
1215                                         up_read(&ni->run_list.lock);
1216                                         err = map_run_list(ni, vcn);
1217                                         if (likely(!err))
1218                                                 goto lock_retry_remap;
1219                                         rl = NULL;
1220                                 }
1221                                 /*
1222                                  * Failed to map the buffer, even after
1223                                  * retrying.
1224                                  */
1225                                 bh->b_blocknr = -1UL;
1226                                 ntfs_error(vol->sb, "vcn_to_lcn(vcn = 0x%llx) "
1227                                                 "failed with error code "
1228                                                 "0x%llx%s.",
1229                                                 (unsigned long long)vcn,
1230                                                 (unsigned long long)-lcn,
1231                                                 is_retry ? " even after "
1232                                                 "retrying" : "");
1233                                 // FIXME: Depending on vol->on_errors, do
1234                                 // something.
1235                                 if (!err)
1236                                         err = -EIO;
1237                                 goto err_out;
1238                         }
1239                         /* We now have a successful remap, i.e. lcn >= 0. */
1240
1241                         /* Setup buffer head to correct block. */
1242                         bh->b_blocknr = ((lcn << vol->cluster_size_bits)
1243                                         + vcn_ofs) >> blocksize_bits;
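                        /*
                         * I.e. the inverse of the vcn calculation above: the
                         * lcn plus the byte offset into the cluster converted
                         * back to a device block number.
                         */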
1244                         set_buffer_mapped(bh);
1245
1246                         // FIXME: Something analogous to this is needed for
1247                         // each newly allocated block, i.e. BH_New.
1248                         // FIXME: Might need to take this out of the
1249                         // if (!buffer_mapped(bh)) {}, depending on how we
1250                         // implement things during the allocated_size and
1251                         // initialized_size extension code above.
1252                         if (buffer_new(bh)) {
1253                                 clear_buffer_new(bh);
1254                                 unmap_underlying_metadata(bh->b_bdev,
1255                                                 bh->b_blocknr);
1256                                 if (PageUptodate(page)) {
1257                                         set_buffer_uptodate(bh);
1258                                         continue;
1259                                 }
1260                                 /*
1261                                  * Page is _not_ uptodate, zero surrounding
1262                                  * region. NOTE: This is how we decide
1263                                  * whether to zero or not!
1264                                  */
1265                                 if (block_end > to || block_start < from) {
1266                                         void *kaddr;
1267
1268                                         kaddr = kmap_atomic(page, KM_USER0);
1269                                         if (block_end > to)
1270                                                 memset(kaddr + to, 0,
1271                                                                 block_end - to);
1272                                         if (block_start < from)
1273                                                 memset(kaddr + block_start, 0,
1274                                                                 from -
1275                                                                 block_start);
1276                                         flush_dcache_page(page);
1277                                         kunmap_atomic(kaddr, KM_USER0);
1278                                 }
1279                                 continue;
1280                         }
1281                 }
1282                 /* @bh is mapped, set it uptodate if the page is uptodate. */
1283                 if (PageUptodate(page)) {
1284                         if (!buffer_uptodate(bh))
1285                                 set_buffer_uptodate(bh);
1286                         continue;
1287                 }
1288                 /*
1289                  * The page is not uptodate. The buffer is mapped. If it is not
1290                  * uptodate, and it is only partially being written to, we need
1291                  * to read the buffer in before the write, i.e. right now.
1292                  */
1293                 if (!buffer_uptodate(bh) &&
1294                                 (block_start < from || block_end > to)) {
1295                         ll_rw_block(READ, 1, &bh);
1296                         *wait_bh++ = bh;
1297                 }
1298         } while (block++, block_start = block_end,
1299                         (bh = bh->b_this_page) != head);
1300
1301         /* Release the lock if we took it. */
1302         if (rl) {
1303                 up_read(&ni->run_list.lock);
1304                 rl = NULL;
1305         }
1306
1307         /* If we issued read requests, let them complete. */
1308         while (wait_bh > wait) {
1309                 wait_on_buffer(*--wait_bh);
1310                 if (!buffer_uptodate(*wait_bh))
1311                         return -EIO;
1312         }
1313
1314         ntfs_debug("Done.");
1315         return 0;
1316 err_out:
1317         /*
1318          * Zero out any newly allocated blocks to avoid exposing stale data.
1319          * If BH_New is set, we know that the block was newly allocated in the
1320          * above loop.
1321          * FIXME: What about initialized_size increments? Have we done all the
1322          * required zeroing above? If not, this error handling is broken, and
1323          * in particular the if (block_end <= from) check is completely bogus.
1324          */
1325         bh = head;
1326         block_start = 0;
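        /*
         * Note, @is_retry is (ab)used below to remember whether any buffers
         * were zeroed, so we know if the page needs a dcache flush at the end.
         */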
1327         is_retry = FALSE;
1328         do {
1329                 block_end = block_start + blocksize;
1330                 if (block_end <= from)
1331                         continue;
1332                 if (block_start >= to)
1333                         break;
1334                 if (buffer_new(bh)) {
1335                         void *kaddr;
1336
1337                         clear_buffer_new(bh);
1338                         kaddr = kmap_atomic(page, KM_USER0);
1339                         memset(kaddr + block_start, 0, bh->b_size);
1340                         kunmap_atomic(kaddr, KM_USER0);
1341                         set_buffer_uptodate(bh);
1342                         mark_buffer_dirty(bh);
1343                         is_retry = TRUE;
1344                 }
1345         } while (block_start = block_end, (bh = bh->b_this_page) != head);
1346         if (is_retry)
1347                 flush_dcache_page(page);
1348         if (rl)
1349                 up_read(&ni->run_list.lock);
1350         return err;
1351 }
1352
1353 /**
1354  * ntfs_prepare_write - prepare a page for receiving data
1355  *
1356  * This is called from generic_file_write() with i_sem held on the inode
1357  * (@page->mapping->host). The @page is locked and kmap()ped so page_address()
1358  * can simply be used. The source data has not yet been copied into the @page.
1359  *
1360  * Need to extend the attribute/fill in holes if necessary, create blocks and
1361  * make partially overwritten blocks uptodate,
1362  * make partially overwritten blocks uptodate.
1363  * i_size is not to be modified yet.
1364  *
1365  * Return 0 on success or -errno on error.
1366  *
1367  * Should be using block_prepare_write() [support for sparse files] or
1368  * cont_prepare_write() [no support for sparse files]. Can't do that due to
1369  * ntfs specifics but can look at them for implementation guidance.
1370  *
1371  * Note: In the range, @from is inclusive and @to is exclusive, i.e. @from is
1372  * the first byte in the page that will be written to and @to is the first byte
1373  * after the last byte that will be written to.
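 * For example, a write covering bytes 100 to 199 of a page has @from == 100
 * and @to == 200.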
1374  */
1375 static int ntfs_prepare_write(struct file *file, struct page *page,
1376                 unsigned from, unsigned to)
1377 {
1378         struct inode *vi = page->mapping->host;
1379         ntfs_inode   *ni = NTFS_I(vi);
1380
1381         ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
1382                         "0x%lx, from = %u, to = %u.", vi->i_ino, ni->type,
1383                         page->index, from, to);
1384
1385         BUG_ON(!PageLocked(page));
1386         BUG_ON(from > PAGE_CACHE_SIZE);
1387         BUG_ON(to > PAGE_CACHE_SIZE);
1388         BUG_ON(from > to);
1389
1390         if (NInoNonResident(ni)) {
1391                 /*
1392                  * Only unnamed $DATA attributes can be compressed, encrypted,
1393                  * and/or sparse.
1394                  */
1395                 if (ni->type == AT_DATA && !ni->name_len) {
1396                         /* If file is encrypted, deny access, just like NT4. */
1397                         if (NInoEncrypted(ni)) {
1398                                 ntfs_debug("Denying write access to encrypted "
1399                                                 "file.");
1400                                 return -EACCES;
1401                         }
1402                         /* Compressed data streams are handled in compress.c. */
1403                         if (NInoCompressed(ni)) {
1404                                 // TODO: Implement and replace this check with
1405                                 // return ntfs_write_compressed_block(page);
1406                                 ntfs_error(vi->i_sb, "Writing to compressed "
1407                                                 "files is not supported yet. "
1408                                                 "Sorry.");
1409                                 return -EOPNOTSUPP;
1410                         }
1411                         // TODO: Implement and remove this check.
1412                         if (NInoSparse(ni)) {
1413                                 ntfs_error(vi->i_sb, "Writing to sparse files "
1414                                                 "is not supported yet. Sorry.");
1415                                 return -EOPNOTSUPP;
1416                         }
1417                 }
1418
1419                 // TODO: Implement and remove this check.
1420                 if (NInoMstProtected(ni)) {
1421                         ntfs_error(vi->i_sb, "Writing to MST protected "
1422                                         "attributes is not supported yet. "
1423                                         "Sorry.");
1424                         return -EOPNOTSUPP;
1425                 }
1426
1427                 /* Normal data stream. */
1428                 return ntfs_prepare_nonresident_write(page, from, to);
1429         }
1430
1431         /*
1432          * Attribute is resident, implying it is not compressed, encrypted, or
1433          * mst protected.
1434          */
1435         BUG_ON(page_has_buffers(page));
1436
1437         /* Do we need to resize the attribute? */
1438         if (((s64)page->index << PAGE_CACHE_SHIFT) + to > vi->i_size) {
1439                 // TODO: Implement resize...
1440                 ntfs_error(vi->i_sb, "Writing beyond the existing file size is "
1441                                 "not supported yet. Sorry.");
1442                 return -EOPNOTSUPP;
1443         }
1444
1445         /*
1446          * Because resident attributes are handled by memcpy() to/from the
1447          * corresponding MFT record, and because this form of i/o is byte
1448          * aligned rather than block aligned, there is no need to bring the
1449          * page uptodate here as in the non-resident case where we need to
1450          * bring the buffers straddled by the write uptodate before
1451          * generic_file_write() does the copying from userspace.
1452          *
1453          * We thus defer bringing the page region outside the written-to region
1454          * uptodate until ntfs_commit_write(). The reason for doing this
1455          * is that we save one round of:
1456          *      map_mft_record(), get_attr_search_ctx(), lookup_attr(),
1457          *      kmap_atomic(), kunmap_atomic(), put_attr_search_ctx(),
1458          *      unmap_mft_record(),
1459          * which is obviously a very worthwhile saving.
1460          *
1461          * Thus we just return success now...
1462          */
1463         ntfs_debug("Done.");
1464         return 0;
1465 }
1466
1467 /*
1468  * NOTES: There is a disparity between the apparent need to extend the
1469  * attribute in prepare write and updating i_size only in commit write.
1470  * Need to make sure i_sem protection is sufficient. And if not will need to
1471  * handle this in some way or another.
1472  */
1473
1474 /**
1475  * ntfs_commit_nonresident_write - commit data written to a non-resident attribute
1476  *
1477  */
1478 static int ntfs_commit_nonresident_write(struct page *page,
1479                 unsigned from, unsigned to)
1480 {
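        /* Byte offset in the file of the end of this write. */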
1481         s64 pos = ((s64)page->index << PAGE_CACHE_SHIFT) + to;
1482         struct inode *vi;
1483         struct buffer_head *bh, *head;
1484         unsigned int block_start, block_end, blocksize;
1485         BOOL partial;
1486
1487         vi = page->mapping->host;
1488
1489         ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
1490                         "0x%lx, from = %u, to = %u.", vi->i_ino,
1491                         NTFS_I(vi)->type, page->index, from, to);
1492
1493         blocksize = 1 << vi->i_blkbits;
1494
1495         // FIXME: We need a whole slew of special cases in here for MST
1496         // protected attributes for example. For compressed files, too...
1497         // For now, we know ntfs_prepare_write() would have failed in all the
1498         // cases which we would have to special case, so we cannot get here,
1499         // and this is just an unrolled copy of generic_commit_write() at present.
1500
1501         bh = head = page_buffers(page);
1502         block_start = 0;
1503         partial = FALSE;
1504         do {
1505                 block_end = block_start + blocksize;
1506                 if (block_end <= from || block_start >= to) {
1507                         if (!buffer_uptodate(bh))
1508                                 partial = TRUE;
1509                 } else {
1510                         set_buffer_uptodate(bh);
1511                         mark_buffer_dirty(bh);
1512                 }
1513         } while (block_start = block_end, (bh = bh->b_this_page) != head);
1514
1515         /*
1516          * If this is a partial write which happened to make all buffers
1517          * uptodate then we can optimize away a bogus ->readpage() for the next
1518          * read(). Here we 'discover' whether the page went uptodate as a
1519          * result of this (potentially partial) write.
1520          */
1521         if (!partial)
1522                 SetPageUptodate(page);
1523
1524         /*
1525          * Not convinced about this at all. See disparity comment above. For
1526          * now we know ntfs_prepare_write() would have failed in the write
1527          * exceeds i_size case, so this will never trigger which is fine.
1528          */
1529         if (pos > vi->i_size) {
1530                 ntfs_error(vi->i_sb, "Writing beyond the existing file size is "
1531                                 "not supported yet. Sorry.");
1532                 return -EOPNOTSUPP;
1533                 // vi->i_size = pos;
1534                 // mark_inode_dirty(vi);
1535         }
1536         ntfs_debug("Done.");
1537         return 0;
1538 }
1539
1540 /**
1541  * ntfs_commit_write - commit the received data
1542  *
1543  * This is called from generic_file_write() with i_sem held on the inode
1544  * (@page->mapping->host). The @page is locked and kmap()ped so page_address()
1545  * can simply be used. The source data has already been copied into the @page.
1546  *
1547  * Need to mark modified blocks dirty so they get written out later when
1548  * ntfs_writepage() is invoked by the VM.
1549  *
1550  * Return 0 on success or -errno on error.
1551  *
1552  * Should be using generic_commit_write(). This marks buffers uptodate and
1553  * dirty, sets the page uptodate if all buffers in the page are uptodate, and
1554  * updates i_size if the end of io is beyond i_size. In that case, it also
1555  * marks the inode dirty. - We could still use this (obviously except for
1556  * NInoMstProtected() attributes, where we will need to duplicate the core code
1557  * because we need our own async_io completion handler) but we could just do
1558  * the i_size update in prepare write, when we resize the attribute. Then
1559  * we would avoid the i_size update and mark_inode_dirty() happening here.
1560  *
1561  * Can't use generic_commit_write() due to ntfs specialities but can look at
1562  * it for implementation guidance.
1563  *
1564  * If things have gone as outlined in ntfs_prepare_write(), then we do not
1565  * need to do any page content modifications here at all, except in the write
1566  * to resident attribute case, where we need to do the uptodate bringing here
1567  * which we combine with the copying into the mft record, which means we
1568  * need to map the mft record and find the attribute record in it only once.
1569  */
1570 static int ntfs_commit_write(struct file *file, struct page *page,
1571                 unsigned from, unsigned to)
1572 {
1573         s64 attr_pos;
1574         struct inode *vi;
1575         ntfs_inode *ni, *base_ni;
1576         char *kaddr, *kattr;
1577         attr_search_context *ctx;
1578         MFT_RECORD *m;
1579         u32 attr_len, bytes;
1580         int err;
1581
1582         vi = page->mapping->host;
1583         ni = NTFS_I(vi);
1584
1585         ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
1586                         "0x%lx, from = %u, to = %u.", vi->i_ino, ni->type,
1587                         page->index, from, to);
1588
1589         if (NInoNonResident(ni)) {
1590                 /*
1591                  * Only unnamed $DATA attributes can be compressed, encrypted,
1592                  * and/or sparse.
1593                  */
1594                 if (ni->type == AT_DATA && !ni->name_len) {
1595                         /* If file is encrypted, deny access, just like NT4. */
1596                         if (NInoEncrypted(ni)) {
1597                                 // Should never get here!
1598                                 ntfs_debug("Denying write access to encrypted "
1599                                                 "file.");
1600                                 return -EACCES;
1601                         }
1602                         /* Compressed data streams are handled in compress.c. */
1603                         if (NInoCompressed(ni)) {
1604                                 // TODO: Implement and replace this check with
1605                                 // return ntfs_write_compressed_block(page);
1606                                 // Should never get here!
1607                                 ntfs_error(vi->i_sb, "Writing to compressed "
1608                                                 "files is not supported yet. "
1609                                                 "Sorry.");
1610                                 return -EOPNOTSUPP;
1611                         }
1612                         // TODO: Implement and remove this check.
1613                         if (NInoSparse(ni)) {
1614                                 // Should never get here!
1615                                 ntfs_error(vi->i_sb, "Writing to sparse files "
1616                                                 "is not supported yet. Sorry.");
1617                                 return -EOPNOTSUPP;
1618                         }
1619                 }
1620
1621                 // TODO: Implement and remove this check.
1622                 if (NInoMstProtected(ni)) {
1623                         // Should never get here!
1624                         ntfs_error(vi->i_sb, "Writing to MST protected "
1625                                         "attributes is not supported yet. "
1626                                         "Sorry.");
1627                         return -EOPNOTSUPP;
1628                 }
1629
1630                 /* Normal data stream. */
1631                 return ntfs_commit_nonresident_write(page, from, to);
1632         }
1633
1634         /*
1635          * Attribute is resident, implying it is not compressed, encrypted, or
1636          * mst protected.
1637          */
1638
1639         /* Do we need to resize the attribute? */
1640         if (((s64)page->index << PAGE_CACHE_SHIFT) + to > vi->i_size) {
1641                 // TODO: Implement resize...
1642                 // pos = ((s64)page->index << PAGE_CACHE_SHIFT) + to;
1643                 // vi->i_size = pos;
1644                 // mark_inode_dirty(vi);
1645                 // Should never get here!
1646                 ntfs_error(vi->i_sb, "Writing beyond the existing file size is "
1647                                 "not supported yet. Sorry.");
1648                 return -EOPNOTSUPP;
1649         }
1650
1651         if (!NInoAttr(ni))
1652                 base_ni = ni;
1653         else
1654                 base_ni = ni->ext.base_ntfs_ino;
1655
1656         /* Map, pin, and lock the mft record. */
1657         m = map_mft_record(base_ni);
1658         if (unlikely(IS_ERR(m))) {
1659                 err = PTR_ERR(m);
1660                 m = NULL;
1661                 ctx = NULL;
1662                 goto err_out;
1663         }
1664         ctx = get_attr_search_ctx(base_ni, m);
1665         if (unlikely(!ctx)) {
1666                 err = -ENOMEM;
1667                 goto err_out;
1668         }
1669         if (unlikely(!lookup_attr(ni->type, ni->name, ni->name_len,
1670                         IGNORE_CASE, 0, NULL, 0, ctx))) {
1671                 err = -ENOENT;
1672                 goto err_out;
1673         }
1674
1675         /* Starting position of the page within the attribute value. */
1676         attr_pos = (s64)page->index << PAGE_CACHE_SHIFT;
1677
1678         /* The total length of the attribute value. */
1679         attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
1680
1681         if (unlikely(vi->i_size != attr_len)) {
1682                 ntfs_error(vi->i_sb, "BUG()! i_size (0x%llx) doesn't match "
1683                                 "attr_len (0x%x). Aborting write.",
1684                                 (unsigned long long)vi->i_size, attr_len);
1685                 err = -EIO;
1686                 goto err_out;
1687         }
1688         if (unlikely(attr_pos >= attr_len)) {
1689                 ntfs_error(vi->i_sb, "BUG()! attr_pos (0x%llx) >= attr_len "
1690                                 "(0x%x). Aborting write.",
1691                                 (unsigned long long)attr_pos, attr_len);
1692                 err = -EIO;
1693                 goto err_out;
1694         }
1695
1696         bytes = attr_len - attr_pos;
1697         if (unlikely(bytes > PAGE_CACHE_SIZE))
1698                 bytes = PAGE_CACHE_SIZE;
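        /*
         * @bytes is now the number of attribute value bytes contained in this
         * page, i.e. the maximum amount we may legitimately copy between the
         * page and the mft record below.
         */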
1699
1700         /*
1701          * Calculate the address of the attribute value corresponding to the
1702          * beginning of the current data @page.
1703          */
1704         kattr = (char*)ctx->attr + le16_to_cpu(
1705                         ctx->attr->data.resident.value_offset) + attr_pos;
1706
1707         kaddr = kmap_atomic(page, KM_USER0);
1708
1709         /* Copy the received data from the page to the mft record. */
1710         memcpy(kattr + from, kaddr + from, to - from);
1711         flush_dcache_mft_record_page(ctx->ntfs_ino);
1712
1713         if (!PageUptodate(page)) {
1714                 /*
1715                  * Bring the out of bounds area(s) uptodate by copying data
1716                  * from the mft record to the page.
1717                  */
1718                 if (from > 0)
1719                         memcpy(kaddr, kattr, from);
1720                 if (to < bytes)
1721                         memcpy(kaddr + to, kattr + to, bytes - to);
1722
1723                 /* Zero the region outside the end of the attribute value. */
1724                 if (likely(bytes < PAGE_CACHE_SIZE))
1725                         memset(kaddr + bytes, 0, PAGE_CACHE_SIZE - bytes);
1726
1727                 /*
1728                  * The probability of not having done any of the above is
1729                  * extremely small, so we just flush unconditionally.
1730                  */
1731                 flush_dcache_page(page);
1732                 SetPageUptodate(page);
1733         }
1734         kunmap_atomic(kaddr, KM_USER0);
1735
1736         /* Mark the mft record dirty, so it gets written back. */
1737         mark_mft_record_dirty(ctx->ntfs_ino);
1738
1739         put_attr_search_ctx(ctx);
1740         unmap_mft_record(base_ni);
1741         ntfs_debug("Done.");
1742         return 0;
1743 err_out:
1744         if (err == -ENOMEM) {
1745                 ntfs_warning(vi->i_sb, "Error allocating memory required to "
1746                                 "commit the write.");
1747                 if (PageUptodate(page)) {
1748                         ntfs_warning(vi->i_sb, "Page is uptodate, setting "
1749                                         "dirty so the write will be retried "
1750                                         "later on by the VM.");
1751                         /*
1752                          * Put the page on mapping->dirty_pages, but leave its
1753                          * buffer's dirty state as-is.
1754                          */
1755                         __set_page_dirty_nobuffers(page);
1756                         err = 0;
1757                 } else
1758                         ntfs_error(vi->i_sb, "Page is not uptodate. Written "
1759                                         "data has been lost. )-:");
1760         } else {
1761                 ntfs_error(vi->i_sb, "Resident attribute write failed with "
1762                                 "error %i. Setting page error flag.", -err);
1763                 SetPageError(page);
1764         }
1765         if (ctx)
1766                 put_attr_search_ctx(ctx);
1767         if (m)
1768                 unmap_mft_record(base_ni);
1769         return err;
1770 }
1771
1772 #endif  /* NTFS_RW */
1773
1774 /**
1775  * ntfs_aops - general address space operations for inodes and attributes
1776  */
1777 struct address_space_operations ntfs_aops = {
1778         .readpage       = ntfs_readpage,        /* Fill page with data. */
1779         .sync_page      = block_sync_page,      /* Currently, just unplugs the
1780                                                    disk request queue. */
1781 #ifdef NTFS_RW
1782         .writepage      = ntfs_writepage,       /* Write dirty page to disk. */
1783         .prepare_write  = ntfs_prepare_write,   /* Prepare page and buffers
1784                                                    ready to receive data. */
1785         .commit_write   = ntfs_commit_write,    /* Commit received data. */
1786 #endif
1787 };
1788
1789 /**
1790  * ntfs_mst_aops - general address space operations for mst protected inodes
1791  *                 and attributes
1792  */
1793 struct address_space_operations ntfs_mst_aops = {
1794         .readpage       = ntfs_readpage,        /* Fill page with data. */
1795         .sync_page      = block_sync_page,      /* Currently, just unplugs the
1796                                                    disk request queue. */
1797 };