#include <linux/security.h>
#include <linux/syscalls.h>
/*
- * This is needed for the following functions:
- * - try_to_release_page
- * - block_invalidatepage
- * - generic_osync_inode
- *
* FIXME: remove all knowledge of the buffer layer from the core VM
*/
#include <linux/buffer_head.h> /* for generic_osync_inode */
{
struct address_space *mapping = page->mapping;
- if (unlikely(!PageLocked(page)))
- PAGE_BUG(page);
+ BUG_ON(!PageLocked(page));
- spin_lock_irq(&mapping->tree_lock);
+ write_lock_irq(&mapping->tree_lock);
__remove_from_page_cache(page);
- spin_unlock_irq(&mapping->tree_lock);
+ write_unlock_irq(&mapping->tree_lock);
}
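/*
 * Locking note: mapping->tree_lock is a rwlock here. Paths that modify
 * the radix tree, like remove_from_page_cache() above and the
 * radix_tree_insert() caller below, must hold it for write; the pure
 * lookups further down (find_get_page() and friends) take it for read
 * and may run concurrently with one another.
 */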
static int sync_page(void *word)
page = container_of((page_flags_t *)word, struct page, flags);
/*
- * FIXME, fercrissake. What is this barrier here for?
+ * page_mapping() is being called without PG_locked held.
+ * Some knowledge of the state and use of the page is used to
+ * reduce the requirements down to a memory barrier.
+ * The danger here is of a stale page_mapping() return value
+ * indicating a struct address_space different from the one it's
+ * associated with when it is associated with one.
+ * After smp_mb(), it's either the correct page_mapping() for
+ * the page, or an old page_mapping() and the page's own
+ * page_mapping() has gone NULL.
+ * The ->sync_page() address_space operation must tolerate
+ * page_mapping() going NULL. By an amazing coincidence,
+ * this comes about because none of the users of the page
+ * in the ->sync_page() methods make essential use of the
+ * page_mapping(), merely passing the page down to the backing
+ * device's unplug functions when it's non-NULL, which in turn
+ * ignore it for all cases but swap, where only page->private is
+ * of interest. When page_mapping() does go NULL, the entire
+ * call stack gracefully ignores the page and returns.
+ * -- wli
*/
smp_mb();
mapping = page_mapping(page);
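/*
 * Illustrative sketch, not part of this patch: a ->sync_page() method
 * that tolerates page_mapping() going NULL, exactly as the comment above
 * requires. It is modeled on block_sync_page() in fs/buffer.c and needs
 * linux/blkdev.h for blk_run_backing_dev(); the name example_sync_page
 * is hypothetical.
 */
static void example_sync_page(struct page *page)
{
	struct address_space *mapping;

	smp_mb();
	mapping = page_mapping(page);
	if (mapping)
		blk_run_backing_dev(mapping->backing_dev_info, page);
}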
/**
* __filemap_fdatawrite_range - start writeback against all of a mapping's
* dirty pages that lie within the byte offsets <start, end>
- * @mapping: address space structure to write
- * @start: offset in bytes where the range starts
- * @end : offset in bytes where the range ends
+ * @mapping: address space structure to write
+ * @start: offset in bytes where the range starts
+ * @end: offset in bytes where the range ends
+ * @sync_mode: enable synchronous operation
*
* If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
* opposed to a regular memory cleansing writeback. The difference between
.end = end,
};
- if (mapping->backing_dev_info->memory_backed)
+ if (!mapping_cap_writeback_dirty(mapping))
return 0;
ret = do_writepages(mapping, &wbc);
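/*
 * Illustrative only: the two flavors layered on this helper, roughly as
 * the filemap_fdatawrite() and filemap_flush() wrappers elsewhere in
 * this file do it. WB_SYNC_ALL gives the "data integrity" behaviour
 * described above, WB_SYNC_NONE the opportunistic memory-cleansing one;
 * start == end == 0 is assumed here to mean "the whole file", per the
 * writeback convention of this era. The example_* names are hypothetical.
 */
int example_fdatawrite(struct address_space *mapping)
{
	return __filemap_fdatawrite_range(mapping, 0, 0, WB_SYNC_ALL);
}

int example_flush(struct address_space *mapping)
{
	return __filemap_fdatawrite_range(mapping, 0, 0, WB_SYNC_NONE);
}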
pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
int ret;
- if (mapping->backing_dev_info->memory_backed || !count)
+ if (!mapping_cap_writeback_dirty(mapping) || !count)
return 0;
ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
if (ret == 0) {
pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
int ret;
- if (mapping->backing_dev_info->memory_backed || !count)
+ if (!mapping_cap_writeback_dirty(mapping) || !count)
return 0;
ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
if (ret == 0)
return retval;
}
+int filemap_write_and_wait_range(struct address_space *mapping,
+ loff_t lstart, loff_t lend)
+{
+ int retval = 0;
+
+ if (mapping->nrpages) {
+ retval = __filemap_fdatawrite_range(mapping, lstart, lend,
+ WB_SYNC_ALL);
+ if (retval == 0)
+ retval = wait_on_page_writeback_range(mapping,
+ lstart >> PAGE_CACHE_SHIFT,
+ lend >> PAGE_CACHE_SHIFT);
+ }
+ return retval;
+}
+
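/*
 * Illustrative only: a caller flushing just the byte range it is about
 * to overwrite with the new helper, e.g. ahead of a ranged direct write.
 * flush_write_range() is a hypothetical wrapper, not part of this patch;
 * note the helper takes inclusive byte offsets, hence pos + count - 1.
 */
static int flush_write_range(struct address_space *mapping,
			     loff_t pos, size_t count)
{
	if (!count)
		return 0;
	return filemap_write_and_wait_range(mapping, pos, pos + count - 1);
}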
/*
* This function is used to add newly allocated pagecache pages:
* the page is new, so we can just run SetPageLocked() against it.
int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error == 0) {
- spin_lock_irq(&mapping->tree_lock);
+ write_lock_irq(&mapping->tree_lock);
error = radix_tree_insert(&mapping->page_tree, offset, page);
if (!error) {
page_cache_get(page);
mapping->nrpages++;
pagecache_acct(1);
}
- spin_unlock_irq(&mapping->tree_lock);
+ write_unlock_irq(&mapping->tree_lock);
radix_tree_preload_end();
}
return error;
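/*
 * Illustrative only, modeled on page_cache_read() in this file: the
 * typical caller allocates a fresh page, adds it to the pagecache while
 * nobody else can see it, and only then starts I/O. example_read_page
 * is a hypothetical name; -EEXIST means somebody raced us and the page
 * is already present, which is not an error for the caller.
 */
static int example_read_page(struct file *file, struct address_space *mapping,
			     unsigned long offset)
{
	struct page *page = page_cache_alloc_cold(mapping);
	int error;

	if (!page)
		return -ENOMEM;
	error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
	if (error == 0) {
		error = mapping->a_ops->readpage(file, page);
		page_cache_release(page);
		return error;
	}
	page_cache_release(page);
	return error == -EEXIST ? 0 : error;
}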
{
struct page *page;
- spin_lock_irq(&mapping->tree_lock);
+ read_lock_irq(&mapping->tree_lock);
page = radix_tree_lookup(&mapping->page_tree, offset);
if (page)
page_cache_get(page);
- spin_unlock_irq(&mapping->tree_lock);
+ read_unlock_irq(&mapping->tree_lock);
return page;
}
{
struct page *page;
- spin_lock_irq(&mapping->tree_lock);
+ read_lock_irq(&mapping->tree_lock);
page = radix_tree_lookup(&mapping->page_tree, offset);
if (page && TestSetPageLocked(page))
page = NULL;
- spin_unlock_irq(&mapping->tree_lock);
+ read_unlock_irq(&mapping->tree_lock);
return page;
}
/**
* find_lock_page - locate, pin and lock a pagecache page
*
- * @mapping - the address_space to search
- * @offset - the page index
+ * @mapping: the address_space to search
+ * @offset: the page index
*
* Locates the desired pagecache page, locks it, increments its reference
* count and returns its address.
{
struct page *page;
- spin_lock_irq(&mapping->tree_lock);
+ read_lock_irq(&mapping->tree_lock);
repeat:
page = radix_tree_lookup(&mapping->page_tree, offset);
if (page) {
page_cache_get(page);
if (TestSetPageLocked(page)) {
- spin_unlock_irq(&mapping->tree_lock);
+ read_unlock_irq(&mapping->tree_lock);
lock_page(page);
- spin_lock_irq(&mapping->tree_lock);
+ read_lock_irq(&mapping->tree_lock);
/* Has the page been truncated while we slept? */
if (page->mapping != mapping || page->index != offset) {
}
}
}
- spin_unlock_irq(&mapping->tree_lock);
+ read_unlock_irq(&mapping->tree_lock);
return page;
}
/**
* find_or_create_page - locate or add a pagecache page
*
- * @mapping - the page's address_space
- * @index - the page's index into the mapping
- * @gfp_mask - page allocation mode
+ * @mapping: the page's address_space
+ * @index: the page's index into the mapping
+ * @gfp_mask: page allocation mode
*
* Locates a page in the pagecache. If the page is not present, a new page
* is allocated using @gfp_mask and is added to the pagecache and to the VM's
unsigned int i;
unsigned int ret;
- spin_lock_irq(&mapping->tree_lock);
+ read_lock_irq(&mapping->tree_lock);
ret = radix_tree_gang_lookup(&mapping->page_tree,
(void **)pages, start, nr_pages);
for (i = 0; i < ret; i++)
page_cache_get(pages[i]);
- spin_unlock_irq(&mapping->tree_lock);
+ read_unlock_irq(&mapping->tree_lock);
return ret;
}
unsigned int i;
unsigned int ret;
- spin_lock_irq(&mapping->tree_lock);
+ read_lock_irq(&mapping->tree_lock);
ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
(void **)pages, *index, nr_pages, tag);
for (i = 0; i < ret; i++)
page_cache_get(pages[i]);
if (ret)
*index = pages[ret - 1]->index + 1;
- spin_unlock_irq(&mapping->tree_lock);
+ read_unlock_irq(&mapping->tree_lock);
return ret;
}
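/*
 * Illustrative only: the usual consumer pattern for the tagged gang
 * lookup above, in the style of the writeback walkers. *index is
 * advanced past the last page returned, so the loop simply repeats
 * until the lookup comes back empty; the caller must drop the
 * references the lookup took. example_walk_dirty is a hypothetical name.
 */
static void example_walk_dirty(struct address_space *mapping)
{
	struct page *pages[16];
	pgoff_t index = 0;
	unsigned int i, nr;

	while ((nr = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY,
					16, pages)) != 0) {
		for (i = 0; i < nr; i++) {
			/* ... inspect or write back pages[i] here ... */
			page_cache_release(pages[i]);
		}
	}
}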
grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
{
struct page *page = find_get_page(mapping, index);
- int gfp_mask;
+ unsigned int gfp_mask;
if (page) {
if (!TestSetPageLocked(page))
unsigned long index;
unsigned long end_index;
unsigned long offset;
- unsigned long req_size;
+ unsigned long last_index;
unsigned long next_index;
unsigned long prev_index;
loff_t isize;
index = *ppos >> PAGE_CACHE_SHIFT;
next_index = index;
prev_index = ra.prev_page;
- req_size = (desc->count + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;
isize = i_size_read(inode);
end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
for (;;) {
struct page *page;
- unsigned long ret_size, nr, ret;
+ unsigned long nr, ret;
/* nr is the maximum number of bytes to copy from this page */
nr = PAGE_CACHE_SIZE;
nr = nr - offset;
cond_resched();
- if (index == next_index && req_size) {
- ret_size = page_cache_readahead(mapping, &ra,
- filp, index, req_size);
- next_index += ret_size;
- req_size -= ret_size;
- }
+ if (index == next_index)
+ next_index = page_cache_readahead(mapping, &ra, filp,
+ index, last_index - index);
find_page:
page = find_get_page(mapping, index);
if (pos < size) {
retval = generic_file_direct_IO(READ, iocb,
iov, pos, nr_segs);
- if (retval >= 0 && !is_sync_kiocb(iocb))
+ if (retval > 0 && !is_sync_kiocb(iocb))
retval = -EIOCBQUEUED;
if (retval > 0)
*ppos = pos + retval;
* it in the page cache, and handles the special cases reasonably without
* having a lot of duplicated code.
*/
-struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int *type)
+struct page *filemap_nopage(struct vm_area_struct *area,
+ unsigned long address, int *type)
{
int error;
struct file *file = area->vm_file;
struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
struct page *page;
- unsigned long size, pgoff, endoff;
+ unsigned long size, pgoff;
int did_readaround = 0, majmin = VM_FAULT_MINOR;
- pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
- endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
+ pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
retry_all:
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (VM_RandomReadHint(area))
goto no_cached_page;
- /*
- * The "size" of the file, as far as mmap is concerned, isn't bigger
- * than the mapping
- */
- if (size > endoff)
- size = endoff;
-
/*
* The readahead code wants to be told about each and every page
* so it can build and shrink its windows appropriately
* Ok, found a page in the page cache, now we need to check
* that it's up-to-date.
*/
- if (!PageUptodate(page))
+ if (!PageUptodate(page)) {
+ if (nonblock) {
+ page_cache_release(page);
+ return NULL;
+ }
goto page_not_uptodate;
+ }
success:
/*
buf = iov->iov_base + written;
else {
filemap_set_next_iovec(&cur_iov, &iov_base, written);
- buf = iov->iov_base + iov_base;
+ buf = cur_iov->iov_base + iov_base;
}
do {
unsigned long index;
unsigned long offset;
+ unsigned long maxlen;
size_t copied;
offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
* same page as we're writing to, without it being marked
* up-to-date.
*/
- fault_in_pages_readable(buf, bytes);
+ maxlen = cur_iov->iov_len - iov_base;
+ if (maxlen > bytes)
+ maxlen = bytes;
+ fault_in_pages_readable(buf, maxlen);
page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
if (!page) {
count -= status;
pos += status;
buf += status;
- if (unlikely(nr_segs > 1))
+ if (unlikely(nr_segs > 1)) {
filemap_set_next_iovec(&cur_iov,
&iov_base, status);
+ buf = cur_iov->iov_base + iov_base;
+ } else {
+ iov_base += status;
+ }
}
}
if (unlikely(copied != bytes))
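/*
 * Worked example for the iovec fixes above, with illustrative values:
 * given iov[] = { { base0, 3 }, { base1, 5 } } and status == 4,
 * filemap_set_next_iovec() leaves cur_iov == &iov[1] and iov_base == 1,
 * so the next copy source must be cur_iov->iov_base + iov_base, i.e.
 * base1 + 1. The old "iov->iov_base + iov_base" kept reading from the
 * already-consumed first segment.
 */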
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
ssize_t retval;
+ size_t write_len = 0;
/*
* If it's a write, unmap all mmappings of the file up-front. This
* will cause any pte dirty bits to be propagated into the pageframes
* for the subsequent filemap_write_and_wait().
*/
- if (rw == WRITE && mapping_mapped(mapping))
- unmap_mapping_range(mapping, 0, -1, 0);
+ if (rw == WRITE) {
+ write_len = iov_length(iov, nr_segs);
+ if (mapping_mapped(mapping))
+ unmap_mapping_range(mapping, offset, write_len, 0);
+ }
retval = filemap_write_and_wait(mapping);
if (retval == 0) {
retval = mapping->a_ops->direct_IO(rw, iocb, iov,
offset, nr_segs);
if (rw == WRITE && mapping->nrpages) {
- int err = invalidate_inode_pages2(mapping);
+ pgoff_t end = (offset + write_len - 1)
+ >> PAGE_CACHE_SHIFT;
+ int err = invalidate_inode_pages2_range(mapping,
+ offset >> PAGE_CACHE_SHIFT, end);
if (err)
retval = err;
}
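/*
 * Worked example for the ranged invalidation above, with illustrative
 * values: a 512-byte direct write at byte offset 4096 with 4K pages
 * gives offset >> PAGE_CACHE_SHIFT == 1 and
 * (offset + write_len - 1) >> PAGE_CACHE_SHIFT == 1, so only page
 * index 1 is dropped instead of every page in the mapping.
 */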