vserver 2.0 rc7
diff --git a/mm/filemap.c b/mm/filemap.c
index 4f2fb2c..4a2fee2 100644
 #include <linux/security.h>
 #include <linux/syscalls.h>
 /*
- * This is needed for the following functions:
- *  - try_to_release_page
- *  - block_invalidatepage
- *  - generic_osync_inode
- *
  * FIXME: remove all knowledge of the buffer layer from the core VM
  */
 #include <linux/buffer_head.h> /* for generic_osync_inode */
@@ -123,12 +118,11 @@ void remove_from_page_cache(struct page *page)
 {
        struct address_space *mapping = page->mapping;
 
-       if (unlikely(!PageLocked(page)))
-               PAGE_BUG(page);
+       BUG_ON(!PageLocked(page));
 
-       spin_lock_irq(&mapping->tree_lock);
+       write_lock_irq(&mapping->tree_lock);
        __remove_from_page_cache(page);
-       spin_unlock_irq(&mapping->tree_lock);
+       write_unlock_irq(&mapping->tree_lock);
 }
 
 static int sync_page(void *word)
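This hunk, and the matching find_*/add_to_page_cache hunks further down, convert mapping->tree_lock from a spinlock into a reader/writer lock: paths that modify the radix tree take it exclusively, while plain lookups only take it shared. A minimal sketch of the resulting pattern, using only calls that appear in this patch:

    /* writer side: modifying the page tree is still fully exclusive */
    write_lock_irq(&mapping->tree_lock);
    __remove_from_page_cache(page);
    write_unlock_irq(&mapping->tree_lock);

    /* reader side: lookups may now run concurrently with each other */
    read_lock_irq(&mapping->tree_lock);
    page = radix_tree_lookup(&mapping->page_tree, offset);
    if (page)
            page_cache_get(page);   /* pin the page before dropping the lock */
    read_unlock_irq(&mapping->tree_lock);

Note that the reference is still taken under the lock, so a looked-up page cannot be freed between the radix-tree lookup and page_cache_get().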
@@ -139,7 +133,25 @@ static int sync_page(void *word)
        page = container_of((page_flags_t *)word, struct page, flags);
 
        /*
-        * FIXME, fercrissake.  What is this barrier here for?
+        * page_mapping() is being called without PG_locked held.
+        * Some knowledge of the state and use of the page is used to
+        * reduce the requirements down to a memory barrier.
+        * The danger here is of a stale page_mapping() return value
+        * indicating a struct address_space different from the one it's
+        * associated with when it is associated with one.
+        * After smp_mb(), it's either the correct page_mapping() for
+        * the page, or an old page_mapping() and the page's own
+        * page_mapping() has gone NULL.
+        * The ->sync_page() address_space operation must tolerate
+        * page_mapping() going NULL. By an amazing coincidence,
+        * this comes about because none of the users of the page
+        * in the ->sync_page() methods make essential use of the
+        * page_mapping(), merely passing the page down to the backing
+        * device's unplug functions when it's non-NULL, which in turn
+        * ignore it for all cases but swap, where only page->private is
+        * of interest. When page_mapping() does go NULL, the entire
+        * call stack gracefully ignores the page and returns.
+        * -- wli
         */
        smp_mb();
        mapping = page_mapping(page);
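The new comment spells out why the smp_mb() is enough: sync_page() runs without PG_locked, so page_mapping() may go NULL underneath it, and everything it calls has to tolerate that. A hedged sketch of the shape such a caller takes (the function name here is made up for illustration):

    /* illustrative only: tolerate page_mapping() becoming NULL */
    static void example_sync_backing(struct page *page)
    {
            struct address_space *mapping;

            smp_mb();                       /* order against the page-flag update */
            mapping = page_mapping(page);   /* may legitimately return NULL */
            if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
                    mapping->a_ops->sync_page(page);
            /* if the mapping went away, simply do nothing */
    }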
@@ -152,9 +164,10 @@ static int sync_page(void *word)
 /**
  * filemap_fdatawrite_range - start writeback against all of a mapping's
  * dirty pages that lie within the byte offsets <start, end>
- * @mapping: address space structure to write
- * @start: offset in bytes where the range starts
- * @end : offset in bytes where the range ends
+ * @mapping:   address space structure to write
+ * @start:     offset in bytes where the range starts
+ * @end:       offset in bytes where the range ends
+ * @sync_mode: enable synchronous operation
  *
  * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
  * opposed to a regular memory cleansing writeback.  The difference between
@@ -172,7 +185,7 @@ static int __filemap_fdatawrite_range(struct address_space *mapping,
                .end = end,
        };
 
-       if (mapping->backing_dev_info->memory_backed)
+       if (!mapping_cap_writeback_dirty(mapping))
                return 0;
 
        ret = do_writepages(mapping, &wbc);
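The open-coded test of backing_dev_info->memory_backed is replaced here (and in the two sync_page_range hunks below) by mapping_cap_writeback_dirty(). As a rough sketch of what that capability test amounts to, assuming the BDI capabilities bitmask that replaced the memory_backed flag (this is not the literal header definition):

    /* assumption: BDI_CAP_NO_WRITEBACK marks backing devices that never write back */
    static inline int mapping_cap_writeback_dirty(struct address_space *mapping)
    {
            return !(mapping->backing_dev_info->capabilities & BDI_CAP_NO_WRITEBACK);
    }

Either way, the effect on this function is unchanged: mappings whose backing device cannot do writeback return 0 immediately.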
@@ -269,7 +282,7 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
        pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
        int ret;
 
-       if (mapping->backing_dev_info->memory_backed || !count)
+       if (!mapping_cap_writeback_dirty(mapping) || !count)
                return 0;
        ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
        if (ret == 0) {
@@ -295,7 +308,7 @@ int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
        pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
        int ret;
 
-       if (mapping->backing_dev_info->memory_backed || !count)
+       if (!mapping_cap_writeback_dirty(mapping) || !count)
                return 0;
        ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
        if (ret == 0)
@@ -336,6 +349,22 @@ int filemap_write_and_wait(struct address_space *mapping)
        return retval;
 }
 
+int filemap_write_and_wait_range(struct address_space *mapping,
+                                loff_t lstart, loff_t lend)
+{
+       int retval = 0;
+
+       if (mapping->nrpages) {
+               retval = __filemap_fdatawrite_range(mapping, lstart, lend,
+                                                   WB_SYNC_ALL);
+               if (retval == 0)
+                       retval = wait_on_page_writeback_range(mapping,
+                                                   lstart >> PAGE_CACHE_SHIFT,
+                                                   lend >> PAGE_CACHE_SHIFT);
+       }
+       return retval;
+}
+
 /*
  * This function is used to add newly allocated pagecache pages:
  * the page is new, so we can just run SetPageLocked() against it.
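filemap_write_and_wait_range() is new in this patch: it is filemap_write_and_wait() restricted to a byte range, pairing a WB_SYNC_ALL __filemap_fdatawrite_range() with wait_on_page_writeback_range(). A hypothetical caller that flushes only the region it is about to touch (pos and count are illustrative):

    /* hypothetical caller: write back and wait on [pos, pos + count - 1] only */
    err = filemap_write_and_wait_range(file->f_mapping, pos, pos + count - 1);
    if (err)
            return err;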
@@ -349,7 +378,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
        int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 
        if (error == 0) {
-               spin_lock_irq(&mapping->tree_lock);
+               write_lock_irq(&mapping->tree_lock);
                error = radix_tree_insert(&mapping->page_tree, offset, page);
                if (!error) {
                        page_cache_get(page);
@@ -359,7 +388,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
                        mapping->nrpages++;
                        pagecache_acct(1);
                }
-               spin_unlock_irq(&mapping->tree_lock);
+               write_unlock_irq(&mapping->tree_lock);
                radix_tree_preload_end();
        }
        return error;
@@ -472,11 +501,11 @@ struct page * find_get_page(struct address_space *mapping, unsigned long offset)
 {
        struct page *page;
 
-       spin_lock_irq(&mapping->tree_lock);
+       read_lock_irq(&mapping->tree_lock);
        page = radix_tree_lookup(&mapping->page_tree, offset);
        if (page)
                page_cache_get(page);
-       spin_unlock_irq(&mapping->tree_lock);
+       read_unlock_irq(&mapping->tree_lock);
        return page;
 }
 
@@ -489,11 +518,11 @@ struct page *find_trylock_page(struct address_space *mapping, unsigned long offs
 {
        struct page *page;
 
-       spin_lock_irq(&mapping->tree_lock);
+       read_lock_irq(&mapping->tree_lock);
        page = radix_tree_lookup(&mapping->page_tree, offset);
        if (page && TestSetPageLocked(page))
                page = NULL;
-       spin_unlock_irq(&mapping->tree_lock);
+       read_unlock_irq(&mapping->tree_lock);
        return page;
 }
 
@@ -502,8 +531,8 @@ EXPORT_SYMBOL(find_trylock_page);
 /**
  * find_lock_page - locate, pin and lock a pagecache page
  *
- * @mapping - the address_space to search
- * @offset - the page index
+ * @mapping: the address_space to search
+ * @offset: the page index
  *
  * Locates the desired pagecache page, locks it, increments its reference
  * count and returns its address.
@@ -515,15 +544,15 @@ struct page *find_lock_page(struct address_space *mapping,
 {
        struct page *page;
 
-       spin_lock_irq(&mapping->tree_lock);
+       read_lock_irq(&mapping->tree_lock);
 repeat:
        page = radix_tree_lookup(&mapping->page_tree, offset);
        if (page) {
                page_cache_get(page);
                if (TestSetPageLocked(page)) {
-                       spin_unlock_irq(&mapping->tree_lock);
+                       read_unlock_irq(&mapping->tree_lock);
                        lock_page(page);
-                       spin_lock_irq(&mapping->tree_lock);
+                       read_lock_irq(&mapping->tree_lock);
 
                        /* Has the page been truncated while we slept? */
                        if (page->mapping != mapping || page->index != offset) {
@@ -533,7 +562,7 @@ repeat:
                        }
                }
        }
-       spin_unlock_irq(&mapping->tree_lock);
+       read_unlock_irq(&mapping->tree_lock);
        return page;
 }
 
@@ -542,9 +571,9 @@ EXPORT_SYMBOL(find_lock_page);
 /**
  * find_or_create_page - locate or add a pagecache page
  *
- * @mapping - the page's address_space
- * @index - the page's index into the mapping
- * @gfp_mask - page allocation mode
+ * @mapping: the page's address_space
+ * @index: the page's index into the mapping
+ * @gfp_mask: page allocation mode
  *
  * Locates a page in the pagecache.  If the page is not present, a new page
  * is allocated using @gfp_mask and is added to the pagecache and to the VM's
@@ -607,12 +636,12 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
        unsigned int i;
        unsigned int ret;
 
-       spin_lock_irq(&mapping->tree_lock);
+       read_lock_irq(&mapping->tree_lock);
        ret = radix_tree_gang_lookup(&mapping->page_tree,
                                (void **)pages, start, nr_pages);
        for (i = 0; i < ret; i++)
                page_cache_get(pages[i]);
-       spin_unlock_irq(&mapping->tree_lock);
+       read_unlock_irq(&mapping->tree_lock);
        return ret;
 }
 
@@ -626,14 +655,14 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
        unsigned int i;
        unsigned int ret;
 
-       spin_lock_irq(&mapping->tree_lock);
+       read_lock_irq(&mapping->tree_lock);
        ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
                                (void **)pages, *index, nr_pages, tag);
        for (i = 0; i < ret; i++)
                page_cache_get(pages[i]);
        if (ret)
                *index = pages[ret - 1]->index + 1;
-       spin_unlock_irq(&mapping->tree_lock);
+       read_unlock_irq(&mapping->tree_lock);
        return ret;
 }
 
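Apart from the lock conversion, the pre-existing *index update is what makes find_get_pages_tag() convenient to drive in a loop: the next call resumes just past the last page returned. An illustrative (not from this patch) batch walk over dirty pages:

    /* illustrative caller: visit every PAGECACHE_TAG_DIRTY page in batches */
    struct page *pages[16];
    pgoff_t index = 0;
    unsigned int nr, i;

    while ((nr = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY,
                                    16, pages)) != 0) {
            for (i = 0; i < nr; i++) {
                    /* ... operate on pages[i] ... */
                    page_cache_release(pages[i]);   /* drop the reference taken by the lookup */
            }
    }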
@@ -650,7 +679,7 @@ struct page *
 grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
 {
        struct page *page = find_get_page(mapping, index);
-       int gfp_mask;
+       unsigned int gfp_mask;
 
        if (page) {
                if (!TestSetPageLocked(page))
@@ -691,7 +720,7 @@ void do_generic_mapping_read(struct address_space *mapping,
        unsigned long index;
        unsigned long end_index;
        unsigned long offset;
-       unsigned long req_size;
+       unsigned long last_index;
        unsigned long next_index;
        unsigned long prev_index;
        loff_t isize;
@@ -703,7 +732,7 @@ void do_generic_mapping_read(struct address_space *mapping,
        index = *ppos >> PAGE_CACHE_SHIFT;
        next_index = index;
        prev_index = ra.prev_page;
-       req_size = (desc->count + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+       last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
        offset = *ppos & ~PAGE_CACHE_MASK;
 
        isize = i_size_read(inode);
@@ -713,7 +742,7 @@ void do_generic_mapping_read(struct address_space *mapping,
        end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
        for (;;) {
                struct page *page;
-               unsigned long ret_size, nr, ret;
+               unsigned long nr, ret;
 
                /* nr is the maximum number of bytes to copy from this page */
                nr = PAGE_CACHE_SIZE;
@@ -728,12 +757,9 @@ void do_generic_mapping_read(struct address_space *mapping,
                nr = nr - offset;
 
                cond_resched();
-               if (index == next_index && req_size) {
-                       ret_size = page_cache_readahead(mapping, &ra,
-                                       filp, index, req_size);
-                       next_index += ret_size;
-                       req_size -= ret_size;
-               }
+               if (index == next_index)
+                       next_index = page_cache_readahead(mapping, &ra, filp,
+                                       index, last_index - index);
 
 find_page:
                page = find_get_page(mapping, index);
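Readahead plumbing changes shape here: instead of passing a remaining req_size and accumulating the return value, the caller computes last_index once from *ppos and desc->count, and page_cache_readahead() now returns the index at which it next wants to be consulted. A worked example with illustrative numbers:

    /*
     * Illustrative numbers only: 4 KiB pages, a 12288-byte read starting
     * at file offset 0 (*ppos == 0, desc->count == 12288):
     *
     *   index      = 0 >> PAGE_CACHE_SHIFT                        == 0
     *   last_index = (0 + 12288 + 4096 - 1) >> PAGE_CACHE_SHIFT   == 3
     *
     * so the first pass asks readahead to cover last_index - index == 3
     * pages, and the value it returns becomes next_index, the point at
     * which the loop will call it again.
     */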
@@ -978,7 +1004,7 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
                if (pos < size) {
                        retval = generic_file_direct_IO(READ, iocb,
                                                iov, pos, nr_segs);
-                       if (retval >= 0 && !is_sync_kiocb(iocb))
+                       if (retval > 0 && !is_sync_kiocb(iocb))
                                retval = -EIOCBQUEUED;
                        if (retval > 0)
                                *ppos = pos + retval;
@@ -1154,7 +1180,8 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
  * it in the page cache, and handles the special cases reasonably without
  * having a lot of duplicated code.
  */
-struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int *type)
+struct page *filemap_nopage(struct vm_area_struct *area,
+                               unsigned long address, int *type)
 {
        int error;
        struct file *file = area->vm_file;
@@ -1162,11 +1189,10 @@ struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address
        struct file_ra_state *ra = &file->f_ra;
        struct inode *inode = mapping->host;
        struct page *page;
-       unsigned long size, pgoff, endoff;
+       unsigned long size, pgoff;
        int did_readaround = 0, majmin = VM_FAULT_MINOR;
 
-       pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
-       endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
+       pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
 
 retry_all:
        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -1177,13 +1203,6 @@ retry_all:
        if (VM_RandomReadHint(area))
                goto no_cached_page;
 
-       /*
-        * The "size" of the file, as far as mmap is concerned, isn't bigger
-        * than the mapping
-        */
-       if (size > endoff)
-               size = endoff;
-
        /*
         * The readahead code wants to be told about each and every page
         * so it can build and shrink its windows appropriately
@@ -1373,8 +1392,13 @@ retry_find:
         * Ok, found a page in the page cache, now we need to check
         * that it's up-to-date.
         */
-       if (!PageUptodate(page))
+       if (!PageUptodate(page)) {
+               if (nonblock) {
+                       page_cache_release(page);
+                       return NULL;
+               }
                goto page_not_uptodate;
+       }
 
 success:
        /*
@@ -1938,12 +1962,13 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                buf = iov->iov_base + written;
        else {
                filemap_set_next_iovec(&cur_iov, &iov_base, written);
-               buf = iov->iov_base + iov_base;
+               buf = cur_iov->iov_base + iov_base;
        }
 
        do {
                unsigned long index;
                unsigned long offset;
+               unsigned long maxlen;
                size_t copied;
 
                offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
@@ -1958,7 +1983,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                 * same page as we're writing to, without it being marked
                 * up-to-date.
                 */
-               fault_in_pages_readable(buf, bytes);
+               maxlen = cur_iov->iov_len - iov_base;
+               if (maxlen > bytes)
+                       maxlen = bytes;
+               fault_in_pages_readable(buf, maxlen);
 
                page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
                if (!page) {
@@ -1996,9 +2024,13 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                                count -= status;
                                pos += status;
                                buf += status;
-                               if (unlikely(nr_segs > 1))
+                               if (unlikely(nr_segs > 1)) {
                                        filemap_set_next_iovec(&cur_iov,
                                                        &iov_base, status);
+                                       buf = cur_iov->iov_base + iov_base;
+                               } else {
+                                       iov_base += status;
+                               }
                        }
                }
                if (unlikely(copied != bytes))
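The three generic_file_buffered_write() hunks above all deal with multi-segment iovecs: the source pointer must be derived from the segment cur_iov currently points at rather than from the first iovec, the prefault must not run past the end of that segment, and after each copy cur_iov/iov_base and buf are re-synchronised. A worked example with made-up sizes:

    /*
     * Illustrative only: two iovec segments of 3 and 5 bytes. Once 3 bytes
     * have been copied, filemap_set_next_iovec() advances cur_iov to the
     * second segment with iov_base == 0, so:
     *   - the next source pointer is cur_iov->iov_base + iov_base, i.e. the
     *     start of segment two, not an offset into segment one;
     *   - fault_in_pages_readable() is limited to
     *     cur_iov->iov_len - iov_base == 5 bytes, even when 'bytes' asks
     *     for more, so the prefault never strays beyond the segment.
     */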
@@ -2269,21 +2301,28 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        ssize_t retval;
+       size_t write_len = 0;
 
        /*
         * If it's a write, unmap all mmappings of the file up-front.  This
         * will cause any pte dirty bits to be propagated into the pageframes
         * for the subsequent filemap_write_and_wait().
         */
-       if (rw == WRITE && mapping_mapped(mapping))
-               unmap_mapping_range(mapping, 0, -1, 0);
+       if (rw == WRITE) {
+               write_len = iov_length(iov, nr_segs);
+               if (mapping_mapped(mapping))
+                       unmap_mapping_range(mapping, offset, write_len, 0);
+       }
 
        retval = filemap_write_and_wait(mapping);
        if (retval == 0) {
                retval = mapping->a_ops->direct_IO(rw, iocb, iov,
                                                offset, nr_segs);
                if (rw == WRITE && mapping->nrpages) {
-                       int err = invalidate_inode_pages2(mapping);
+                       pgoff_t end = (offset + write_len - 1)
+                                               >> PAGE_CACHE_SHIFT;
+                       int err = invalidate_inode_pages2_range(mapping,
+                                       offset >> PAGE_CACHE_SHIFT, end);
                        if (err)
                                retval = err;
                }
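For a direct write, the patch now unmaps and invalidates only the byte range actually written, rather than every page of the mapping. The page-index arithmetic, with illustrative numbers:

    /*
     * Illustrative only: 4 KiB pages, a direct write of 10000 bytes at
     * file offset 5000:
     *
     *   start = 5000 >> PAGE_CACHE_SHIFT                  == 1
     *   end   = (5000 + 10000 - 1) >> PAGE_CACHE_SHIFT    == 3
     *
     * so invalidate_inode_pages2_range() only drops page indexes 1..3;
     * cached pages outside the written range are left alone.
     */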