#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/aio.h>
+#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/cpuset.h>
+#include "filemap.h"
+#include "internal.h"
+
/*
- * This is needed for the following functions:
- * - try_to_release_page
- * - block_invalidatepage
- * - generic_osync_inode
- *
* FIXME: remove all knowledge of the buffer layer from the core VM
*/
#include <linux/buffer_head.h> /* for generic_osync_inode */
#include <asm/uaccess.h>
#include <asm/mman.h>
+static ssize_t
+generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
+ loff_t offset, unsigned long nr_segs);
+
/*
* Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 */

/*
* Lock ordering:
*
- * ->i_shared_sem (vmtruncate)
+ * ->i_mmap_lock (vmtruncate)
* ->private_lock (__free_pte->__set_page_dirty_buffers)
- * ->swap_list_lock
- * ->swap_device_lock (exclusive_swap_page, others)
- * ->mapping->tree_lock
+ * ->swap_lock (exclusive_swap_page, others)
+ * ->mapping->tree_lock
*
- * ->i_sem
- * ->i_shared_sem (truncate->unmap_mapping_range)
+ * ->i_mutex
+ * ->i_mmap_lock (truncate->unmap_mapping_range)
*
* ->mmap_sem
- * ->i_shared_sem (various places)
+ * ->i_mmap_lock
+ * ->page_table_lock or pte_lock (various, mainly in memory.c)
+ * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
*
* ->mmap_sem
* ->lock_page (access_process_vm)
*
* ->mmap_sem
- * ->i_sem (msync)
+ * ->i_mutex (msync)
*
- * ->i_sem
+ * ->i_mutex
* ->i_alloc_sem (various)
*
* ->inode_lock
* ->sb_lock (fs/fs-writeback.c)
* ->mapping->tree_lock (__sync_single_inode)
*
- * ->page_table_lock
- * ->swap_device_lock (try_to_unmap_one)
+ * ->i_mmap_lock
+ * ->anon_vma.lock (vma_adjust)
+ *
+ * ->anon_vma.lock
+ * ->page_table_lock or pte_lock (anon_vma_prepare and various)
+ *
+ * ->page_table_lock or pte_lock
+ * ->swap_lock (try_to_unmap_one)
* ->private_lock (try_to_unmap_one)
* ->tree_lock (try_to_unmap_one)
* ->zone.lru_lock (follow_page->mark_page_accessed)
+ * ->zone.lru_lock (check_pte_range->isolate_lru_page)
+ * ->private_lock (page_remove_rmap->set_page_dirty)
+ * ->tree_lock (page_remove_rmap->set_page_dirty)
+ * ->inode_lock (page_remove_rmap->set_page_dirty)
+ * ->inode_lock (zap_pte_range->set_page_dirty)
+ * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
*
* ->task->proc_lock
* ->dcache_lock (proc_pid_lookup)
{
struct address_space *mapping = page->mapping;
- if (unlikely(!PageLocked(page)))
- PAGE_BUG(page);
+ BUG_ON(!PageLocked(page));
- spin_lock_irq(&mapping->tree_lock);
+ write_lock_irq(&mapping->tree_lock);
__remove_from_page_cache(page);
- spin_unlock_irq(&mapping->tree_lock);
+ write_unlock_irq(&mapping->tree_lock);
}
-static inline int sync_page(struct page *page)
+static int sync_page(void *word)
{
struct address_space *mapping;
+ struct page *page;
+
+ page = container_of((unsigned long *)word, struct page, flags);
+ /*
+ * page_mapping() is being called without PG_locked held.
+ * Some knowledge of the state and use of the page is used to
+ * reduce the requirements down to a memory barrier.
+ * The danger here is of a stale page_mapping() return value
+ * indicating a struct address_space different from the one it's
+ * associated with when it is associated with one.
+ * After smp_mb(), it's either the correct page_mapping() for
+ * the page, or an old page_mapping() and the page's own
+ * page_mapping() has gone NULL.
+ * The ->sync_page() address_space operation must tolerate
+ * page_mapping() going NULL. By an amazing coincidence,
+ * this comes about because none of the users of the page
+ * in the ->sync_page() methods make essential use of the
+ * page_mapping(), merely passing the page down to the backing
+ * device's unplug functions when it's non-NULL, which in turn
+ * ignore it for all cases but swap, where only page_private(page) is
+ * of interest. When page_mapping() does go NULL, the entire
+ * call stack gracefully ignores the page and returns.
+ * -- wli
+ */
smp_mb();
mapping = page_mapping(page);
- if (mapping) {
- if (mapping->a_ops && mapping->a_ops->sync_page)
- return mapping->a_ops->sync_page(page);
- } else if (PageSwapCache(page)) {
- swap_unplug_io_fn(page);
- }
+ if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
+ mapping->a_ops->sync_page(page);
+ io_schedule();
return 0;
}
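
/*
 * Illustrative sketch (not part of this change): sync_page() is installed as
 * the action callback of the wait-bit primitives used further down, which
 * pass it the address of page->flags as the wait "word", e.g.:
 *
 *	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 *	__wait_on_bit(page_waitqueue(page), &wait, sync_page,
 *			TASK_UNINTERRUPTIBLE);
 *
 * which is why sync_page() recovers the struct page with container_of()
 * above.
 */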
/**
- * filemap_fdatawrite - start writeback against all of a mapping's dirty pages
- * @mapping: address space structure to write
+ * filemap_fdatawrite_range - start writeback against all of a mapping's
+ * dirty pages that lie within the byte offsets <start, end>
+ * @mapping: address space structure to write
+ * @start: offset in bytes where the range starts
+ * @end: offset in bytes where the range ends (inclusive)
+ * @sync_mode: enable synchronous operation
*
* If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
 * opposed to a regular memory cleansing writeback.  The difference between
* these two operations is that if a dirty page/buffer is encountered, it must
* be waited upon, and not just skipped over.
*/
-static int __filemap_fdatawrite(struct address_space *mapping, int sync_mode)
+int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+ loff_t end, int sync_mode)
{
int ret;
struct writeback_control wbc = {
.sync_mode = sync_mode,
.nr_to_write = mapping->nrpages * 2,
+ .start = start,
+ .end = end,
};
- if (mapping->backing_dev_info->memory_backed)
+ if (!mapping_cap_writeback_dirty(mapping))
return 0;
ret = do_writepages(mapping, &wbc);
return ret;
}
+static inline int __filemap_fdatawrite(struct address_space *mapping,
+ int sync_mode)
+{
+ return __filemap_fdatawrite_range(mapping, 0, 0, sync_mode);
+}
+
int filemap_fdatawrite(struct address_space *mapping)
{
return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite);
+static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+ loff_t end)
+{
+ return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
+}
+
/*
* This is a mostly non-blocking flush. Not suitable for data-integrity
 * purposes - I/O may not be started against all dirty pages.
 */

/*
 * Wait for writeback to complete against pages indexed by start->end
* inclusive
*/
-static int wait_on_page_writeback_range(struct address_space *mapping,
+int wait_on_page_writeback_range(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
struct pagevec pvec;
pagevec_init(&pvec, 0);
index = start;
- while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ while ((index <= end) &&
+ (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
PAGECACHE_TAG_WRITEBACK,
- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
unsigned i;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
+ /* until radix tree lookup accepts end_index */
+ if (page->index > end)
+ continue;
+
wait_on_page_writeback(page);
if (PageError(page))
ret = -EIO;
return ret;
}
+/*
+ * Write and wait upon all the pages in the passed range. This is a "data
+ * integrity" operation. It waits upon in-flight writeout before starting and
+ * waiting upon new writeout. If there was an IO error, return it.
+ *
+ * We need to re-take i_mutex during the generic_osync_inode list walk because
+ * it is otherwise livelockable.
+ */
+int sync_page_range(struct inode *inode, struct address_space *mapping,
+ loff_t pos, loff_t count)
+{
+ pgoff_t start = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
+ int ret;
+
+ if (!mapping_cap_writeback_dirty(mapping) || !count)
+ return 0;
+ ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
+ if (ret == 0) {
+ mutex_lock(&inode->i_mutex);
+ ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
+ mutex_unlock(&inode->i_mutex);
+ }
+ if (ret == 0)
+ ret = wait_on_page_writeback_range(mapping, start, end);
+ return ret;
+}
+EXPORT_SYMBOL(sync_page_range);
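
/*
 * Illustrative sketch (not part of this change): an O_SYNC write path is
 * expected to call sync_page_range() on the byte range it just wrote, as
 * generic_file_aio_write() does further down:
 *
 *	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 *		ssize_t err = sync_page_range(inode, mapping, pos, ret);
 *		if (err < 0)
 *			ret = err;
 *	}
 *
 * where pos is the file position the write started at and ret the number of
 * bytes written.
 */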
+
+/*
+ * Note: Holding i_mutex across sync_page_range_nolock is not a good idea
+ * as it forces O_SYNC writers to different parts of the same file
+ * to be serialised right until io completion.
+ */
+int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
+ loff_t pos, loff_t count)
+{
+ pgoff_t start = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
+ int ret;
+
+ if (!mapping_cap_writeback_dirty(mapping) || !count)
+ return 0;
+ ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
+ if (ret == 0)
+ ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
+ if (ret == 0)
+ ret = wait_on_page_writeback_range(mapping, start, end);
+ return ret;
+}
+EXPORT_SYMBOL(sync_page_range_nolock);
+
/**
* filemap_fdatawait - walk the list of under-writeback pages of the given
* address space and wait for all of them.
*/
int filemap_fdatawait(struct address_space *mapping)
{
- return wait_on_page_writeback_range(mapping, 0, -1);
-}
+ loff_t i_size = i_size_read(mapping->host);
+
+ if (i_size == 0)
+ return 0;
+ return wait_on_page_writeback_range(mapping, 0,
+ (i_size - 1) >> PAGE_CACHE_SHIFT);
+}
EXPORT_SYMBOL(filemap_fdatawait);
int filemap_write_and_wait(struct address_space *mapping)
{
- int retval = 0;
+ int err = 0;
if (mapping->nrpages) {
- retval = filemap_fdatawrite(mapping);
- if (retval == 0)
- retval = filemap_fdatawait(mapping);
+ err = filemap_fdatawrite(mapping);
+ /*
+ * Even if the above returned an error, the pages may have been
+ * written out partially (e.g. -ENOSPC), so we wait for the
+ * writeback anyway.  But -EIO is a special case: it may indicate
+ * that something much worse (e.g. a bug) has happened, so we
+ * avoid waiting in that case.
+ */
+ if (err != -EIO) {
+ int err2 = filemap_fdatawait(mapping);
+ if (!err)
+ err = err2;
+ }
}
- return retval;
+ return err;
+}
+EXPORT_SYMBOL(filemap_write_and_wait);
+
+/*
+ * Write out and wait upon file offsets lstart->lend, inclusive.
+ *
+ * Note that `lend' is inclusive (describes the last byte to be written) so
+ * that this function can be used to write to the very end-of-file (lend = -1).
+ */
+int filemap_write_and_wait_range(struct address_space *mapping,
+ loff_t lstart, loff_t lend)
+{
+ int err = 0;
+
+ if (mapping->nrpages) {
+ err = __filemap_fdatawrite_range(mapping, lstart, lend,
+ WB_SYNC_ALL);
+ /* See comment of filemap_write_and_wait() */
+ if (err != -EIO) {
+ int err2 = wait_on_page_writeback_range(mapping,
+ lstart >> PAGE_CACHE_SHIFT,
+ lend >> PAGE_CACHE_SHIFT);
+ if (!err)
+ err = err2;
+ }
+ }
+ return err;
}
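
/*
 * Illustrative sketch (not part of this change): because `lend' names the
 * last byte to be written (inclusive), a whole-file flush and a flush of a
 * single page at byte offset pos would look like:
 *
 *	err = filemap_write_and_wait_range(mapping, 0, -1);
 *	err = filemap_write_and_wait_range(mapping, pos,
 *					pos + PAGE_CACHE_SIZE - 1);
 */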
/*
* This function does not add the page to the LRU. The caller must do that.
*/
int add_to_page_cache(struct page *page, struct address_space *mapping,
- pgoff_t offset, int gfp_mask)
+ pgoff_t offset, gfp_t gfp_mask)
{
int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error == 0) {
- page_cache_get(page);
- spin_lock_irq(&mapping->tree_lock);
+ write_lock_irq(&mapping->tree_lock);
error = radix_tree_insert(&mapping->page_tree, offset, page);
if (!error) {
+ page_cache_get(page);
SetPageLocked(page);
page->mapping = mapping;
page->index = offset;
mapping->nrpages++;
pagecache_acct(1);
- } else {
- page_cache_release(page);
}
- spin_unlock_irq(&mapping->tree_lock);
+ write_unlock_irq(&mapping->tree_lock);
radix_tree_preload_end();
}
return error;
EXPORT_SYMBOL(add_to_page_cache);
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
- pgoff_t offset, int gfp_mask)
+ pgoff_t offset, gfp_t gfp_mask)
{
int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
	if (ret == 0)
		lru_cache_add(page);
	return ret;
}
+#ifdef CONFIG_NUMA
+struct page *page_cache_alloc(struct address_space *x)
+{
+ if (cpuset_do_page_mem_spread()) {
+ int n = cpuset_mem_spread_node();
+ return alloc_pages_node(n, mapping_gfp_mask(x), 0);
+ }
+ return alloc_pages(mapping_gfp_mask(x), 0);
+}
+EXPORT_SYMBOL(page_cache_alloc);
+
+struct page *page_cache_alloc_cold(struct address_space *x)
+{
+ if (cpuset_do_page_mem_spread()) {
+ int n = cpuset_mem_spread_node();
+ return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0);
+ }
+ return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
+}
+EXPORT_SYMBOL(page_cache_alloc_cold);
+#endif
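
/*
 * Illustrative note (not part of this change): with cpuset memory spreading
 * enabled, the allocations above rotate over the nodes of the task's cpuset
 * via cpuset_mem_spread_node(); otherwise they fall back to the normal
 * alloc_pages() policy with the mapping's gfp mask.  Callers are unchanged,
 * e.g. page_cache_read() below still does:
 *
 *	page = page_cache_alloc_cold(mapping);
 *	if (!page)
 *		return -ENOMEM;
 */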
+
/*
* In order to wait for pages to become available there must be
* waitqueues associated with pages. By using a hash table of
return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
}
+static inline void wake_up_page(struct page *page, int bit)
+{
+ __wake_up_bit(page_waitqueue(page), &page->flags, bit);
+}
+
void fastcall wait_on_page_bit(struct page *page, int bit_nr)
{
- wait_queue_head_t *waitqueue = page_waitqueue(page);
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
- do {
- prepare_to_wait(waitqueue, &wait, TASK_UNINTERRUPTIBLE);
- if (test_bit(bit_nr, &page->flags)) {
- sync_page(page);
- io_schedule();
- }
- } while (test_bit(bit_nr, &page->flags));
- finish_wait(waitqueue, &wait);
+ if (test_bit(bit_nr, &page->flags))
+ __wait_on_bit(page_waitqueue(page), &wait, sync_page,
+ TASK_UNINTERRUPTIBLE);
}
-
EXPORT_SYMBOL(wait_on_page_bit);
/**
*/
void fastcall unlock_page(struct page *page)
{
- wait_queue_head_t *waitqueue = page_waitqueue(page);
smp_mb__before_clear_bit();
if (!TestClearPageLocked(page))
BUG();
smp_mb__after_clear_bit();
- if (waitqueue_active(waitqueue))
- wake_up_all(waitqueue);
+ wake_up_page(page, PG_locked);
}
-
EXPORT_SYMBOL(unlock_page);
-EXPORT_SYMBOL(lock_page);
/*
* End writeback against a page.
*/
void end_page_writeback(struct page *page)
{
- wait_queue_head_t *waitqueue = page_waitqueue(page);
-
+ struct zone *zone = page_zone(page);
if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
if (!test_clear_page_writeback(page))
BUG();
- smp_mb__after_clear_bit();
}
- if (waitqueue_active(waitqueue))
- wake_up_all(waitqueue);
+ smp_mb__after_clear_bit();
+ if (zone->all_unreclaimable) {
+ spin_lock(&zone->lock);
+ zone->all_unreclaimable = 0;
+ zone->pages_scanned = 0;
+ spin_unlock(&zone->lock);
+ }
+ wake_up_page(page, PG_writeback);
}
-
EXPORT_SYMBOL(end_page_writeback);
/*
*/
void fastcall __lock_page(struct page *page)
{
- wait_queue_head_t *wqh = page_waitqueue(page);
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
- while (TestSetPageLocked(page)) {
- prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
- if (PageLocked(page)) {
- sync_page(page);
- io_schedule();
- }
- }
- finish_wait(wqh, &wait);
+ __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
+ TASK_UNINTERRUPTIBLE);
}
-
EXPORT_SYMBOL(__lock_page);
/*
{
struct page *page;
- /*
- * We scan the hash list read-only. Addition to and removal from
- * the hash-list needs a held write-lock.
- */
- spin_lock_irq(&mapping->tree_lock);
+ read_lock_irq(&mapping->tree_lock);
page = radix_tree_lookup(&mapping->page_tree, offset);
if (page)
page_cache_get(page);
- spin_unlock_irq(&mapping->tree_lock);
+ read_unlock_irq(&mapping->tree_lock);
return page;
}
{
struct page *page;
- spin_lock_irq(&mapping->tree_lock);
+ read_lock_irq(&mapping->tree_lock);
page = radix_tree_lookup(&mapping->page_tree, offset);
if (page && TestSetPageLocked(page))
page = NULL;
- spin_unlock_irq(&mapping->tree_lock);
+ read_unlock_irq(&mapping->tree_lock);
return page;
}
/**
* find_lock_page - locate, pin and lock a pagecache page
*
- * @mapping - the address_space to search
- * @offset - the page index
+ * @mapping: the address_space to search
+ * @offset: the page index
*
* Locates the desired pagecache page, locks it, increments its reference
* count and returns its address.
{
struct page *page;
- spin_lock_irq(&mapping->tree_lock);
+ read_lock_irq(&mapping->tree_lock);
repeat:
page = radix_tree_lookup(&mapping->page_tree, offset);
if (page) {
page_cache_get(page);
if (TestSetPageLocked(page)) {
- spin_unlock_irq(&mapping->tree_lock);
- lock_page(page);
- spin_lock_irq(&mapping->tree_lock);
+ read_unlock_irq(&mapping->tree_lock);
+ __lock_page(page);
+ read_lock_irq(&mapping->tree_lock);
/* Has the page been truncated while we slept? */
- if (page->mapping != mapping || page->index != offset) {
+ if (unlikely(page->mapping != mapping ||
+ page->index != offset)) {
unlock_page(page);
page_cache_release(page);
goto repeat;
}
}
}
- spin_unlock_irq(&mapping->tree_lock);
+ read_unlock_irq(&mapping->tree_lock);
return page;
}
/**
* find_or_create_page - locate or add a pagecache page
*
- * @mapping - the page's address_space
- * @index - the page's index into the mapping
- * @gfp_mask - page allocation mode
+ * @mapping: the page's address_space
+ * @index: the page's index into the mapping
+ * @gfp_mask: page allocation mode
*
* Locates a page in the pagecache. If the page is not present, a new page
* is allocated using @gfp_mask and is added to the pagecache and to the VM's
* memory exhaustion.
*/
struct page *find_or_create_page(struct address_space *mapping,
- unsigned long index, unsigned int gfp_mask)
+ unsigned long index, gfp_t gfp_mask)
{
struct page *page, *cached_page = NULL;
int err;
unsigned int i;
unsigned int ret;
- spin_lock_irq(&mapping->tree_lock);
+ read_lock_irq(&mapping->tree_lock);
ret = radix_tree_gang_lookup(&mapping->page_tree,
(void **)pages, start, nr_pages);
for (i = 0; i < ret; i++)
page_cache_get(pages[i]);
- spin_unlock_irq(&mapping->tree_lock);
+ read_unlock_irq(&mapping->tree_lock);
return ret;
}
+/**
+ * find_get_pages_contig - gang contiguous pagecache lookup
+ * @mapping: The address_space to search
+ * @index: The starting page index
+ * @nr_pages: The maximum number of pages
+ * @pages: Where the resulting pages are placed
+ *
+ * find_get_pages_contig() works exactly like find_get_pages(), except
+ * that the returned pages are guaranteed to be contiguous.
+ *
+ * find_get_pages_contig() returns the number of pages which were found.
+ */
+unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
+ unsigned int nr_pages, struct page **pages)
+{
+ unsigned int i;
+ unsigned int ret;
+
+ read_lock_irq(&mapping->tree_lock);
+ ret = radix_tree_gang_lookup(&mapping->page_tree,
+ (void **)pages, index, nr_pages);
+ for (i = 0; i < ret; i++) {
+ if (pages[i]->mapping == NULL || pages[i]->index != index)
+ break;
+
+ page_cache_get(pages[i]);
+ index++;
+ }
+ read_unlock_irq(&mapping->tree_lock);
+ return i;
+}
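
/*
 * Illustrative sketch (not part of this change): a caller wanting up to
 * PAGEVEC_SIZE pages that are contiguous from `index' onwards might do:
 *
 *	struct page *pages[PAGEVEC_SIZE];
 *	unsigned int i, nr;
 *
 *	nr = find_get_pages_contig(mapping, index, PAGEVEC_SIZE, pages);
 *	for (i = 0; i < nr; i++) {
 *		... pages[i] has index `index + i' ...
 *		page_cache_release(pages[i]);
 *	}
 */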
+
/*
* Like find_get_pages, except we only return pages which are tagged with
* `tag'. We update *index to index the next page for the traversal.
unsigned int i;
unsigned int ret;
- spin_lock_irq(&mapping->tree_lock);
+ read_lock_irq(&mapping->tree_lock);
ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
(void **)pages, *index, nr_pages, tag);
for (i = 0; i < ret; i++)
page_cache_get(pages[i]);
if (ret)
*index = pages[ret - 1]->index + 1;
- spin_unlock_irq(&mapping->tree_lock);
+ read_unlock_irq(&mapping->tree_lock);
return ret;
}
grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
{
struct page *page = find_get_page(mapping, index);
- int gfp_mask;
+ gfp_t gfp_mask;
if (page) {
if (!TestSetPageLocked(page))
*
* This is really ugly. But the goto's actually try to clarify some
* of the logic when it comes to error handling etc.
- * - note the struct file * is only passed for the use of readpage
+ *
+ * Note the struct file* is only passed for the use of readpage. It may be
+ * NULL.
*/
void do_generic_mapping_read(struct address_space *mapping,
- struct file_ra_state *ra,
- struct file * filp,
+ struct file_ra_state *_ra,
+ struct file *filp,
loff_t *ppos,
- read_descriptor_t * desc,
- read_actor_t actor)
+ read_descriptor_t *desc,
+ read_actor_t actor,
+ int nonblock)
{
struct inode *inode = mapping->host;
- unsigned long index, offset;
+ unsigned long index;
+ unsigned long end_index;
+ unsigned long offset;
+ unsigned long last_index;
+ unsigned long next_index;
+ unsigned long prev_index;
+ loff_t isize;
struct page *cached_page;
int error;
+ struct file_ra_state ra = *_ra;
cached_page = NULL;
index = *ppos >> PAGE_CACHE_SHIFT;
+ next_index = index;
+ prev_index = ra.prev_page;
+ last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;
+ isize = i_size_read(inode);
+ if (!isize)
+ goto out;
+
+ end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
for (;;) {
struct page *page;
- unsigned long end_index, nr, ret;
- loff_t isize = i_size_read(inode);
+ unsigned long nr, ret;
- end_index = isize >> PAGE_CACHE_SHIFT;
-
- if (index > end_index)
- break;
+ /* nr is the maximum number of bytes to copy from this page */
nr = PAGE_CACHE_SIZE;
- if (index == end_index) {
- nr = isize & ~PAGE_CACHE_MASK;
- if (nr <= offset)
- break;
+ if (index >= end_index) {
+ if (index > end_index)
+ goto out;
+ nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+ if (nr <= offset) {
+ goto out;
+ }
}
+ nr = nr - offset;
cond_resched();
- page_cache_readahead(mapping, ra, filp, index);
+ if (index == next_index)
+ next_index = page_cache_readahead(mapping, &ra, filp,
+ index, last_index - index);
- nr = nr - offset;
find_page:
page = find_get_page(mapping, index);
if (unlikely(page == NULL)) {
- handle_ra_miss(mapping, ra, index);
+ if (nonblock) {
+ desc->error = -EWOULDBLOCKIO;
+ break;
+ }
+ handle_ra_miss(mapping, &ra, index);
goto no_cached_page;
}
- if (!PageUptodate(page))
+ if (!PageUptodate(page)) {
+ if (nonblock) {
+ page_cache_release(page);
+ desc->error = -EWOULDBLOCKIO;
+ break;
+ }
goto page_not_up_to_date;
+ }
page_ok:
+
/* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
flush_dcache_page(page);
/*
- * Mark the page accessed if we read the beginning.
+ * When (part of) the same page is read multiple times
+ * in succession, only mark it as accessed the first time.
*/
- if (!offset)
+ if (prev_index != index)
mark_page_accessed(page);
+ prev_index = index;
/*
* Ok, we have the page, and it's up-to-date, so
page_cache_release(page);
if (ret == nr && desc->count)
continue;
- break;
+ goto out;
page_not_up_to_date:
- if (PageUptodate(page))
- goto page_ok;
-
/* Get exclusive access to the page ... */
lock_page(page);
}
readpage:
- /* ... and start the actual read. The read will unlock the page. */
+ /* Start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(filp, page);
- if (!error) {
- if (PageUptodate(page))
- goto page_ok;
- wait_on_page_locked(page);
- if (PageUptodate(page))
- goto page_ok;
- error = -EIO;
+ if (unlikely(error)) {
+ if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto find_page;
+ }
+ goto readpage_error;
+ }
+
+ if (!PageUptodate(page)) {
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ if (page->mapping == NULL) {
+ /*
+ * invalidate_inode_pages got it
+ */
+ unlock_page(page);
+ page_cache_release(page);
+ goto find_page;
+ }
+ unlock_page(page);
+ error = -EIO;
+ goto readpage_error;
+ }
+ unlock_page(page);
+ }
+
+ /*
+ * i_size must be checked after we have done ->readpage.
+ *
+ * Checking i_size after the readpage allows us to calculate
+ * the correct value for "nr", which means the zero-filled
+ * part of the page is not copied back to userspace (unless
+ * another truncate extends the file - this is desired though).
+ */
+ isize = i_size_read(inode);
+ end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ if (unlikely(!isize || index > end_index)) {
+ page_cache_release(page);
+ goto out;
+ }
+
+ /* nr is the maximum number of bytes to copy from this page */
+ nr = PAGE_CACHE_SIZE;
+ if (index == end_index) {
+ nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+ if (nr <= offset) {
+ page_cache_release(page);
+ goto out;
+ }
}
+ nr = nr - offset;
+ goto page_ok;
+readpage_error:
/* UHHUH! A synchronous read error occurred. Report it */
desc->error = error;
page_cache_release(page);
- break;
+ goto out;
no_cached_page:
/*
cached_page = page_cache_alloc_cold(mapping);
if (!cached_page) {
desc->error = -ENOMEM;
- break;
+ goto out;
}
}
error = add_to_page_cache_lru(cached_page, mapping,
if (error == -EEXIST)
goto find_page;
desc->error = error;
- break;
+ goto out;
}
page = cached_page;
cached_page = NULL;
goto readpage;
}
+out:
+ *_ra = ra;
+
*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
if (cached_page)
page_cache_release(cached_page);
- file_accessed(filp);
+ if (filp)
+ file_accessed(filp);
}
EXPORT_SYMBOL(do_generic_mapping_read);
* Faults on the destination of a read are common, so do it before
* taking the kmap.
*/
- if (!fault_in_pages_writeable(desc->buf, size)) {
+ if (!fault_in_pages_writeable(desc->arg.buf, size)) {
kaddr = kmap_atomic(page, KM_USER0);
- left = __copy_to_user(desc->buf, kaddr + offset, size);
+ left = __copy_to_user_inatomic(desc->arg.buf,
+ kaddr + offset, size);
kunmap_atomic(kaddr, KM_USER0);
if (left == 0)
goto success;
/* Do it the slow way */
kaddr = kmap(page);
- left = __copy_to_user(desc->buf, kaddr + offset, size);
+ left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
kunmap(page);
if (left) {
success:
desc->count = count - size;
desc->written += size;
- desc->buf += size;
+ desc->arg.buf += size;
return size;
}
if (pos < size) {
retval = generic_file_direct_IO(READ, iocb,
iov, pos, nr_segs);
- if (retval >= 0 && !is_sync_kiocb(iocb))
+ if (retval > 0 && !is_sync_kiocb(iocb))
retval = -EIOCBQUEUED;
if (retval > 0)
*ppos = pos + retval;
read_descriptor_t desc;
desc.written = 0;
- desc.buf = iov[seg].iov_base;
+ desc.arg.buf = iov[seg].iov_base;
desc.count = iov[seg].iov_len;
if (desc.count == 0)
continue;
desc.error = 0;
- do_generic_file_read(filp,ppos,&desc,file_read_actor);
+ do_generic_file_read(filp,ppos,&desc,file_read_actor,0);
retval += desc.written;
- if (!retval) {
- retval = desc.error;
+ if (desc.error) {
+ retval = retval ?: desc.error;
break;
}
}
{
ssize_t written;
unsigned long count = desc->count;
- struct file *file = (struct file *) desc->buf;
+ struct file *file = desc->arg.data;
if (size > count)
size = count;
}
ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
- size_t count, read_actor_t actor, void __user *target)
+ size_t count, read_actor_t actor, void *target)
{
read_descriptor_t desc;
desc.written = 0;
desc.count = count;
- desc.buf = target;
+ desc.arg.data = target;
desc.error = 0;
- do_generic_file_read(in_file, ppos, &desc, actor);
+ do_generic_file_read(in_file, ppos, &desc, actor, 0);
if (desc.written)
return desc.written;
return desc.error;
{
struct address_space *mapping = file->f_mapping;
struct page *page;
- int error;
+ int ret;
- page = page_cache_alloc_cold(mapping);
- if (!page)
- return -ENOMEM;
+ do {
+ page = page_cache_alloc_cold(mapping);
+ if (!page)
+ return -ENOMEM;
+
+ ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
+ if (ret == 0)
+ ret = mapping->a_ops->readpage(file, page);
+ else if (ret == -EEXIST)
+ ret = 0; /* losing race to add is OK */
- error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
- if (!error) {
- error = mapping->a_ops->readpage(file, page);
page_cache_release(page);
- return error;
- }
- /*
- * We arrive here in the unlikely event that someone
- * raced with us and added our page to the cache first
- * or we are out of memory for radix-tree nodes.
- */
- page_cache_release(page);
- return error == -EEXIST ? 0 : error;
+ } while (ret == AOP_TRUNCATED_PAGE);
+
+ return ret;
}
#define MMAP_LOTSAMISS (100)
* it in the page cache, and handles the special cases reasonably without
* having a lot of duplicated code.
*/
-struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int *type)
+struct page *filemap_nopage(struct vm_area_struct *area,
+ unsigned long address, int *type)
{
int error;
struct file *file = area->vm_file;
struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
struct page *page;
- unsigned long size, pgoff, endoff;
+ unsigned long size, pgoff;
int did_readaround = 0, majmin = VM_FAULT_MINOR;
- pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
- endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
+ pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
retry_all:
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (VM_RandomReadHint(area))
goto no_cached_page;
- /*
- * The "size" of the file, as far as mmap is concerned, isn't bigger
- * than the mapping
- */
- if (size > endoff)
- size = endoff;
-
/*
* The readahead code wants to be told about each and every page
* so it can build and shrink its windows appropriately
* For sequential accesses, we use the generic readahead logic.
*/
if (VM_SequentialReadHint(area))
- page_cache_readahead(mapping, ra, file, pgoff);
+ page_cache_readahead(mapping, ra, file, pgoff, 1);
/*
* Do we have something in the page cache already?
did_readaround = 1;
ra_pages = max_sane_readahead(file->f_ra.ra_pages);
if (ra_pages) {
- long start;
+ pgoff_t start = 0;
- start = pgoff - ra_pages / 2;
- if (pgoff < 0)
- pgoff = 0;
- do_page_cache_readahead(mapping, file, pgoff, ra_pages);
+ if (pgoff > ra_pages / 2)
+ start = pgoff - ra_pages / 2;
+ do_page_cache_readahead(mapping, file, start, ra_pages);
}
page = find_get_page(mapping, pgoff);
if (!page)
* effect.
*/
error = page_cache_read(file, pgoff);
+ grab_swap_token();
/*
* The page we want has now been added to the page cache.
goto success;
}
- if (!mapping->a_ops->readpage(file, page)) {
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
+ } else if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto retry_find;
}
/*
goto success;
}
ClearPageError(page);
- if (!mapping->a_ops->readpage(file, page)) {
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
+ } else if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto retry_find;
}
/*
* Ok, found a page in the page cache, now we need to check
* that it's up-to-date.
*/
- if (!PageUptodate(page))
+ if (!PageUptodate(page)) {
+ if (nonblock) {
+ page_cache_release(page);
+ return NULL;
+ }
goto page_not_uptodate;
+ }
success:
/*
goto success;
}
- if (!mapping->a_ops->readpage(file, page)) {
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
+ } else if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto retry_find;
}
/*
}
ClearPageError(page);
- if (!mapping->a_ops->readpage(file, page)) {
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
wait_on_page_locked(page);
if (PageUptodate(page))
goto success;
+ } else if (error == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ goto retry_find;
}
/*
return NULL;
}
-static int filemap_populate(struct vm_area_struct *vma,
- unsigned long addr,
- unsigned long len,
- pgprot_t prot,
- unsigned long pgoff,
- int nonblock)
+int filemap_populate(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long len, pgprot_t prot, unsigned long pgoff,
+ int nonblock)
{
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
return -EINVAL;
page = filemap_getpage(file, pgoff, nonblock);
+
+	/*
+	 * XXX: This is wrong: a filesystem I/O error may have happened.
+	 * Fix it the way shmem_populate does, by calling shmem_getpage.
+	 */
if (!page && !nonblock)
return -ENOMEM;
+
if (page) {
err = install_page(mm, vma, addr, page, prot);
if (err) {
page_cache_release(page);
return err;
}
- } else {
- /*
- * If a nonlinear mapping then store the file page offset
- * in the pte.
- */
- if (pgoff != linear_page_index(vma, addr)) {
- err = install_file_pte(mm, vma, addr, pgoff, prot);
- if (err)
- return err;
- }
+ } else if (vma->vm_flags & VM_NONLINEAR) {
+ /* No page was found just because we can't read it in now (being
+ * here implies nonblock != 0), but the page may exist, so set
+ * the PTE to fault it in later. */
+ err = install_file_pte(mm, vma, addr, pgoff, prot);
+ if (err)
+ return err;
}
len -= PAGE_SIZE;
return 0;
}
+EXPORT_SYMBOL(filemap_populate);
-static struct vm_operations_struct generic_file_vm_ops = {
+struct vm_operations_struct generic_file_vm_ops = {
.nopage = filemap_nopage,
.populate = filemap_populate,
};
}
EXPORT_SYMBOL(remove_suid);
-/*
- * Copy as much as we can into the page and return the number of bytes which
- * were sucessfully copied. If a fault is encountered then clear the page
- * out to (offset+bytes) and return the number of bytes which were copied.
- */
-static inline size_t
-filemap_copy_from_user(struct page *page, unsigned long offset,
- const char __user *buf, unsigned bytes)
-{
- char *kaddr;
- int left;
-
- kaddr = kmap_atomic(page, KM_USER0);
- left = __copy_from_user(kaddr + offset, buf, bytes);
- kunmap_atomic(kaddr, KM_USER0);
-
- if (left != 0) {
- /* Do it the slow way */
- kaddr = kmap(page);
- left = __copy_from_user(kaddr + offset, buf, bytes);
- kunmap(page);
- }
- return bytes - left;
-}
-
-static size_t
+size_t
__filemap_copy_from_user_iovec(char *vaddr,
const struct iovec *iov, size_t base, size_t bytes)
{
int copy = min(bytes, iov->iov_len - base);
base = 0;
- left = __copy_from_user(vaddr, buf, copy);
+ left = __copy_from_user_inatomic(vaddr, buf, copy);
copied += copy;
bytes -= copy;
vaddr += copy;
return copied - left;
}
-/*
- * This has the same sideeffects and return value as filemap_copy_from_user().
- * The difference is that on a fault we need to memset the remainder of the
- * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
- * single-segment behaviour.
- */
-static inline size_t
-filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
- const struct iovec *iov, size_t base, size_t bytes)
-{
- char *kaddr;
- size_t copied;
-
- kaddr = kmap_atomic(page, KM_USER0);
- copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
- base, bytes);
- kunmap_atomic(kaddr, KM_USER0);
- if (copied != bytes) {
- kaddr = kmap(page);
- copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
- base, bytes);
- kunmap(page);
- }
- return copied;
-}
-
-static inline void
-filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
-{
- const struct iovec *iov = *iovp;
- size_t base = *basep;
-
- while (bytes) {
- int copy = min(bytes, iov->iov_len - base);
-
- bytes -= copy;
- base += copy;
- if (iov->iov_len == base) {
- iov++;
- base = 0;
- }
- }
- *iovp = iov;
- *basep = base;
-}
-
/*
* Performs necessary checks before doing a write
*
inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
{
struct inode *inode = file->f_mapping->host;
- unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
+ unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
if (unlikely(*pos < 0))
return -EINVAL;
- if (unlikely(file->f_error)) {
- int err = file->f_error;
- file->f_error = 0;
- return err;
- }
-
if (!isblk) {
/* FIXME: this is for backwards compatibility with 2.4 */
if (file->f_flags & O_APPEND)
}
return 0;
}
-
EXPORT_SYMBOL(generic_write_checks);
-/*
- * Write to a file through the page cache.
- * Called under i_sem for S_ISREG files.
- *
- * We put everything into the page cache prior to writing it. This is not a
- * problem when writing full pages. With partial pages, however, we first have
- * to read the data into the cache, then dirty the page, and finally schedule
- * it for writing by marking it dirty.
- * okir@monad.swb.de
- */
ssize_t
-generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos)
+generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long *nr_segs, loff_t pos, loff_t *ppos,
+ size_t count, size_t ocount)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ ssize_t written;
+
+ if (count != ocount)
+ *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
+
+ written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs);
+ if (written > 0) {
+ loff_t end = pos + written;
+ if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
+ i_size_write(inode, end);
+ mark_inode_dirty(inode);
+ }
+ *ppos = end;
+ }
+
+ /*
+ * Sync the fs metadata but not the minor inode changes and
+ * of course not the data as we did direct DMA for the IO.
+ * i_mutex is held, which protects generic_osync_inode() from
+ * livelocking.
+ */
+ if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+ int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
+ if (err < 0)
+ written = err;
+ }
+ if (written == count && !is_sync_kiocb(iocb))
+ written = -EIOCBQUEUED;
+ return written;
+}
+EXPORT_SYMBOL(generic_file_direct_write);
+
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos, loff_t *ppos,
+ size_t count, ssize_t written)
{
struct file *file = iocb->ki_filp;
struct address_space * mapping = file->f_mapping;
struct address_space_operations *a_ops = mapping->a_ops;
- size_t ocount; /* original count */
- size_t count; /* after file limit checks */
struct inode *inode = mapping->host;
long status = 0;
- loff_t pos;
struct page *page;
struct page *cached_page = NULL;
- const int isblk = S_ISBLK(inode->i_mode);
- ssize_t written;
- ssize_t err;
size_t bytes;
struct pagevec lru_pvec;
const struct iovec *cur_iov = iov; /* current iovec */
size_t iov_base = 0; /* offset in the current iovec */
- unsigned long seg;
char __user *buf;
- ocount = 0;
- for (seg = 0; seg < nr_segs; seg++) {
- const struct iovec *iv = &iov[seg];
-
- /*
- * If any segment has a negative length, or the cumulative
- * length ever wraps negative then return -EINVAL.
- */
- ocount += iv->iov_len;
- if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
- return -EINVAL;
- if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
- continue;
- if (seg == 0)
- return -EFAULT;
- nr_segs = seg;
- ocount -= iv->iov_len; /* This segment is no good */
- break;
- }
-
- count = ocount;
- pos = *ppos;
pagevec_init(&lru_pvec, 0);
- /* We can write back this queue in page reclaim */
- current->backing_dev_info = mapping->backing_dev_info;
- written = 0;
-
- err = generic_write_checks(file, &pos, &count, isblk);
- if (err)
- goto out;
-
- if (count == 0)
- goto out;
-
- err = remove_suid(file->f_dentry);
- if (err)
- goto out;
-
- inode_update_time(inode, 1);
-
- /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
- if (unlikely(file->f_flags & O_DIRECT)) {
- if (count != ocount)
- nr_segs = iov_shorten((struct iovec *)iov,
- nr_segs, count);
- written = generic_file_direct_IO(WRITE, iocb,
- iov, pos, nr_segs);
- if (written > 0) {
- loff_t end = pos + written;
- if (end > i_size_read(inode) && !isblk) {
- i_size_write(inode, end);
- mark_inode_dirty(inode);
- }
- *ppos = end;
- }
- /*
- * Sync the fs metadata but not the minor inode changes and
- * of course not the data as we did direct DMA for the IO.
- * i_sem is held, which protects generic_osync_inode() from
- * livelocking.
- */
- if (written >= 0 && file->f_flags & O_SYNC)
- status = generic_osync_inode(inode, mapping, OSYNC_METADATA);
- if (written == count && !is_sync_kiocb(iocb))
- written = -EIOCBQUEUED;
- if (written < 0 || written == count)
- goto out_status;
- /*
- * direct-io write to a hole: fall through to buffered I/O
- * for completing the rest of the request.
- */
- pos += written;
- count -= written;
+ /*
+ * handle partial DIO write. Adjust cur_iov if needed.
+ */
+ if (likely(nr_segs == 1))
+ buf = iov->iov_base + written;
+ else {
+ filemap_set_next_iovec(&cur_iov, &iov_base, written);
+ buf = cur_iov->iov_base + iov_base;
}
- buf = iov->iov_base;
do {
unsigned long index;
unsigned long offset;
+ unsigned long maxlen;
size_t copied;
offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
* same page as we're writing to, without it being marked
* up-to-date.
*/
- fault_in_pages_readable(buf, bytes);
+ maxlen = cur_iov->iov_len - iov_base;
+ if (maxlen > bytes)
+ maxlen = bytes;
+ fault_in_pages_readable(buf, maxlen);
page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
if (!page) {
status = a_ops->prepare_write(file, page, offset, offset+bytes);
if (unlikely(status)) {
loff_t isize = i_size_read(inode);
+
+ if (status != AOP_TRUNCATED_PAGE)
+ unlock_page(page);
+ page_cache_release(page);
+ if (status == AOP_TRUNCATED_PAGE)
+ continue;
/*
* prepare_write() may have instantiated a few blocks
* outside i_size. Trim these off again.
*/
- unlock_page(page);
- page_cache_release(page);
if (pos + bytes > isize)
vmtruncate(inode, isize);
break;
cur_iov, iov_base, bytes);
flush_dcache_page(page);
status = a_ops->commit_write(file, page, offset, offset+bytes);
+ if (status == AOP_TRUNCATED_PAGE) {
+ page_cache_release(page);
+ continue;
+ }
if (likely(copied > 0)) {
if (!status)
status = copied;
count -= status;
pos += status;
buf += status;
- if (unlikely(nr_segs > 1))
+ if (unlikely(nr_segs > 1)) {
filemap_set_next_iovec(&cur_iov,
&iov_base, status);
+ if (count)
+ buf = cur_iov->iov_base +
+ iov_base;
+ } else {
+ iov_base += status;
+ }
}
}
if (unlikely(copied != bytes))
/*
* For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
*/
- if (status >= 0) {
- if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
- status = generic_osync_inode(inode, mapping,
- OSYNC_METADATA|OSYNC_DATA);
- }
+ if (likely(status >= 0)) {
+ if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+ if (!a_ops->writepage || !is_sync_kiocb(iocb))
+ status = generic_osync_inode(inode, mapping,
+ OSYNC_METADATA|OSYNC_DATA);
+ }
+ }
/*
* If we get here for O_DIRECT writes then we must have fallen through
if (unlikely(file->f_flags & O_DIRECT) && written)
status = filemap_write_and_wait(mapping);
-out_status:
- err = written ? written : status;
-out:
pagevec_lru_add(&lru_pvec);
- current->backing_dev_info = 0;
- return err;
+ return written ? written : status;
}
+EXPORT_SYMBOL(generic_file_buffered_write);
+
+static ssize_t
+__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t *ppos)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space * mapping = file->f_mapping;
+ size_t ocount; /* original count */
+ size_t count; /* after file limit checks */
+ struct inode *inode = mapping->host;
+ unsigned long seg;
+ loff_t pos;
+ ssize_t written;
+ ssize_t err;
+ ocount = 0;
+ for (seg = 0; seg < nr_segs; seg++) {
+ const struct iovec *iv = &iov[seg];
+
+ /*
+ * If any segment has a negative length, or the cumulative
+ * length ever wraps negative then return -EINVAL.
+ */
+ ocount += iv->iov_len;
+ if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
+ return -EINVAL;
+ if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
+ continue;
+ if (seg == 0)
+ return -EFAULT;
+ nr_segs = seg;
+ ocount -= iv->iov_len; /* This segment is no good */
+ break;
+ }
+
+ count = ocount;
+ pos = *ppos;
+
+ vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = mapping->backing_dev_info;
+ written = 0;
+
+ err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+ if (err)
+ goto out;
+
+ if (count == 0)
+ goto out;
+
+ err = remove_suid(file->f_dentry);
+ if (err)
+ goto out;
+
+ file_update_time(file);
+
+ /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
+ if (unlikely(file->f_flags & O_DIRECT)) {
+ written = generic_file_direct_write(iocb, iov,
+ &nr_segs, pos, ppos, count, ocount);
+ if (written < 0 || written == count)
+ goto out;
+ /*
+ * direct-io write to a hole: fall through to buffered I/O
+ * for completing the rest of the request.
+ */
+ pos += written;
+ count -= written;
+ }
+
+ written = generic_file_buffered_write(iocb, iov, nr_segs,
+ pos, ppos, count, written);
+out:
+ current->backing_dev_info = NULL;
+ return written ? written : err;
+}
EXPORT_SYMBOL(generic_file_aio_write_nolock);
+ssize_t
+generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t *ppos)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ ssize_t ret;
+ loff_t pos = *ppos;
+
+ ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos);
+
+ if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+ int err;
+
+ err = sync_page_range_nolock(inode, mapping, pos, ret);
+ if (err < 0)
+ ret = err;
+ }
+ return ret;
+}
+
+static ssize_t
+__generic_file_write_nolock(struct file *file, const struct iovec *iov,
+ unsigned long nr_segs, loff_t *ppos)
+{
+ struct kiocb kiocb;
+ ssize_t ret;
+
+ init_sync_kiocb(&kiocb, file);
+ ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
+ if (ret == -EIOCBQUEUED)
+ ret = wait_on_sync_kiocb(&kiocb);
+ return ret;
+}
+
ssize_t
generic_file_write_nolock(struct file *file, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos)
ret = wait_on_sync_kiocb(&kiocb);
return ret;
}
-
EXPORT_SYMBOL(generic_file_write_nolock);
ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
size_t count, loff_t pos)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- ssize_t err;
- struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count };
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ ssize_t ret;
+ struct iovec local_iov = { .iov_base = (void __user *)buf,
+ .iov_len = count };
BUG_ON(iocb->ki_pos != pos);
- down(&inode->i_sem);
- err = generic_file_aio_write_nolock(iocb, &local_iov, 1,
+ mutex_lock(&inode->i_mutex);
+ ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
&iocb->ki_pos);
- up(&inode->i_sem);
+ mutex_unlock(&inode->i_mutex);
- return err;
-}
+ if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+ ssize_t err;
+ err = sync_page_range(inode, mapping, pos, ret);
+ if (err < 0)
+ ret = err;
+ }
+ return ret;
+}
EXPORT_SYMBOL(generic_file_aio_write);
ssize_t generic_file_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- struct inode *inode = file->f_mapping->host;
- ssize_t err;
- struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count };
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ ssize_t ret;
+ struct iovec local_iov = { .iov_base = (void __user *)buf,
+ .iov_len = count };
- down(&inode->i_sem);
- err = generic_file_write_nolock(file, &local_iov, 1, ppos);
- up(&inode->i_sem);
+ mutex_lock(&inode->i_mutex);
+ ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
+ mutex_unlock(&inode->i_mutex);
- return err;
-}
+ if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+ ssize_t err;
+ err = sync_page_range(inode, mapping, *ppos - ret, ret);
+ if (err < 0)
+ ret = err;
+ }
+ return ret;
+}
EXPORT_SYMBOL(generic_file_write);
ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
ret = wait_on_sync_kiocb(&kiocb);
return ret;
}
-
EXPORT_SYMBOL(generic_file_readv);
ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
- unsigned long nr_segs, loff_t * ppos)
+ unsigned long nr_segs, loff_t *ppos)
{
- struct inode *inode = file->f_mapping->host;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
ssize_t ret;
- down(&inode->i_sem);
- ret = generic_file_write_nolock(file, iov, nr_segs, ppos);
- up(&inode->i_sem);
+ mutex_lock(&inode->i_mutex);
+ ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
+ mutex_unlock(&inode->i_mutex);
+
+ if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+ int err;
+
+ err = sync_page_range(inode, mapping, *ppos - ret, ret);
+ if (err < 0)
+ ret = err;
+ }
return ret;
}
-
EXPORT_SYMBOL(generic_file_writev);
/*
- * Called under i_sem for writes to S_ISREG files
+ * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
+ * went wrong during pagecache shootdown.
*/
-ssize_t
+static ssize_t
generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
loff_t offset, unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
ssize_t retval;
+ size_t write_len = 0;
+
+ /*
+ * If it's a write, unmap all mmappings of the file up-front. This
+ * will cause any pte dirty bits to be propagated into the pageframes
+ * for the subsequent filemap_write_and_wait().
+ */
+ if (rw == WRITE) {
+ write_len = iov_length(iov, nr_segs);
+ if (mapping_mapped(mapping))
+ unmap_mapping_range(mapping, offset, write_len, 0);
+ }
retval = filemap_write_and_wait(mapping);
if (retval == 0) {
retval = mapping->a_ops->direct_IO(rw, iocb, iov,
offset, nr_segs);
- if (rw == WRITE && mapping->nrpages)
- invalidate_inode_pages2(mapping);
+ if (rw == WRITE && mapping->nrpages) {
+ pgoff_t end = (offset + write_len - 1)
+ >> PAGE_CACHE_SHIFT;
+ int err = invalidate_inode_pages2_range(mapping,
+ offset >> PAGE_CACHE_SHIFT, end);
+ if (err)
+ retval = err;
+ }
}
return retval;
}
-
-EXPORT_SYMBOL_GPL(generic_file_direct_IO);
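
/*
 * Illustrative worked example (not part of this change): with 4K pages
 * (PAGE_CACHE_SHIFT == 12), a direct write of write_len = 8192 bytes at
 * offset = 4096 invalidates pagecache indices
 *
 *	offset >> PAGE_CACHE_SHIFT                   = 1
 *	(offset + write_len - 1) >> PAGE_CACHE_SHIFT = 2
 *
 * i.e. pages 1 and 2 inclusive - exactly the pages the DMA just overwrote.
 */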