X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=mm%2Ffilemap.c;h=c3659a01e3a4ec58ed9022c75fde17e1af9f99c4;hb=refs%2Fheads%2Fvserver;hp=1abe874e88e1327bc12fc51d4fb98fc642e867b5;hpb=5273a3df6485dc2ad6aa7ddd441b9a21970f003b;p=linux-2.6.git diff --git a/mm/filemap.c b/mm/filemap.c index 1abe874e8..c3659a01e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -9,12 +9,13 @@ * most "normal" filesystems (but you don't /have/ to use this: * the NFS filesystem used to do this differently, for example) */ -#include #include #include #include #include +#include #include +#include #include #include #include @@ -27,19 +28,22 @@ #include #include #include +#include +#include +#include "filemap.h" +#include "internal.h" + /* - * This is needed for the following functions: - * - try_to_release_page - * - block_invalidatepage - * - generic_osync_inode - * * FIXME: remove all knowledge of the buffer layer from the core VM */ #include /* for generic_osync_inode */ -#include #include +static ssize_t +generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t offset, unsigned long nr_segs); + /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. @@ -55,36 +59,49 @@ /* * Lock ordering: * - * ->i_shared_sem (vmtruncate) + * ->i_mmap_lock (vmtruncate) * ->private_lock (__free_pte->__set_page_dirty_buffers) - * ->swap_list_lock - * ->swap_device_lock (exclusive_swap_page, others) - * ->mapping->tree_lock + * ->swap_lock (exclusive_swap_page, others) + * ->mapping->tree_lock * - * ->i_sem - * ->i_shared_sem (truncate->unmap_mapping_range) + * ->i_mutex + * ->i_mmap_lock (truncate->unmap_mapping_range) * * ->mmap_sem - * ->i_shared_sem (various places) + * ->i_mmap_lock + * ->page_table_lock or pte_lock (various, mainly in memory.c) + * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) * * ->mmap_sem * ->lock_page (access_process_vm) * - * ->mmap_sem - * ->i_sem (msync) + * ->i_mutex (generic_file_buffered_write) + * ->mmap_sem (fault_in_pages_readable->do_page_fault) * - * ->i_sem + * ->i_mutex * ->i_alloc_sem (various) * * ->inode_lock * ->sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) * - * ->page_table_lock - * ->swap_device_lock (try_to_unmap_one) + * ->i_mmap_lock + * ->anon_vma.lock (vma_adjust) + * + * ->anon_vma.lock + * ->page_table_lock or pte_lock (anon_vma_prepare and various) + * + * ->page_table_lock or pte_lock + * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->tree_lock (try_to_unmap_one) * ->zone.lru_lock (follow_page->mark_page_accessed) + * ->zone.lru_lock (check_pte_range->isolate_lru_page) + * ->private_lock (page_remove_rmap->set_page_dirty) + * ->tree_lock (page_remove_rmap->set_page_dirty) + * ->inode_lock (page_remove_rmap->set_page_dirty) + * ->inode_lock (zap_pte_range->set_page_dirty) + * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * * ->task->proc_lock * ->dcache_lock (proc_pid_lookup) @@ -102,67 +119,111 @@ void __remove_from_page_cache(struct page *page) radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; mapping->nrpages--; - pagecache_acct(-1); + __dec_zone_page_state(page, NR_FILE_PAGES); } void remove_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; - if (unlikely(!PageLocked(page))) - PAGE_BUG(page); + BUG_ON(!PageLocked(page)); - spin_lock_irq(&mapping->tree_lock); + write_lock_irq(&mapping->tree_lock); __remove_from_page_cache(page); - spin_unlock_irq(&mapping->tree_lock); + write_unlock_irq(&mapping->tree_lock); } -static inline int sync_page(struct page *page) +static int sync_page(void *word) { struct address_space *mapping; + struct page *page; + + page = container_of((unsigned long *)word, struct page, flags); + /* + * page_mapping() is being called without PG_locked held. + * Some knowledge of the state and use of the page is used to + * reduce the requirements down to a memory barrier. + * The danger here is of a stale page_mapping() return value + * indicating a struct address_space different from the one it's + * associated with when it is associated with one. + * After smp_mb(), it's either the correct page_mapping() for + * the page, or an old page_mapping() and the page's own + * page_mapping() has gone NULL. + * The ->sync_page() address_space operation must tolerate + * page_mapping() going NULL. By an amazing coincidence, + * this comes about because none of the users of the page + * in the ->sync_page() methods make essential use of the + * page_mapping(), merely passing the page down to the backing + * device's unplug functions when it's non-NULL, which in turn + * ignore it for all cases but swap, where only page_private(page) is + * of interest. When page_mapping() does go NULL, the entire + * call stack gracefully ignores the page and returns. + * -- wli + */ smp_mb(); mapping = page_mapping(page); - if (mapping) { - if (mapping->a_ops && mapping->a_ops->sync_page) - return mapping->a_ops->sync_page(page); - } else if (PageSwapCache(page)) { - swap_unplug_io_fn(page); - } + if (mapping && mapping->a_ops && mapping->a_ops->sync_page) + mapping->a_ops->sync_page(page); + io_schedule(); return 0; } /** - * filemap_fdatawrite - start writeback against all of a mapping's dirty pages - * @mapping: address space structure to write + * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range + * @mapping: address space structure to write + * @start: offset in bytes where the range starts + * @end: offset in bytes where the range ends (inclusive) + * @sync_mode: enable synchronous operation + * + * Start writeback against all of a mapping's dirty pages that lie + * within the byte offsets inclusive. * * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as - * opposed to a regular memory * cleansing writeback. The difference between + * opposed to a regular memory cleansing writeback. The difference between * these two operations is that if a dirty page/buffer is encountered, it must * be waited upon, and not just skipped over. */ -static int __filemap_fdatawrite(struct address_space *mapping, int sync_mode) +int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end, int sync_mode) { int ret; struct writeback_control wbc = { .sync_mode = sync_mode, .nr_to_write = mapping->nrpages * 2, + .range_start = start, + .range_end = end, }; - if (mapping->backing_dev_info->memory_backed) + if (!mapping_cap_writeback_dirty(mapping)) return 0; ret = do_writepages(mapping, &wbc); return ret; } +static inline int __filemap_fdatawrite(struct address_space *mapping, + int sync_mode) +{ + return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); +} + int filemap_fdatawrite(struct address_space *mapping) { return __filemap_fdatawrite(mapping, WB_SYNC_ALL); } EXPORT_SYMBOL(filemap_fdatawrite); -/* +static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end) +{ + return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); +} + +/** + * filemap_flush - mostly a non-blocking flush + * @mapping: target address_space + * * This is a mostly non-blocking flush. Not suitable for data-integrity * purposes - I/O may not be started against all dirty pages. */ @@ -172,11 +233,16 @@ int filemap_flush(struct address_space *mapping) } EXPORT_SYMBOL(filemap_flush); -/* +/** + * wait_on_page_writeback_range - wait for writeback to complete + * @mapping: target address_space + * @start: beginning page index + * @end: ending page index + * * Wait for writeback to complete against pages indexed by start->end * inclusive */ -static int wait_on_page_writeback_range(struct address_space *mapping, +int wait_on_page_writeback_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { struct pagevec pvec; @@ -189,14 +255,19 @@ static int wait_on_page_writeback_range(struct address_space *mapping, pagevec_init(&pvec, 0); index = start; - while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + while ((index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_WRITEBACK, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { unsigned i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + /* until radix tree lookup accepts end_index */ + if (page->index > end) + continue; + wait_on_page_writeback(page); if (PageError(page)) ret = -EIO; @@ -215,65 +286,179 @@ static int wait_on_page_writeback_range(struct address_space *mapping, } /** - * filemap_fdatawait - walk the list of under-writeback pages of the given - * address space and wait for all of them. + * sync_page_range - write and wait on all pages in the passed range + * @inode: target inode + * @mapping: target address_space + * @pos: beginning offset in pages to write + * @count: number of bytes to write + * + * Write and wait upon all the pages in the passed range. This is a "data + * integrity" operation. It waits upon in-flight writeout before starting and + * waiting upon new writeout. If there was an IO error, return it. + * + * We need to re-take i_mutex during the generic_osync_inode list walk because + * it is otherwise livelockable. + */ +int sync_page_range(struct inode *inode, struct address_space *mapping, + loff_t pos, loff_t count) +{ + pgoff_t start = pos >> PAGE_CACHE_SHIFT; + pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; + int ret; + + if (!mapping_cap_writeback_dirty(mapping) || !count) + return 0; + ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); + if (ret == 0) { + mutex_lock(&inode->i_mutex); + ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); + mutex_unlock(&inode->i_mutex); + } + if (ret == 0) + ret = wait_on_page_writeback_range(mapping, start, end); + return ret; +} +EXPORT_SYMBOL(sync_page_range); + +/** + * sync_page_range_nolock + * @inode: target inode + * @mapping: target address_space + * @pos: beginning offset in pages to write + * @count: number of bytes to write * + * Note: Holding i_mutex across sync_page_range_nolock is not a good idea + * as it forces O_SYNC writers to different parts of the same file + * to be serialised right until io completion. + */ +int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, + loff_t pos, loff_t count) +{ + pgoff_t start = pos >> PAGE_CACHE_SHIFT; + pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; + int ret; + + if (!mapping_cap_writeback_dirty(mapping) || !count) + return 0; + ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); + if (ret == 0) + ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); + if (ret == 0) + ret = wait_on_page_writeback_range(mapping, start, end); + return ret; +} +EXPORT_SYMBOL(sync_page_range_nolock); + +/** + * filemap_fdatawait - wait for all under-writeback pages to complete * @mapping: address space structure to wait for + * + * Walk the list of under-writeback pages of the given address space + * and wait for all of them. */ int filemap_fdatawait(struct address_space *mapping) { - return wait_on_page_writeback_range(mapping, 0, -1); -} + loff_t i_size = i_size_read(mapping->host); + + if (i_size == 0) + return 0; + return wait_on_page_writeback_range(mapping, 0, + (i_size - 1) >> PAGE_CACHE_SHIFT); +} EXPORT_SYMBOL(filemap_fdatawait); int filemap_write_and_wait(struct address_space *mapping) { - int retval = 0; + int err = 0; if (mapping->nrpages) { - retval = filemap_fdatawrite(mapping); - if (retval == 0) - retval = filemap_fdatawait(mapping); + err = filemap_fdatawrite(mapping); + /* + * Even if the above returned error, the pages may be + * written partially (e.g. -ENOSPC), so we wait for it. + * But the -EIO is special case, it may indicate the worst + * thing (e.g. bug) happened, so we avoid waiting for it. + */ + if (err != -EIO) { + int err2 = filemap_fdatawait(mapping); + if (!err) + err = err2; + } } - return retval; + return err; } +EXPORT_SYMBOL(filemap_write_and_wait); -/* - * This function is used to add newly allocated pagecache pages: +/** + * filemap_write_and_wait_range - write out & wait on a file range + * @mapping: the address_space for the pages + * @lstart: offset in bytes where the range starts + * @lend: offset in bytes where the range ends (inclusive) + * + * Write out and wait upon file offsets lstart->lend, inclusive. + * + * Note that `lend' is inclusive (describes the last byte to be written) so + * that this function can be used to write to the very end-of-file (end = -1). + */ +int filemap_write_and_wait_range(struct address_space *mapping, + loff_t lstart, loff_t lend) +{ + int err = 0; + + if (mapping->nrpages) { + err = __filemap_fdatawrite_range(mapping, lstart, lend, + WB_SYNC_ALL); + /* See comment of filemap_write_and_wait() */ + if (err != -EIO) { + int err2 = wait_on_page_writeback_range(mapping, + lstart >> PAGE_CACHE_SHIFT, + lend >> PAGE_CACHE_SHIFT); + if (!err) + err = err2; + } + } + return err; +} + +/** + * add_to_page_cache - add newly allocated pagecache pages + * @page: page to add + * @mapping: the page's address_space + * @offset: page index + * @gfp_mask: page allocation mode + * + * This function is used to add newly allocated pagecache pages; * the page is new, so we can just run SetPageLocked() against it. * The other page state flags were set by rmqueue(). * * This function does not add the page to the LRU. The caller must do that. */ int add_to_page_cache(struct page *page, struct address_space *mapping, - pgoff_t offset, int gfp_mask) + pgoff_t offset, gfp_t gfp_mask) { int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { - page_cache_get(page); - spin_lock_irq(&mapping->tree_lock); + write_lock_irq(&mapping->tree_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); if (!error) { + page_cache_get(page); SetPageLocked(page); page->mapping = mapping; page->index = offset; mapping->nrpages++; - pagecache_acct(1); - } else { - page_cache_release(page); + __inc_zone_page_state(page, NR_FILE_PAGES); } - spin_unlock_irq(&mapping->tree_lock); + write_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); } return error; } - EXPORT_SYMBOL(add_to_page_cache); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, - pgoff_t offset, int gfp_mask) + pgoff_t offset, gfp_t gfp_mask) { int ret = add_to_page_cache(page, mapping, offset, gfp_mask); if (ret == 0) @@ -281,6 +466,24 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, return ret; } +#ifdef CONFIG_NUMA +struct page *__page_cache_alloc(gfp_t gfp) +{ + if (cpuset_do_page_mem_spread()) { + int n = cpuset_mem_spread_node(); + return alloc_pages_node(n, gfp, 0); + } + return alloc_pages(gfp, 0); +} +EXPORT_SYMBOL(__page_cache_alloc); +#endif + +static int __sleep_on_page_lock(void *word) +{ + io_schedule(); + return 0; +} + /* * In order to wait for pages to become available there must be * waitqueues associated with pages. By using a hash table of @@ -298,26 +501,35 @@ static wait_queue_head_t *page_waitqueue(struct page *page) return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; } +static inline void wake_up_page(struct page *page, int bit) +{ + __wake_up_bit(page_waitqueue(page), &page->flags, bit); +} + void fastcall wait_on_page_bit(struct page *page, int bit_nr) { - wait_queue_head_t *waitqueue = page_waitqueue(page); - DEFINE_WAIT(wait); + DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); - do { - prepare_to_wait(waitqueue, &wait, TASK_UNINTERRUPTIBLE); - if (test_bit(bit_nr, &page->flags)) { - sync_page(page); - io_schedule(); - } - } while (test_bit(bit_nr, &page->flags)); - finish_wait(waitqueue, &wait); + if (test_bit(bit_nr, &page->flags)) + __wait_on_bit(page_waitqueue(page), &wait, sync_page, + TASK_UNINTERRUPTIBLE); } - EXPORT_SYMBOL(wait_on_page_bit); +void install_page_waitqueue_monitor(struct page *page, wait_queue_t *monitor) +{ + wait_queue_head_t *q = page_waitqueue(page); + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, monitor); + spin_unlock_irqrestore(&q->lock, flags); +} + +EXPORT_SYMBOL_GPL(install_page_waitqueue_monitor); + /** - * unlock_page() - unlock a locked page - * + * unlock_page - unlock a locked page * @page: the page * * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). @@ -332,105 +544,103 @@ EXPORT_SYMBOL(wait_on_page_bit); */ void fastcall unlock_page(struct page *page) { - wait_queue_head_t *waitqueue = page_waitqueue(page); smp_mb__before_clear_bit(); if (!TestClearPageLocked(page)) BUG(); smp_mb__after_clear_bit(); - if (waitqueue_active(waitqueue)) - wake_up_all(waitqueue); + wake_up_page(page, PG_locked); } - EXPORT_SYMBOL(unlock_page); -EXPORT_SYMBOL(lock_page); -/* - * End writeback against a page. +/** + * end_page_writeback - end writeback against a page + * @page: the page */ void end_page_writeback(struct page *page) { - wait_queue_head_t *waitqueue = page_waitqueue(page); - if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) { if (!test_clear_page_writeback(page)) BUG(); - smp_mb__after_clear_bit(); } - if (waitqueue_active(waitqueue)) - wake_up_all(waitqueue); + smp_mb__after_clear_bit(); + wake_up_page(page, PG_writeback); } - EXPORT_SYMBOL(end_page_writeback); -/* - * Get a lock on the page, assuming we need to sleep to get it. +/** + * __lock_page - get a lock on the page, assuming we need to sleep to get it + * @page: the page to lock * - * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some + * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some * random driver's requestfn sets TASK_RUNNING, we could busywait. However * chances are that on the second loop, the block layer's plug list is empty, * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. */ void fastcall __lock_page(struct page *page) { - wait_queue_head_t *wqh = page_waitqueue(page); - DEFINE_WAIT(wait); + DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); - while (TestSetPageLocked(page)) { - prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); - if (PageLocked(page)) { - sync_page(page); - io_schedule(); - } - } - finish_wait(wqh, &wait); + __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page, + TASK_UNINTERRUPTIBLE); } - EXPORT_SYMBOL(__lock_page); /* - * a rather lightweight function, finding and getting a reference to a - * hashed page atomically. + * Variant of lock_page that does not require the caller to hold a reference + * on the page's mapping. + */ +void fastcall __lock_page_nosync(struct page *page) +{ + DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock, + TASK_UNINTERRUPTIBLE); +} + +/** + * find_get_page - find and get a page reference + * @mapping: the address_space to search + * @offset: the page index + * + * Is there a pagecache struct page at the given (mapping, offset) tuple? + * If yes, increment its refcount and return it; if no, return NULL. */ struct page * find_get_page(struct address_space *mapping, unsigned long offset) { struct page *page; - /* - * We scan the hash list read-only. Addition to and removal from - * the hash-list needs a held write-lock. - */ - spin_lock_irq(&mapping->tree_lock); + read_lock_irq(&mapping->tree_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page) page_cache_get(page); - spin_unlock_irq(&mapping->tree_lock); + read_unlock_irq(&mapping->tree_lock); return page; } - EXPORT_SYMBOL(find_get_page); -/* - * Same as above, but trylock it instead of incrementing the count. +/** + * find_trylock_page - find and lock a page + * @mapping: the address_space to search + * @offset: the page index + * + * Same as find_get_page(), but trylock it instead of incrementing the count. */ struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) { struct page *page; - spin_lock_irq(&mapping->tree_lock); + read_lock_irq(&mapping->tree_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page && TestSetPageLocked(page)) page = NULL; - spin_unlock_irq(&mapping->tree_lock); + read_unlock_irq(&mapping->tree_lock); return page; } - EXPORT_SYMBOL(find_trylock_page); /** * find_lock_page - locate, pin and lock a pagecache page - * - * @mapping - the address_space to search - * @offset - the page index + * @mapping: the address_space to search + * @offset: the page index * * Locates the desired pagecache page, locks it, increments its reference * count and returns its address. @@ -442,36 +652,35 @@ struct page *find_lock_page(struct address_space *mapping, { struct page *page; - spin_lock_irq(&mapping->tree_lock); + read_lock_irq(&mapping->tree_lock); repeat: page = radix_tree_lookup(&mapping->page_tree, offset); if (page) { page_cache_get(page); if (TestSetPageLocked(page)) { - spin_unlock_irq(&mapping->tree_lock); - lock_page(page); - spin_lock_irq(&mapping->tree_lock); + read_unlock_irq(&mapping->tree_lock); + __lock_page(page); + read_lock_irq(&mapping->tree_lock); /* Has the page been truncated while we slept? */ - if (page->mapping != mapping || page->index != offset) { + if (unlikely(page->mapping != mapping || + page->index != offset)) { unlock_page(page); page_cache_release(page); goto repeat; } } } - spin_unlock_irq(&mapping->tree_lock); + read_unlock_irq(&mapping->tree_lock); return page; } - EXPORT_SYMBOL(find_lock_page); /** * find_or_create_page - locate or add a pagecache page - * - * @mapping - the page's address_space - * @index - the page's index into the mapping - * @gfp_mask - page allocation mode + * @mapping: the page's address_space + * @index: the page's index into the mapping + * @gfp_mask: page allocation mode * * Locates a page in the pagecache. If the page is not present, a new page * is allocated using @gfp_mask and is added to the pagecache and to the VM's @@ -485,7 +694,7 @@ EXPORT_SYMBOL(find_lock_page); * memory exhaustion. */ struct page *find_or_create_page(struct address_space *mapping, - unsigned long index, unsigned int gfp_mask) + unsigned long index, gfp_t gfp_mask) { struct page *page, *cached_page = NULL; int err; @@ -509,7 +718,6 @@ repeat: page_cache_release(cached_page); return page; } - EXPORT_SYMBOL(find_or_create_page); /** @@ -534,18 +742,57 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned int i; unsigned int ret; - spin_lock_irq(&mapping->tree_lock); + read_lock_irq(&mapping->tree_lock); ret = radix_tree_gang_lookup(&mapping->page_tree, (void **)pages, start, nr_pages); for (i = 0; i < ret; i++) page_cache_get(pages[i]); - spin_unlock_irq(&mapping->tree_lock); + read_unlock_irq(&mapping->tree_lock); return ret; } -/* +/** + * find_get_pages_contig - gang contiguous pagecache lookup + * @mapping: The address_space to search + * @index: The starting page index + * @nr_pages: The maximum number of pages + * @pages: Where the resulting pages are placed + * + * find_get_pages_contig() works exactly like find_get_pages(), except + * that the returned number of pages are guaranteed to be contiguous. + * + * find_get_pages_contig() returns the number of pages which were found. + */ +unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, + unsigned int nr_pages, struct page **pages) +{ + unsigned int i; + unsigned int ret; + + read_lock_irq(&mapping->tree_lock); + ret = radix_tree_gang_lookup(&mapping->page_tree, + (void **)pages, index, nr_pages); + for (i = 0; i < ret; i++) { + if (pages[i]->mapping == NULL || pages[i]->index != index) + break; + + page_cache_get(pages[i]); + index++; + } + read_unlock_irq(&mapping->tree_lock); + return i; +} + +/** + * find_get_pages_tag - find and return pages that match @tag + * @mapping: the address_space to search + * @index: the starting page index + * @tag: the tag index + * @nr_pages: the maximum number of pages + * @pages: where the resulting pages are placed + * * Like find_get_pages, except we only return pages which are tagged with - * `tag'. We update *index to index the next page for the traversal. + * @tag. We update @index to index the next page for the traversal. */ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, int tag, unsigned int nr_pages, struct page **pages) @@ -553,18 +800,22 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, unsigned int i; unsigned int ret; - spin_lock_irq(&mapping->tree_lock); + read_lock_irq(&mapping->tree_lock); ret = radix_tree_gang_lookup_tag(&mapping->page_tree, (void **)pages, *index, nr_pages, tag); for (i = 0; i < ret; i++) page_cache_get(pages[i]); if (ret) *index = pages[ret - 1]->index + 1; - spin_unlock_irq(&mapping->tree_lock); + read_unlock_irq(&mapping->tree_lock); return ret; } -/* +/** + * grab_cache_page_nowait - returns locked page at given index in given cache + * @mapping: target address_space + * @index: the page index + * * Same as grab_cache_page, but do not wait if the page is unavailable. * This is intended for speculative data generators, where the data can * be regenerated if the page couldn't be grabbed. This routine should @@ -577,7 +828,6 @@ struct page * grab_cache_page_nowait(struct address_space *mapping, unsigned long index) { struct page *page = find_get_page(mapping, index); - int gfp_mask; if (page) { if (!TestSetPageLocked(page)) @@ -585,71 +835,121 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index) page_cache_release(page); return NULL; } - gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS; - page = alloc_pages(gfp_mask, 0); - if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) { + page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); + if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) { page_cache_release(page); page = NULL; } return page; } - EXPORT_SYMBOL(grab_cache_page_nowait); /* + * CD/DVDs are error prone. When a medium error occurs, the driver may fail + * a _large_ part of the i/o request. Imagine the worst scenario: + * + * ---R__________________________________________B__________ + * ^ reading here ^ bad block(assume 4k) + * + * read(R) => miss => readahead(R...B) => media error => frustrating retries + * => failing the whole request => read(R) => read(R+1) => + * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => + * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => + * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... + * + * It is going insane. Fix it by quickly scaling down the readahead size. + */ +static void shrink_readahead_size_eio(struct file *filp, + struct file_ra_state *ra) +{ + if (!ra->ra_pages) + return; + + ra->ra_pages /= 4; +} + +/** + * do_generic_mapping_read - generic file read routine + * @mapping: address_space to be read + * @_ra: file's readahead state + * @filp: the file to read + * @ppos: current file position + * @desc: read_descriptor + * @actor: read method + * * This is a generic file read routine, and uses the - * mapping->a_ops->readpage() function for the actual low-level - * stuff. + * mapping->a_ops->readpage() function for the actual low-level stuff. * * This is really ugly. But the goto's actually try to clarify some * of the logic when it comes to error handling etc. - * - note the struct file * is only passed for the use of readpage + * + * Note the struct file* is only passed for the use of readpage. + * It may be NULL. */ void do_generic_mapping_read(struct address_space *mapping, - struct file_ra_state *ra, - struct file * filp, + struct file_ra_state *_ra, + struct file *filp, loff_t *ppos, - read_descriptor_t * desc, + read_descriptor_t *desc, read_actor_t actor) { struct inode *inode = mapping->host; - unsigned long index, offset; + unsigned long index; + unsigned long end_index; + unsigned long offset; + unsigned long last_index; + unsigned long next_index; + unsigned long prev_index; + unsigned int prev_offset; + loff_t isize; struct page *cached_page; int error; + struct file_ra_state ra = *_ra; cached_page = NULL; index = *ppos >> PAGE_CACHE_SHIFT; + next_index = index; + prev_index = ra.prev_page; + prev_offset = ra.offset; + last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; + isize = i_size_read(inode); + if (!isize) + goto out; + + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; for (;;) { struct page *page; - unsigned long end_index, nr, ret; - loff_t isize = i_size_read(inode); + unsigned long nr, ret; - end_index = isize >> PAGE_CACHE_SHIFT; - - if (index > end_index) - break; + /* nr is the maximum number of bytes to copy from this page */ nr = PAGE_CACHE_SIZE; - if (index == end_index) { - nr = isize & ~PAGE_CACHE_MASK; - if (nr <= offset) - break; + if (index >= end_index) { + if (index > end_index) + goto out; + nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; + if (nr <= offset) { + goto out; + } } + nr = nr - offset; cond_resched(); - page_cache_readahead(mapping, ra, filp, index); + if (index == next_index) + next_index = page_cache_readahead(mapping, &ra, filp, + index, last_index - index); - nr = nr - offset; find_page: page = find_get_page(mapping, index); if (unlikely(page == NULL)) { - handle_ra_miss(mapping, ra, index); + handle_ra_miss(mapping, &ra, index); goto no_cached_page; } if (!PageUptodate(page)) goto page_not_up_to_date; page_ok: + /* If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. @@ -658,10 +958,12 @@ page_ok: flush_dcache_page(page); /* - * Mark the page accessed if we read the beginning. + * When a sequential read accesses a page several times, + * only mark it as accessed the first time. */ - if (!offset) + if (prev_index != index || offset != prev_offset) mark_page_accessed(page); + prev_index = index; /* * Ok, we have the page, and it's up-to-date, so @@ -677,20 +979,18 @@ page_ok: offset += ret; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; + prev_offset = ra.offset = offset; page_cache_release(page); if (ret == nr && desc->count) continue; - break; + goto out; page_not_up_to_date: - if (PageUptodate(page)) - goto page_ok; - /* Get exclusive access to the page ... */ lock_page(page); - /* Did it get unhashed before we got the lock? */ + /* Did it get truncated before we got the lock? */ if (!page->mapping) { unlock_page(page); page_cache_release(page); @@ -704,22 +1004,68 @@ page_not_up_to_date: } readpage: - /* ... and start the actual read. The read will unlock the page. */ + /* Start the actual read. The read will unlock the page. */ error = mapping->a_ops->readpage(filp, page); - if (!error) { - if (PageUptodate(page)) - goto page_ok; - wait_on_page_locked(page); - if (PageUptodate(page)) - goto page_ok; - error = -EIO; + if (unlikely(error)) { + if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto find_page; + } + goto readpage_error; + } + + if (!PageUptodate(page)) { + lock_page(page); + if (!PageUptodate(page)) { + if (page->mapping == NULL) { + /* + * invalidate_inode_pages got it + */ + unlock_page(page); + page_cache_release(page); + goto find_page; + } + unlock_page(page); + error = -EIO; + shrink_readahead_size_eio(filp, &ra); + goto readpage_error; + } + unlock_page(page); + } + + /* + * i_size must be checked after we have done ->readpage. + * + * Checking i_size after the readpage allows us to calculate + * the correct value for "nr", which means the zero-filled + * part of the page is not copied back to userspace (unless + * another truncate extends the file - this is desired though). + */ + isize = i_size_read(inode); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) { + page_cache_release(page); + goto out; + } + + /* nr is the maximum number of bytes to copy from this page */ + nr = PAGE_CACHE_SIZE; + if (index == end_index) { + nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; + if (nr <= offset) { + page_cache_release(page); + goto out; + } } + nr = nr - offset; + goto page_ok; +readpage_error: /* UHHUH! A synchronous read error occurred. Report it */ desc->error = error; page_cache_release(page); - break; + goto out; no_cached_page: /* @@ -730,7 +1076,7 @@ no_cached_page: cached_page = page_cache_alloc_cold(mapping); if (!cached_page) { desc->error = -ENOMEM; - break; + goto out; } } error = add_to_page_cache_lru(cached_page, mapping, @@ -739,19 +1085,22 @@ no_cached_page: if (error == -EEXIST) goto find_page; desc->error = error; - break; + goto out; } page = cached_page; cached_page = NULL; goto readpage; } +out: + *_ra = ra; + *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; if (cached_page) page_cache_release(cached_page); - file_accessed(filp); + if (filp) + file_accessed(filp); } - EXPORT_SYMBOL(do_generic_mapping_read); int file_read_actor(read_descriptor_t *desc, struct page *page, @@ -767,9 +1116,10 @@ int file_read_actor(read_descriptor_t *desc, struct page *page, * Faults on the destination of a read are common, so do it before * taking the kmap. */ - if (!fault_in_pages_writeable(desc->buf, size)) { + if (!fault_in_pages_writeable(desc->arg.buf, size)) { kaddr = kmap_atomic(page, KM_USER0); - left = __copy_to_user(desc->buf, kaddr + offset, size); + left = __copy_to_user_inatomic(desc->arg.buf, + kaddr + offset, size); kunmap_atomic(kaddr, KM_USER0); if (left == 0) goto success; @@ -777,7 +1127,7 @@ int file_read_actor(read_descriptor_t *desc, struct page *page, /* Do it the slow way */ kaddr = kmap(page); - left = __copy_to_user(desc->buf, kaddr + offset, size); + left = __copy_to_user(desc->arg.buf, kaddr + offset, size); kunmap(page); if (left) { @@ -787,22 +1137,29 @@ int file_read_actor(read_descriptor_t *desc, struct page *page, success: desc->count = count - size; desc->written += size; - desc->buf += size; + desc->arg.buf += size; return size; } -/* +/** + * generic_file_aio_read - generic filesystem read routine + * @iocb: kernel I/O control block + * @iov: io vector request + * @nr_segs: number of segments in the iovec + * @pos: current file position + * * This is the "read()" routine for all filesystems * that can use the page cache directly. */ ssize_t -__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) +generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { struct file *filp = iocb->ki_filp; ssize_t retval; unsigned long seg; size_t count; + loff_t *ppos = &iocb->ki_pos; count = 0; for (seg = 0; seg < nr_segs; seg++) { @@ -826,7 +1183,7 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (filp->f_flags & O_DIRECT) { - loff_t pos = *ppos, size; + loff_t size; struct address_space *mapping; struct inode *inode; @@ -839,13 +1196,13 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, if (pos < size) { retval = generic_file_direct_IO(READ, iocb, iov, pos, nr_segs); - if (retval >= 0 && !is_sync_kiocb(iocb)) - retval = -EIOCBQUEUED; if (retval > 0) *ppos = pos + retval; } - file_accessed(filp); - goto out; + if (likely(retval != 0)) { + file_accessed(filp); + goto out; + } } retval = 0; @@ -854,15 +1211,15 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, read_descriptor_t desc; desc.written = 0; - desc.buf = iov[seg].iov_base; + desc.arg.buf = iov[seg].iov_base; desc.count = iov[seg].iov_len; if (desc.count == 0) continue; desc.error = 0; do_generic_file_read(filp,ppos,&desc,file_read_actor); retval += desc.written; - if (!retval) { - retval = desc.error; + if (desc.error) { + retval = retval ?: desc.error; break; } } @@ -870,41 +1227,13 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, out: return retval; } - -EXPORT_SYMBOL(__generic_file_aio_read); - -ssize_t -generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos) -{ - struct iovec local_iov = { .iov_base = buf, .iov_len = count }; - - BUG_ON(iocb->ki_pos != pos); - return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos); -} - EXPORT_SYMBOL(generic_file_aio_read); -ssize_t -generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) -{ - struct iovec local_iov = { .iov_base = buf, .iov_len = count }; - struct kiocb kiocb; - ssize_t ret; - - init_sync_kiocb(&kiocb, filp); - ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); - return ret; -} - -EXPORT_SYMBOL(generic_file_read); - int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) { ssize_t written; unsigned long count = desc->count; - struct file *file = (struct file *) desc->buf; + struct file *file = desc->arg.data; if (size > count) size = count; @@ -920,8 +1249,33 @@ int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long o return written; } +/* FIXME: It would be as simple as this, if we had a (void __user*) to write. + * We already have a kernel buffer, so it should be even simpler, right? ;) + * + * Yes, sorta. After duplicating the complete path of generic_file_write(), + * at least some special cases could be removed, so the copy is simpler than + * the original. But it remains a copy, so overall complexity increases. + */ +static ssize_t +generic_kernel_file_write(struct file *, const char *, size_t, loff_t *); + +ssize_t generic_file_sendpage(struct file *file, struct page *page, + int offset, size_t size, loff_t *ppos, int more) +{ + ssize_t ret; + char *kaddr; + + kaddr = kmap(page); + ret = generic_kernel_file_write(file, kaddr + offset, size, ppos); + kunmap(page); + + return ret; +} + +EXPORT_SYMBOL(generic_file_sendpage); + ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos, - size_t count, read_actor_t actor, void __user *target) + size_t count, read_actor_t actor, void *target) { read_descriptor_t desc; @@ -930,7 +1284,7 @@ ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos, desc.written = 0; desc.count = count; - desc.buf = target; + desc.arg.data = target; desc.error = 0; do_generic_file_read(in_file, ppos, &desc, actor); @@ -938,7 +1292,6 @@ ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos, return desc.written; return desc.error; } - EXPORT_SYMBOL(generic_file_sendfile); static ssize_t @@ -974,40 +1327,47 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) } #ifdef CONFIG_MMU -/* - * This adds the requested page to the page cache if it isn't already there, - * and schedules an I/O to read in its contents from disk. - */ static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); +/** + * page_cache_read - adds requested page to the page cache if not already there + * @file: file to read + * @offset: page index + * + * This adds the requested page to the page cache if it isn't already there, + * and schedules an I/O to read in its contents from disk. + */ static int fastcall page_cache_read(struct file * file, unsigned long offset) { struct address_space *mapping = file->f_mapping; struct page *page; - int error; + int ret; - page = page_cache_alloc_cold(mapping); - if (!page) - return -ENOMEM; + do { + page = page_cache_alloc_cold(mapping); + if (!page) + return -ENOMEM; + + ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); + if (ret == 0) + ret = mapping->a_ops->readpage(file, page); + else if (ret == -EEXIST) + ret = 0; /* losing race to add is OK */ - error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); - if (!error) { - error = mapping->a_ops->readpage(file, page); page_cache_release(page); - return error; - } - /* - * We arrive here in the unlikely event that someone - * raced with us and added our page to the cache first - * or we are out of memory for radix-tree nodes. - */ - page_cache_release(page); - return error == -EEXIST ? 0 : error; + } while (ret == AOP_TRUNCATED_PAGE); + + return ret; } #define MMAP_LOTSAMISS (100) -/* +/** + * filemap_nopage - read in file data for page fault handling + * @area: the applicable vm_area + * @address: target address to read in + * @type: returned with VM_FAULT_{MINOR,MAJOR} if not %NULL + * * filemap_nopage() is invoked via the vma operations vector for a * mapped memory region to read in file data during a page fault. * @@ -1015,7 +1375,8 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset) * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. */ -struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int *type) +struct page *filemap_nopage(struct vm_area_struct *area, + unsigned long address, int *type) { int error; struct file *file = area->vm_file; @@ -1023,11 +1384,10 @@ struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; struct page *page; - unsigned long size, pgoff, endoff; + unsigned long size, pgoff; int did_readaround = 0, majmin = VM_FAULT_MINOR; - pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; - endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; + pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; retry_all: size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; @@ -1038,13 +1398,6 @@ retry_all: if (VM_RandomReadHint(area)) goto no_cached_page; - /* - * The "size" of the file, as far as mmap is concerned, isn't bigger - * than the mapping - */ - if (size > endoff) - size = endoff; - /* * The readahead code wants to be told about each and every page * so it can build and shrink its windows appropriately @@ -1052,7 +1405,7 @@ retry_all: * For sequential accesses, we use the generic readahead logic. */ if (VM_SequentialReadHint(area)) - page_cache_readahead(mapping, ra, file, pgoff); + page_cache_readahead(mapping, ra, file, pgoff, 1); /* * Do we have something in the page cache already? @@ -1081,17 +1434,16 @@ retry_find: */ if (!did_readaround) { majmin = VM_FAULT_MAJOR; - inc_page_state(pgmajfault); + count_vm_event(PGMAJFAULT); } did_readaround = 1; ra_pages = max_sane_readahead(file->f_ra.ra_pages); if (ra_pages) { - long start; + pgoff_t start = 0; - start = pgoff - ra_pages / 2; - if (pgoff < 0) - pgoff = 0; - do_page_cache_readahead(mapping, file, pgoff, ra_pages); + if (pgoff > ra_pages / 2) + start = pgoff - ra_pages / 2; + do_page_cache_readahead(mapping, file, start, ra_pages); } page = find_get_page(mapping, pgoff); if (!page) @@ -1123,7 +1475,7 @@ outside_data_content: * accessible.. */ if (area->vm_mm == current->mm) - return NULL; + return NOPAGE_SIGBUS; /* Fall through to the non-read-ahead case */ no_cached_page: /* @@ -1147,12 +1499,12 @@ no_cached_page: */ if (error == -ENOMEM) return NOPAGE_OOM; - return NULL; + return NOPAGE_SIGBUS; page_not_uptodate: if (!did_readaround) { majmin = VM_FAULT_MAJOR; - inc_page_state(pgmajfault); + count_vm_event(PGMAJFAULT); } lock_page(page); @@ -1169,10 +1521,14 @@ page_not_uptodate: goto success; } - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* @@ -1196,20 +1552,24 @@ page_not_uptodate: goto success; } ClearPageError(page); - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* * Things didn't work out. Return zero to tell the * mm layer so, possibly freeing the page cache page first. */ + shrink_readahead_size_eio(file, ra); page_cache_release(page); - return NULL; + return NOPAGE_SIGBUS; } - EXPORT_SYMBOL(filemap_nopage); static struct page * filemap_getpage(struct file *file, unsigned long pgoff, @@ -1234,8 +1594,13 @@ retry_find: * Ok, found a page in the page cache, now we need to check * that it's up-to-date. */ - if (!PageUptodate(page)) + if (!PageUptodate(page)) { + if (nonblock) { + page_cache_release(page); + return NULL; + } goto page_not_uptodate; + } success: /* @@ -1265,7 +1630,7 @@ no_cached_page: page_not_uptodate: lock_page(page); - /* Did it get unhashed while we waited for it? */ + /* Did it get truncated while we waited for it? */ if (!page->mapping) { unlock_page(page); goto err; @@ -1277,10 +1642,14 @@ page_not_uptodate: goto success; } - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* @@ -1303,10 +1672,14 @@ page_not_uptodate: } ClearPageError(page); - if (!mapping->a_ops->readpage(file, page)) { + error = mapping->a_ops->readpage(file, page); + if (!error) { wait_on_page_locked(page); if (PageUptodate(page)) goto success; + } else if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto retry_find; } /* @@ -1319,12 +1692,9 @@ err: return NULL; } -static int filemap_populate(struct vm_area_struct *vma, - unsigned long addr, - unsigned long len, - pgprot_t prot, - unsigned long pgoff, - int nonblock) +int filemap_populate(struct vm_area_struct *vma, unsigned long addr, + unsigned long len, pgprot_t prot, unsigned long pgoff, + int nonblock) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; @@ -1344,24 +1714,25 @@ repeat: return -EINVAL; page = filemap_getpage(file, pgoff, nonblock); + + /* XXX: This is wrong, a filesystem I/O error may have happened. Fix that as + * done in shmem_populate calling shmem_getpage */ if (!page && !nonblock) return -ENOMEM; + if (page) { err = install_page(mm, vma, addr, page, prot); if (err) { page_cache_release(page); return err; } - } else { - /* - * If a nonlinear mapping then store the file page offset - * in the pte. - */ - if (pgoff != linear_page_index(vma, addr)) { - err = install_file_pte(mm, vma, addr, pgoff, prot); - if (err) - return err; - } + } else if (vma->vm_flags & VM_NONLINEAR) { + /* No page was found just because we can't read it in now (being + * here implies nonblock != 0), but the page may exist, so set + * the PTE to fault it in later. */ + err = install_file_pte(mm, vma, addr, pgoff, prot); + if (err) + return err; } len -= PAGE_SIZE; @@ -1372,8 +1743,9 @@ repeat: return 0; } +EXPORT_SYMBOL(filemap_populate); -static struct vm_operations_struct generic_file_vm_ops = { +struct vm_operations_struct generic_file_vm_ops = { .nopage = filemap_nopage, .populate = filemap_populate, }; @@ -1451,7 +1823,13 @@ repeat: return page; } -/* +/** + * read_cache_page - read into page cache, fill it if needed + * @mapping: the page's address_space + * @index: the page index + * @filler: function to perform the read + * @data: destination for read data + * * Read into the page cache. If a page already exists, * and PageUptodate() is not set, try to fill the page. */ @@ -1489,7 +1867,6 @@ retry: out: return page; } - EXPORT_SYMBOL(read_cache_page); /* @@ -1532,11 +1909,10 @@ repeat: * if suid or (sgid and xgrp) * remove privs */ -int remove_suid(struct dentry *dentry) +int should_remove_suid(struct dentry *dentry) { mode_t mode = dentry->d_inode->i_mode; int kill = 0; - int result = 0; /* suid always must be killed */ if (unlikely(mode & S_ISUID)) @@ -1549,43 +1925,47 @@ int remove_suid(struct dentry *dentry) if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) kill |= ATTR_KILL_SGID; - if (unlikely(kill && !capable(CAP_FSETID))) { - struct iattr newattrs; + if (unlikely(kill && !capable(CAP_FSETID))) + return kill; - newattrs.ia_valid = ATTR_FORCE | kill; - result = notify_change(dentry, &newattrs); - } - return result; + return 0; +} +EXPORT_SYMBOL(should_remove_suid); + +int __remove_suid(struct dentry *dentry, int kill) +{ + struct iattr newattrs; + + newattrs.ia_valid = ATTR_FORCE | kill; + return notify_change(dentry, &newattrs); +} + +int remove_suid(struct dentry *dentry) +{ + int kill = should_remove_suid(dentry); + + if (unlikely(kill)) + return __remove_suid(dentry, kill); + + return 0; } EXPORT_SYMBOL(remove_suid); -/* - * Copy as much as we can into the page and return the number of bytes which - * were sucessfully copied. If a fault is encountered then clear the page - * out to (offset+bytes) and return the number of bytes which were copied. - */ static inline size_t -filemap_copy_from_user(struct page *page, unsigned long offset, - const char __user *buf, unsigned bytes) +filemap_copy_from_kernel(struct page *page, unsigned long offset, + const char *buf, unsigned bytes) { char *kaddr; - int left; - kaddr = kmap_atomic(page, KM_USER0); - left = __copy_from_user(kaddr + offset, buf, bytes); - kunmap_atomic(kaddr, KM_USER0); + kaddr = kmap(page); + memcpy(kaddr + offset, buf, bytes); + kunmap(page); - if (left != 0) { - /* Do it the slow way */ - kaddr = kmap(page); - left = __copy_from_user(kaddr + offset, buf, bytes); - kunmap(page); - } - return bytes - left; + return bytes; } -static size_t -__filemap_copy_from_user_iovec(char *vaddr, +size_t +__filemap_copy_from_user_iovec_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) { size_t copied = 0, left = 0; @@ -1595,89 +1975,33 @@ __filemap_copy_from_user_iovec(char *vaddr, int copy = min(bytes, iov->iov_len - base); base = 0; - left = __copy_from_user(vaddr, buf, copy); + left = __copy_from_user_inatomic_nocache(vaddr, buf, copy); copied += copy; bytes -= copy; vaddr += copy; iov++; - if (unlikely(left)) { - /* zero the rest of the target like __copy_from_user */ - if (bytes) - memset(vaddr, 0, bytes); + if (unlikely(left)) break; - } } return copied - left; } -/* - * This has the same sideeffects and return value as filemap_copy_from_user(). - * The difference is that on a fault we need to memset the remainder of the - * page (out to offset+bytes), to emulate filemap_copy_from_user()'s - * single-segment behaviour. - */ -static inline size_t -filemap_copy_from_user_iovec(struct page *page, unsigned long offset, - const struct iovec *iov, size_t base, size_t bytes) -{ - char *kaddr; - size_t copied; - - kaddr = kmap_atomic(page, KM_USER0); - copied = __filemap_copy_from_user_iovec(kaddr + offset, iov, - base, bytes); - kunmap_atomic(kaddr, KM_USER0); - if (copied != bytes) { - kaddr = kmap(page); - copied = __filemap_copy_from_user_iovec(kaddr + offset, iov, - base, bytes); - kunmap(page); - } - return copied; -} - -static inline void -filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) -{ - const struct iovec *iov = *iovp; - size_t base = *basep; - - while (bytes) { - int copy = min(bytes, iov->iov_len - base); - - bytes -= copy; - base += copy; - if (iov->iov_len == base) { - iov++; - base = 0; - } - } - *iovp = iov; - *basep = base; -} - /* * Performs necessary checks before doing a write * - * Can adjust writing position aor amount of bytes to write. + * Can adjust writing position or amount of bytes to write. * Returns appropriate error code that caller should return or * zero in case that write should be allowed. */ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) { struct inode *inode = file->f_mapping->host; - unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; if (unlikely(*pos < 0)) return -EINVAL; - if (unlikely(file->f_error)) { - int err = file->f_error; - file->f_error = 0; - return err; - } - if (!isblk) { /* FIXME: this is for backwards compatibility with 2.4 */ if (file->f_flags & O_APPEND) @@ -1727,6 +2051,7 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i if (unlikely(*pos + *count > inode->i_sb->s_maxbytes)) *count = inode->i_sb->s_maxbytes - *pos; } else { +#ifdef CONFIG_BLOCK loff_t isize; if (bdev_read_only(I_BDEV(inode))) return -EPERM; @@ -1738,32 +2063,335 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i if (*pos + *count > isize) *count = isize - *pos; +#else + return -EPERM; +#endif } return 0; } - EXPORT_SYMBOL(generic_write_checks); +ssize_t +generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long *nr_segs, loff_t pos, loff_t *ppos, + size_t count, size_t ocount) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t written; + + if (count != ocount) + *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); + + written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs); + if (written > 0) { + loff_t end = pos + written; + if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { + i_size_write(inode, end); + mark_inode_dirty(inode); + } + *ppos = end; + } + + /* + * Sync the fs metadata but not the minor inode changes and + * of course not the data as we did direct DMA for the IO. + * i_mutex is held, which protects generic_osync_inode() from + * livelocking. AIO O_DIRECT ops attempt to sync metadata here. + */ + if ((written >= 0 || written == -EIOCBQUEUED) && + ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); + if (err < 0) + written = err; + } + return written; +} +EXPORT_SYMBOL(generic_file_direct_write); + +ssize_t +generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, loff_t *ppos, + size_t count, ssize_t written) +{ + struct file *file = iocb->ki_filp; + struct address_space * mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + struct inode *inode = mapping->host; + long status = 0; + struct page *page; + struct page *cached_page = NULL; + size_t bytes; + struct pagevec lru_pvec; + const struct iovec *cur_iov = iov; /* current iovec */ + size_t iov_base = 0; /* offset in the current iovec */ + char __user *buf; + + pagevec_init(&lru_pvec, 0); + + /* + * handle partial DIO write. Adjust cur_iov if needed. + */ + if (likely(nr_segs == 1)) + buf = iov->iov_base + written; + else { + filemap_set_next_iovec(&cur_iov, &iov_base, written); + buf = cur_iov->iov_base + iov_base; + } + + do { + unsigned long index; + unsigned long offset; + size_t copied; + + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + index = pos >> PAGE_CACHE_SHIFT; + bytes = PAGE_CACHE_SIZE - offset; + + /* Limit the size of the copy to the caller's write size */ + bytes = min(bytes, count); + + /* + * Limit the size of the copy to that of the current segment, + * because fault_in_pages_readable() doesn't know how to walk + * segments. + */ + bytes = min(bytes, cur_iov->iov_len - iov_base); + + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + */ + fault_in_pages_readable(buf, bytes); + + page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); + if (!page) { + status = -ENOMEM; + break; + } + + if (unlikely(bytes == 0)) { + status = 0; + copied = 0; + goto zero_length_segment; + } + + status = a_ops->prepare_write(file, page, offset, offset+bytes); + if (unlikely(status)) { + loff_t isize = i_size_read(inode); + + if (status != AOP_TRUNCATED_PAGE) + unlock_page(page); + page_cache_release(page); + if (status == AOP_TRUNCATED_PAGE) + continue; + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. + */ + if (pos + bytes > isize) + vmtruncate(inode, isize); + break; + } + if (likely(nr_segs == 1)) + copied = filemap_copy_from_user(page, offset, + buf, bytes); + else + copied = filemap_copy_from_user_iovec(page, offset, + cur_iov, iov_base, bytes); + flush_dcache_page(page); + status = a_ops->commit_write(file, page, offset, offset+bytes); + if (status == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + continue; + } +zero_length_segment: + if (likely(copied >= 0)) { + if (!status) + status = copied; + + if (status >= 0) { + written += status; + count -= status; + pos += status; + buf += status; + if (unlikely(nr_segs > 1)) { + filemap_set_next_iovec(&cur_iov, + &iov_base, status); + if (count) + buf = cur_iov->iov_base + + iov_base; + } else { + iov_base += status; + } + } + } + if (unlikely(copied != bytes)) + if (status >= 0) + status = -EFAULT; + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + if (status < 0) + break; + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } while (count); + *ppos = pos; + + if (cached_page) + page_cache_release(cached_page); + + /* + * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC + */ + if (likely(status >= 0)) { + if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + if (!a_ops->writepage || !is_sync_kiocb(iocb)) + status = generic_osync_inode(inode, mapping, + OSYNC_METADATA|OSYNC_DATA); + } + } + + /* + * If we get here for O_DIRECT writes then we must have fallen through + * to buffered writes (block instantiation inside i_size). So we sync + * the file data here, to try to honour O_DIRECT expectations. + */ + if (unlikely(file->f_flags & O_DIRECT) && written) + status = filemap_write_and_wait(mapping); + + pagevec_lru_add(&lru_pvec); + return written ? written : status; +} +EXPORT_SYMBOL(generic_file_buffered_write); + /* - * Write to a file through the page cache. - * Called under i_sem for S_ISREG files. - * - * We put everything into the page cache prior to writing it. This is not a - * problem when writing full pages. With partial pages, however, we first have - * to read the data into the cache, then dirty the page, and finally schedule - * it for writing by marking it dirty. - * okir@monad.swb.de + * This writes the data from the source page to the specified page offset in + * the nominated file + * - the source page does not need to have any association with the file or the + * page offset */ -ssize_t -generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) +int +generic_file_buffered_write_one_kernel_page(struct address_space *mapping, + pgoff_t index, + struct page *src) +{ + const struct address_space_operations *a_ops = mapping->a_ops; + struct pagevec lru_pvec; + struct page *page, *cached_page = NULL; + long status = 0; + + pagevec_init(&lru_pvec, 0); + +#if 0 + if (mapping->tree_lock.magic != RWLOCK_MAGIC) + printk("RWLOCK magic incorrect: %x != %x\n", + mapping->tree_lock.magic, RWLOCK_MAGIC); +#endif + + page = __grab_cache_page(mapping, index, &cached_page, &lru_pvec); + if (!page) { + BUG_ON(cached_page); + return -ENOMEM; + } + + status = a_ops->prepare_write(NULL, page, 0, PAGE_CACHE_SIZE); + if (unlikely(status)) { + loff_t isize = i_size_read(mapping->host); + + if (status != AOP_TRUNCATED_PAGE) + unlock_page(page); + page_cache_release(page); + if (status == AOP_TRUNCATED_PAGE) + goto sync; + + /* prepare_write() may have instantiated a few blocks outside + * i_size. Trim these off again. + */ + if ((1ULL << (index + 1)) > isize) + vmtruncate(mapping->host, isize); + goto sync; + } + + copy_highpage(page, src); + flush_dcache_page(page); + + status = a_ops->commit_write(NULL, page, 0, PAGE_CACHE_SIZE); + if (status == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto sync; + } + + if (status > 0) + status = 0; + + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + if (status < 0) + return status; + + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + +sync: + if (cached_page) + page_cache_release(cached_page); + + /* the caller must handle O_SYNC themselves, but we handle S_SYNC and + * MS_SYNCHRONOUS here */ + if (unlikely(IS_SYNC(mapping->host)) && !a_ops->writepage) + status = generic_osync_inode(mapping->host, mapping, + OSYNC_METADATA | OSYNC_DATA); + + /* the caller must handle O_DIRECT for themselves */ + + pagevec_lru_add(&lru_pvec); + return status; +} +EXPORT_SYMBOL(generic_file_buffered_write_one_kernel_page); + +static inline void +filemap_set_next_kvec(const struct kvec **iovp, size_t *basep, size_t bytes) +{ + const struct kvec *iov = *iovp; + size_t base = *basep; + + while (bytes) { + int copy = min(bytes, iov->iov_len - base); + + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + base = 0; + } + } + *iovp = iov; + *basep = base; +} + +/* + * TODO: + * This largely tries to copy generic_file_aio_write_nolock(), although it + * doesn't have to be nearly as generic. A real cleanup should either + * merge this into generic_file_aio_write_nolock() as well or keep it special + * and remove as much code as possible. + */ +static ssize_t +generic_kernel_file_aio_write_nolock(struct kiocb *iocb, const struct kvec*iov, + unsigned long nr_segs, loff_t *ppos) { struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; - struct address_space_operations *a_ops = mapping->a_ops; + const struct address_space_operations *a_ops = mapping->a_ops; size_t ocount; /* original count */ size_t count; /* after file limit checks */ - struct inode *inode = mapping->host; + struct inode *inode = mapping->host; long status = 0; loff_t pos; struct page *page; @@ -1773,14 +2401,14 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, ssize_t err; size_t bytes; struct pagevec lru_pvec; - const struct iovec *cur_iov = iov; /* current iovec */ - size_t iov_base = 0; /* offset in the current iovec */ + const struct kvec *cur_iov = iov; /* current kvec */ + size_t iov_base = 0; /* offset in the current kvec */ unsigned long seg; - char __user *buf; + char *buf; ocount = 0; for (seg = 0; seg < nr_segs; seg++) { - const struct iovec *iv = &iov[seg]; + const struct kvec *iv = &iov[seg]; /* * If any segment has a negative length, or the cumulative @@ -1789,13 +2417,6 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, ocount += iv->iov_len; if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) return -EINVAL; - if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) - continue; - if (seg == 0) - return -EFAULT; - nr_segs = seg; - ocount -= iv->iov_len; /* This segment is no good */ - break; } count = ocount; @@ -1810,49 +2431,15 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, if (err) goto out; + if (count == 0) goto out; - err = remove_suid(file->f_dentry); - if (err) - goto out; + remove_suid(file->f_dentry); + file_update_time(file); - inode_update_time(inode, 1); - - /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ - if (unlikely(file->f_flags & O_DIRECT)) { - if (count != ocount) - nr_segs = iov_shorten((struct iovec *)iov, - nr_segs, count); - written = generic_file_direct_IO(WRITE, iocb, - iov, pos, nr_segs); - if (written > 0) { - loff_t end = pos + written; - if (end > i_size_read(inode) && !isblk) { - i_size_write(inode, end); - mark_inode_dirty(inode); - } - *ppos = end; - } - /* - * Sync the fs metadata but not the minor inode changes and - * of course not the data as we did direct DMA for the IO. - * i_sem is held, which protects generic_osync_inode() from - * livelocking. - */ - if (written >= 0 && file->f_flags & O_SYNC) - status = generic_osync_inode(inode, mapping, OSYNC_METADATA); - if (written == count && !is_sync_kiocb(iocb)) - written = -EIOCBQUEUED; - if (written < 0 || written == count) - goto out_status; - /* - * direct-io write to a hole: fall through to buffered I/O - * for completing the rest of the request. - */ - pos += written; - count -= written; - } + /* There is no sane reason to use O_DIRECT */ + BUG_ON(file->f_flags & O_DIRECT); buf = iov->iov_base; do { @@ -1866,14 +2453,6 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, if (bytes > count) bytes = count; - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. - */ - fault_in_pages_readable(buf, bytes); - page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); if (!page) { status = -ENOMEM; @@ -1893,12 +2472,10 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, vmtruncate(inode, isize); break; } - if (likely(nr_segs == 1)) - copied = filemap_copy_from_user(page, offset, - buf, bytes); - else - copied = filemap_copy_from_user_iovec(page, offset, - cur_iov, iov_base, bytes); + + BUG_ON(nr_segs != 1); + copied = filemap_copy_from_kernel(page, offset, buf, bytes); + flush_dcache_page(page); status = a_ops->commit_write(file, page, offset, offset+bytes); if (likely(copied > 0)) { @@ -1911,7 +2488,7 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, pos += status; buf += status; if (unlikely(nr_segs > 1)) - filemap_set_next_iovec(&cur_iov, + filemap_set_next_kvec(&cur_iov, &iov_base, status); } } @@ -1939,16 +2516,7 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, status = generic_osync_inode(inode, mapping, OSYNC_METADATA|OSYNC_DATA); } - - /* - * If we get here for O_DIRECT writes then we must have fallen through - * to buffered writes (block instantiation inside i_size). So we sync - * the file data here, to try to honour O_DIRECT expectations. - */ - if (unlikely(file->f_flags & O_DIRECT) && written) - status = filemap_write_and_wait(mapping); -out_status: err = written ? written : status; out: pagevec_lru_add(&lru_pvec); @@ -1956,108 +2524,292 @@ out: return err; } -EXPORT_SYMBOL(generic_file_aio_write_nolock); - -ssize_t -generic_file_write_nolock(struct file *file, const struct iovec *iov, +static ssize_t +__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t *ppos) +{ + struct file *file = iocb->ki_filp; + struct address_space * mapping = file->f_mapping; + size_t ocount; /* original count */ + size_t count; /* after file limit checks */ + struct inode *inode = mapping->host; + unsigned long seg; + loff_t pos; + ssize_t written; + ssize_t err; + + ocount = 0; + for (seg = 0; seg < nr_segs; seg++) { + const struct iovec *iv = &iov[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. + */ + ocount += iv->iov_len; + if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) + return -EINVAL; + if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) + return -EFAULT; + nr_segs = seg; + ocount -= iv->iov_len; /* This segment is no good */ + break; + } + + count = ocount; + pos = *ppos; + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + written = 0; + + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto out; + + if (count == 0) + goto out; + + err = remove_suid(file->f_path.dentry); + if (err) + goto out; + + file_update_time(file); + + /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ + if (unlikely(file->f_flags & O_DIRECT)) { + loff_t endbyte; + ssize_t written_buffered; + + written = generic_file_direct_write(iocb, iov, &nr_segs, pos, + ppos, count, ocount); + if (written < 0 || written == count) + goto out; + /* + * direct-io write to a hole: fall through to buffered I/O + * for completing the rest of the request. + */ + pos += written; + count -= written; + written_buffered = generic_file_buffered_write(iocb, iov, + nr_segs, pos, ppos, count, + written); + /* + * If generic_file_buffered_write() retuned a synchronous error + * then we want to return the number of bytes which were + * direct-written, or the error code if that was zero. Note + * that this differs from normal direct-io semantics, which + * will return -EFOO even if some bytes were written. + */ + if (written_buffered < 0) { + err = written_buffered; + goto out; + } + + /* + * We need to ensure that the page cache pages are written to + * disk and invalidated to preserve the expected O_DIRECT + * semantics. + */ + endbyte = pos + written_buffered - written - 1; + err = do_sync_file_range(file, pos, endbyte, + SYNC_FILE_RANGE_WAIT_BEFORE| + SYNC_FILE_RANGE_WRITE| + SYNC_FILE_RANGE_WAIT_AFTER); + if (err == 0) { + written = written_buffered; + invalidate_mapping_pages(mapping, + pos >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); + } else { + /* + * We don't know how much we wrote, so just return + * the number of bytes which were direct-written + */ + } + } else { + written = generic_file_buffered_write(iocb, iov, nr_segs, + pos, ppos, count, written); + } +out: + current->backing_dev_info = NULL; + return written ? written : err; +} + +static ssize_t +generic_kernel_file_write_nolock(struct file *file, const struct kvec *iov, + unsigned long nr_segs, loff_t *ppos) { struct kiocb kiocb; ssize_t ret; init_sync_kiocb(&kiocb, file); - ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos); - if (-EIOCBQUEUED == ret) + ret = generic_kernel_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos); + if (ret == -EIOCBQUEUED) ret = wait_on_sync_kiocb(&kiocb); return ret; } -EXPORT_SYMBOL(generic_file_write_nolock); - -ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf, - size_t count, loff_t pos) +static ssize_t generic_kernel_file_write(struct file *file, const char *buf, + size_t count, loff_t *ppos) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - ssize_t err; - struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count }; - - BUG_ON(iocb->ki_pos != pos); + struct inode *inode = file->f_mapping->host; + ssize_t err; + struct kvec local_iov = { .iov_base = (char *) buf, + .iov_len = count }; - down(&inode->i_sem); - err = generic_file_aio_write_nolock(iocb, &local_iov, 1, - &iocb->ki_pos); - up(&inode->i_sem); + mutex_lock(&inode->i_mutex); + err = generic_kernel_file_write_nolock(file, &local_iov, 1, ppos); + mutex_unlock(&inode->i_mutex); return err; } -EXPORT_SYMBOL(generic_file_aio_write); -ssize_t generic_file_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, + const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - struct inode *inode = file->f_mapping->host; - ssize_t err; - struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count }; + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; - down(&inode->i_sem); - err = generic_file_write_nolock(file, &local_iov, 1, ppos); - up(&inode->i_sem); + BUG_ON(iocb->ki_pos != pos); - return err; -} + ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, + &iocb->ki_pos); -EXPORT_SYMBOL(generic_file_write); + if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + ssize_t err; -ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) -{ - struct kiocb kiocb; - ssize_t ret; - - init_sync_kiocb(&kiocb, filp); - ret = __generic_file_aio_read(&kiocb, iov, nr_segs, ppos); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); + err = sync_page_range_nolock(inode, mapping, pos, ret); + if (err < 0) + ret = err; + } return ret; } +EXPORT_SYMBOL(generic_file_aio_write_nolock); -EXPORT_SYMBOL(generic_file_readv); - -ssize_t generic_file_writev(struct file *file, const struct iovec *iov, - unsigned long nr_segs, loff_t * ppos) +ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { - struct inode *inode = file->f_mapping->host; + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; ssize_t ret; - down(&inode->i_sem); - ret = generic_file_write_nolock(file, iov, nr_segs, ppos); - up(&inode->i_sem); + BUG_ON(iocb->ki_pos != pos); + + mutex_lock(&inode->i_mutex); + ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, + &iocb->ki_pos); + mutex_unlock(&inode->i_mutex); + + if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + ssize_t err; + + err = sync_page_range(inode, mapping, pos, ret); + if (err < 0) + ret = err; + } return ret; } - -EXPORT_SYMBOL(generic_file_writev); +EXPORT_SYMBOL(generic_file_aio_write); /* - * Called under i_sem for writes to S_ISREG files + * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something + * went wrong during pagecache shootdown. */ -ssize_t +static ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; ssize_t retval; + size_t write_len; + pgoff_t end = 0; /* silence gcc */ + + /* + * If it's a write, unmap all mmappings of the file up-front. This + * will cause any pte dirty bits to be propagated into the pageframes + * for the subsequent filemap_write_and_wait(). + */ + if (rw == WRITE) { + write_len = iov_length(iov, nr_segs); + end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT; + if (mapping_mapped(mapping)) + unmap_mapping_range(mapping, offset, write_len, 0); + } retval = filemap_write_and_wait(mapping); - if (retval == 0) { - retval = mapping->a_ops->direct_IO(rw, iocb, iov, - offset, nr_segs); - if (rw == WRITE && mapping->nrpages) - invalidate_inode_pages2(mapping); + if (retval) + goto out; + + /* + * After a write we want buffered reads to be sure to go to disk to get + * the new data. We invalidate clean cached page from the region we're + * about to write. We do this *before* the write so that we can return + * -EIO without clobbering -EIOCBQUEUED from ->direct_IO(). + */ + if (rw == WRITE && mapping->nrpages) { + retval = invalidate_inode_pages2_range(mapping, + offset >> PAGE_CACHE_SHIFT, end); + if (retval) + goto out; } + + retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs); + if (retval) + goto out; + + /* + * Finally, try again to invalidate clean pages which might have been + * faulted in by get_user_pages() if the source of the write was an + * mmap()ed region of the file we're writing. That's a pretty crazy + * thing to do, so we don't support it 100%. If this invalidation + * fails and we have -EIOCBQUEUED we ignore the failure. + */ + if (rw == WRITE && mapping->nrpages) { + int err = invalidate_inode_pages2_range(mapping, + offset >> PAGE_CACHE_SHIFT, end); + if (err && retval >= 0) + retval = err; + } +out: return retval; } -EXPORT_SYMBOL_GPL(generic_file_direct_IO); +/** + * try_to_release_page() - release old fs-specific metadata on a page + * + * @page: the page which the kernel is trying to free + * @gfp_mask: memory allocation flags (and I/O mode) + * + * The address_space is to try to release any data against the page + * (presumably at page->private). If the release was successful, return `1'. + * Otherwise return zero. + * + * The @gfp_mask argument specifies whether I/O may be performed to release + * this page (__GFP_IO), and whether the call may block (__GFP_WAIT). + * + * NOTE: @gfp_mask may go away, and this function may become non-blocking. + */ +int try_to_release_page(struct page *page, gfp_t gfp_mask) +{ + struct address_space * const mapping = page->mapping; + + BUG_ON(!PageLocked(page)); + if (PageWriteback(page)) + return 0; + + if (mapping && mapping->a_ops->releasepage) + return mapping->a_ops->releasepage(page, gfp_mask); + return try_to_free_buffers(page); +} + +EXPORT_SYMBOL(try_to_release_page);