X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=mm%2Ffilemap.c;h=4f2fb2c40f7881697fbd64b9cdfb22f6463657c7;hb=6a77f38946aaee1cd85eeec6cf4229b204c15071;hp=272c3e0a6fed2b8cf934aad6a2dabd7c5ce71daa;hpb=87fc8d1bb10cd459024a742c6a10961fefcef18f;p=linux-2.6.git

diff --git a/mm/filemap.c b/mm/filemap.c
index 272c3e0a6..4f2fb2c40 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -27,6 +27,7 @@
 #include <linux/pagevec.h>
 #include <linux/blkdev.h>
 #include <linux/security.h>
+#include <linux/syscalls.h>
 /*
  * This is needed for the following functions:
  *  - try_to_release_page
@@ -130,9 +131,12 @@ void remove_from_page_cache(struct page *page)
 	spin_unlock_irq(&mapping->tree_lock);
 }
 
-static inline int sync_page(struct page *page)
+static int sync_page(void *word)
 {
 	struct address_space *mapping;
+	struct page *page;
+
+	page = container_of((page_flags_t *)word, struct page, flags);
 
 	/*
 	 * FIXME, fercrissake.  What is this barrier here for?
@@ -140,7 +144,8 @@ static inline int sync_page(struct page *page)
 	smp_mb();
 	mapping = page_mapping(page);
 	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
-		return mapping->a_ops->sync_page(page);
+		mapping->a_ops->sync_page(page);
+	io_schedule();
 	return 0;
 }
 
@@ -186,12 +191,11 @@ int filemap_fdatawrite(struct address_space *mapping)
 }
 EXPORT_SYMBOL(filemap_fdatawrite);
 
-int filemap_fdatawrite_range(struct address_space *mapping,
+static int filemap_fdatawrite_range(struct address_space *mapping,
 	loff_t start, loff_t end)
 {
 	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
 }
-EXPORT_SYMBOL(filemap_fdatawrite_range);
 
 /*
  * This is a mostly non-blocking flush.  Not suitable for data-integrity
@@ -279,6 +283,29 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
 }
 EXPORT_SYMBOL(sync_page_range);
 
+/*
+ * Note: Holding i_sem across sync_page_range_nolock is not a good idea
+ * as it forces O_SYNC writers to different parts of the same file
+ * to be serialised right until io completion.
+ */
+int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
+			   loff_t pos, size_t count)
+{
+	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
+	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
+	int ret;
+
+	if (mapping->backing_dev_info->memory_backed || !count)
+		return 0;
+	ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
+	if (ret == 0)
+		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
+	if (ret == 0)
+		ret = wait_on_page_writeback_range(mapping, start, end);
+	return ret;
+}
+EXPORT_SYMBOL(sync_page_range_nolock);
+
 /**
  * filemap_fdatawait - walk the list of under-writeback pages of the given
  *	address space and wait for all of them.
@@ -359,40 +386,6 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
  * at a cost of "thundering herd" phenomena during rare hash
  * collisions.
  */
-struct page_wait_queue {
-	struct page *page;
-	int bit;
-	wait_queue_t wait;
-};
-
-static int page_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
-{
-	struct page *page = key;
-	struct page_wait_queue *wq;
-
-	wq = container_of(wait, struct page_wait_queue, wait);
-	if (wq->page != page || test_bit(wq->bit, &page->flags))
-		return 0;
-	else
-		return autoremove_wake_function(wait, mode, sync, NULL);
-}
-
-#define __DEFINE_PAGE_WAIT(name, p, b, f)				\
-	struct page_wait_queue name = {					\
-		.page	= p,						\
-		.bit	= b,						\
-		.wait	= {						\
-			.task	= current,				\
-			.func	= page_wake_function,			\
-			.flags	= f,					\
-			.task_list = LIST_HEAD_INIT(name.wait.task_list),\
-		},							\
-	}
-
-#define DEFINE_PAGE_WAIT(name, p, b) __DEFINE_PAGE_WAIT(name, p, b, 0)
-#define DEFINE_PAGE_WAIT_EXCLUSIVE(name, p, b)				\
-		__DEFINE_PAGE_WAIT(name, p, b, WQ_FLAG_EXCLUSIVE)
-
 static wait_queue_head_t *page_waitqueue(struct page *page)
 {
 	const struct zone *zone = page_zone(page);
@@ -400,30 +393,19 @@ static wait_queue_head_t *page_waitqueue(struct page *page)
 	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
 }
 
-static void wake_up_page(struct page *page)
+static inline void wake_up_page(struct page *page, int bit)
 {
-	const unsigned int mode = TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE;
-	wait_queue_head_t *waitqueue = page_waitqueue(page);
-
-	if (waitqueue_active(waitqueue))
-		__wake_up(waitqueue, mode, 1, page);
+	__wake_up_bit(page_waitqueue(page), &page->flags, bit);
 }
 
 void fastcall wait_on_page_bit(struct page *page, int bit_nr)
 {
-	wait_queue_head_t *waitqueue = page_waitqueue(page);
-	DEFINE_PAGE_WAIT(wait, page, bit_nr);
+	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
 
-	do {
-		prepare_to_wait(waitqueue, &wait.wait, TASK_UNINTERRUPTIBLE);
-		if (test_bit(bit_nr, &page->flags)) {
-			sync_page(page);
-			io_schedule();
-		}
-	} while (test_bit(bit_nr, &page->flags));
-	finish_wait(waitqueue, &wait.wait);
+	if (test_bit(bit_nr, &page->flags))
+		__wait_on_bit(page_waitqueue(page), &wait, sync_page,
+							TASK_UNINTERRUPTIBLE);
 }
-
 EXPORT_SYMBOL(wait_on_page_bit);
 
 /**
@@ -447,11 +429,9 @@ void fastcall unlock_page(struct page *page)
 	if (!TestClearPageLocked(page))
 		BUG();
 	smp_mb__after_clear_bit();
-	wake_up_page(page);
+	wake_up_page(page, PG_locked);
 }
-
 EXPORT_SYMBOL(unlock_page);
-EXPORT_SYMBOL(lock_page);
 
 /*
  * End writeback against a page.
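
The hunks above replace filemap.c's private page waitqueue machinery (struct page_wait_queue, page_wake_function and the DEFINE_PAGE_WAIT macros) with the generic bit-waitqueue API: DEFINE_WAIT_BIT keys each waiter on &page->flags plus a bit number, wake_up_page() becomes a thin wrapper around __wake_up_bit(), and sync_page() is now the "action" callback that __wait_on_bit() invokes while the bit is still set, recovering its page from the flags word via container_of(). A minimal sketch of how callers consume the resulting primitive; it mirrors the era's wait_on_page_writeback() helper in include/linux/pagemap.h, and the _example name is ours:

/*
 * Illustration only: wait for PG_writeback to clear using the
 * bit-waitqueue primitive converted above.  Equivalent to the
 * wait_on_page_writeback() helper in include/linux/pagemap.h.
 */
static inline void wait_on_page_writeback_example(struct page *page)
{
	if (PageWriteback(page))
		wait_on_page_bit(page, PG_writeback);
}
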
@@ -461,11 +441,10 @@ void end_page_writeback(struct page *page)
 	if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
 		if (!test_clear_page_writeback(page))
 			BUG();
-		smp_mb__after_clear_bit();
 	}
-	wake_up_page(page);
+	smp_mb__after_clear_bit();
+	wake_up_page(page, PG_writeback);
 }
-
 EXPORT_SYMBOL(end_page_writeback);
 
 /*
@@ -478,19 +457,11 @@ EXPORT_SYMBOL(end_page_writeback);
  */
 void fastcall __lock_page(struct page *page)
 {
-	wait_queue_head_t *wqh = page_waitqueue(page);
-	DEFINE_PAGE_WAIT_EXCLUSIVE(wait, page, PG_locked);
+	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
-	while (TestSetPageLocked(page)) {
-		prepare_to_wait_exclusive(wqh, &wait.wait, TASK_UNINTERRUPTIBLE);
-		if (PageLocked(page)) {
-			sync_page(page);
-			io_schedule();
-		}
-	}
-	finish_wait(wqh, &wait.wait);
+	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
+							TASK_UNINTERRUPTIBLE);
 }
-
 EXPORT_SYMBOL(__lock_page);
 
 /*
@@ -717,7 +688,12 @@ void do_generic_mapping_read(struct address_space *mapping,
			     read_actor_t actor)
 {
 	struct inode *inode = mapping->host;
-	unsigned long index, end_index, offset;
+	unsigned long index;
+	unsigned long end_index;
+	unsigned long offset;
+	unsigned long req_size;
+	unsigned long next_index;
+	unsigned long prev_index;
 	loff_t isize;
 	struct page *cached_page;
 	int error;
@@ -725,6 +701,9 @@ void do_generic_mapping_read(struct address_space *mapping,
 
 	cached_page = NULL;
 	index = *ppos >> PAGE_CACHE_SHIFT;
+	next_index = index;
+	prev_index = ra.prev_page;
+	req_size = (desc->count + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	offset = *ppos & ~PAGE_CACHE_MASK;
 
 	isize = i_size_read(inode);
@@ -734,7 +713,7 @@ void do_generic_mapping_read(struct address_space *mapping,
 	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
 	for (;;) {
 		struct page *page;
-		unsigned long nr, ret;
+		unsigned long ret_size, nr, ret;
 
 		/* nr is the maximum number of bytes to copy from this page */
 		nr = PAGE_CACHE_SIZE;
@@ -749,7 +728,12 @@ void do_generic_mapping_read(struct address_space *mapping,
 		nr = nr - offset;
 
 		cond_resched();
-		page_cache_readahead(mapping, &ra, filp, index);
+		if (index == next_index && req_size) {
+			ret_size = page_cache_readahead(mapping, &ra,
+					filp, index, req_size);
+			next_index += ret_size;
+			req_size -= ret_size;
+		}
 
 find_page:
 		page = find_get_page(mapping, index);
@@ -769,10 +753,12 @@ page_ok:
 		flush_dcache_page(page);
 
 		/*
-		 * Mark the page accessed if we read the beginning.
+		 * When (part of) the same page is read multiple times
+		 * in succession, only mark it as accessed the first time.
 		 */
-		if (!offset)
+		if (prev_index != index)
 			mark_page_accessed(page);
+		prev_index = index;
 
 		/*
 		 * Ok, we have the page, and it's up-to-date, so
@@ -819,11 +805,21 @@ readpage:
 			goto readpage_error;
 
 		if (!PageUptodate(page)) {
-			wait_on_page_locked(page);
+			lock_page(page);
 			if (!PageUptodate(page)) {
+				if (page->mapping == NULL) {
+					/*
+					 * invalidate_inode_pages got it
+					 */
+					unlock_page(page);
+					page_cache_release(page);
+					goto find_page;
+				}
+				unlock_page(page);
 				error = -EIO;
 				goto readpage_error;
 			}
+			unlock_page(page);
 		}
 
 		/*
@@ -1195,7 +1191,7 @@ retry_all:
 	 * For sequential accesses, we use the generic readahead logic.
 	 */
 	if (VM_SequentialReadHint(area))
-		page_cache_readahead(mapping, ra, file, pgoff);
+		page_cache_readahead(mapping, ra, file, pgoff, 1);
 
 	/*
 	 * Do we have something in the page cache already?
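
In do_generic_mapping_read() above, page_cache_readahead() changes from a per-page call to a per-window call: the loop passes the whole remaining request (req_size, the read size rounded up to pages) down once, and the return value says how many pages the readahead window now covers, so next_index marks where the next submission is due. The prev_index change is independent: a page is marked accessed only on the first of several successive reads into it, rather than only when a read starts at offset 0. A toy model of the readahead bookkeeping, with hypothetical names (toy_readahead stands in for page_cache_readahead; the loop body is elided):

/*
 * Toy model (hypothetical, illustration only) of the req_size/next_index
 * bookkeeping added above.  The readahead callback returns how many
 * pages of the request it took responsibility for.
 */
static void toy_read_loop(unsigned long index, unsigned long nr_pages,
		unsigned long (*toy_readahead)(unsigned long, unsigned long))
{
	unsigned long next_index = index;	/* first page not yet covered */
	unsigned long req_size = nr_pages;	/* pages still wanted */

	for (; nr_pages; nr_pages--, index++) {
		if (index == next_index && req_size) {
			unsigned long ret_size = toy_readahead(index, req_size);

			next_index += ret_size;	/* window now ends here */
			req_size -= ret_size;
		}
		/* ... look up, possibly read, and copy the page at index ... */
	}
}
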
@@ -1462,12 +1458,9 @@ err:
 	return NULL;
 }
 
-static int filemap_populate(struct vm_area_struct *vma,
-			unsigned long addr,
-			unsigned long len,
-			pgprot_t prot,
-			unsigned long pgoff,
-			int nonblock)
+int filemap_populate(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long len, pgprot_t prot, unsigned long pgoff,
+		int nonblock)
 {
 	struct file *file = vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
@@ -1527,6 +1520,7 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
 	vma->vm_ops = &generic_file_vm_ops;
 	return 0;
 }
+EXPORT_SYMBOL(filemap_populate);
 
 /*
  * This is for filesystems which do not implement ->writepage.
@@ -1804,7 +1798,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
 inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
 {
 	struct inode *inode = file->f_mapping->host;
-	unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
+	unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
 
 	if (unlikely(*pos < 0))
 		return -EINVAL;
@@ -1878,7 +1872,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
 	}
 	return 0;
 }
-
 EXPORT_SYMBOL(generic_write_checks);
 
 ssize_t
@@ -1916,7 +1909,6 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 		written = -EIOCBQUEUED;
 	return written;
 }
-
 EXPORT_SYMBOL(generic_file_direct_write);
 
 ssize_t
@@ -1939,7 +1931,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 
 	pagevec_init(&lru_pvec, 0);
 
-	buf = iov->iov_base + written;	/* handle partial DIO write */
+	/*
+	 * handle partial DIO write.  Adjust cur_iov if needed.
+	 */
+	if (likely(nr_segs == 1))
+		buf = iov->iov_base + written;
+	else {
+		filemap_set_next_iovec(&cur_iov, &iov_base, written);
+		buf = iov->iov_base + iov_base;
+	}
+
 	do {
 		unsigned long index;
 		unsigned long offset;
@@ -2038,11 +2039,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 	pagevec_lru_add(&lru_pvec);
 	return written ? written : status;
 }
-
 EXPORT_SYMBOL(generic_file_buffered_write);
 
 ssize_t
-generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
+__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t *ppos)
 {
 	struct file *file = iocb->ki_filp;
@@ -2078,6 +2078,8 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 	count = ocount;
 	pos = *ppos;
 
+	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = mapping->backing_dev_info;
 	written = 0;
@@ -2115,9 +2117,44 @@ out:
 	current->backing_dev_info = NULL;
 	return written ? written : err;
 }
-
 EXPORT_SYMBOL(generic_file_aio_write_nolock);
 
+ssize_t
+generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t *ppos)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	ssize_t ret;
+	loff_t pos = *ppos;
+
+	ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos);
+
+	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+		int err;
+
+		err = sync_page_range_nolock(inode, mapping, pos, ret);
+		if (err < 0)
+			ret = err;
+	}
+	return ret;
+}
+
+ssize_t
+__generic_file_write_nolock(struct file *file, const struct iovec *iov,
+				unsigned long nr_segs, loff_t *ppos)
+{
+	struct kiocb kiocb;
+	ssize_t ret;
+
+	init_sync_kiocb(&kiocb, file);
+	ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
+	if (ret == -EIOCBQUEUED)
+		ret = wait_on_sync_kiocb(&kiocb);
+	return ret;
+}
+
 ssize_t
 generic_file_write_nolock(struct file *file, const struct iovec *iov,
			  unsigned long nr_segs, loff_t *ppos)
@@ -2131,7 +2168,6 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
 	ret = wait_on_sync_kiocb(&kiocb);
 	return ret;
 }
-
 EXPORT_SYMBOL(generic_file_write_nolock);
 
 ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
@@ -2147,7 +2183,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
 	BUG_ON(iocb->ki_pos != pos);
 
 	down(&inode->i_sem);
-	ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
+	ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
						&iocb->ki_pos);
 	up(&inode->i_sem);
 
@@ -2172,7 +2208,7 @@ ssize_t generic_file_write(struct file *file, const char __user *buf,
					.iov_len = count };
 
 	down(&inode->i_sem);
-	ret = generic_file_write_nolock(file, &local_iov, 1, ppos);
+	ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
 	up(&inode->i_sem);
 
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
@@ -2198,7 +2234,6 @@ ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
 	ret = wait_on_sync_kiocb(&kiocb);
 	return ret;
 }
-
 EXPORT_SYMBOL(generic_file_readv);
 
 ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
@@ -2209,7 +2244,7 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
 	ssize_t ret;
 
 	down(&inode->i_sem);
-	ret = generic_file_write_nolock(file, iov, nr_segs, ppos);
+	ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
 	up(&inode->i_sem);
 
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
@@ -2221,11 +2256,11 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
 	}
 	return ret;
 }
-
 EXPORT_SYMBOL(generic_file_writev);
 
 /*
- * Called under i_sem for writes to S_ISREG files
+ * Called under i_sem for writes to S_ISREG files.  Returns -EIO if something
+ * went wrong during pagecache shootdown.
  */
 ssize_t
 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
@@ -2235,14 +2270,24 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	struct address_space *mapping = file->f_mapping;
 	ssize_t retval;
 
+	/*
+	 * If it's a write, unmap all mmappings of the file up-front.  This
+	 * will cause any pte dirty bits to be propagated into the pageframes
+	 * for the subsequent filemap_write_and_wait().
+	 */
+	if (rw == WRITE && mapping_mapped(mapping))
+		unmap_mapping_range(mapping, 0, -1, 0);
+
 	retval = filemap_write_and_wait(mapping);
 	if (retval == 0) {
 		retval = mapping->a_ops->direct_IO(rw, iocb, iov,
						offset, nr_segs);
-		if (rw == WRITE && mapping->nrpages)
-			invalidate_inode_pages2(mapping);
+		if (rw == WRITE && mapping->nrpages) {
+			int err = invalidate_inode_pages2(mapping);
+			if (err)
+				retval = err;
+		}
 	}
 	return retval;
 }
-
 EXPORT_SYMBOL_GPL(generic_file_direct_IO);
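
The write-path hunks above split each entry point into a __-prefixed worker plus a thin wrapper so that O_SYNC handling can run after i_sem is dropped: generic_file_write(), generic_file_writev() and generic_file_aio_write() hold i_sem around the __ worker only and then call sync_page_range(), while lockless callers use generic_file_aio_write_nolock(), which syncs via sync_page_range_nolock() so concurrent O_SYNC writers to different parts of the same file are not serialised until IO completion. A sketch of the resulting caller pattern, assembled from the hunks above (the function name is ours; the body mirrors generic_file_write() in this patch):

/*
 * Illustration only: the post-patch O_SYNC write pattern.  i_sem covers
 * the copy into pagecache; the sync of the dirtied range happens after
 * the lock is dropped.
 */
ssize_t osync_write_example(struct file *file, const char __user *buf,
			    size_t count, loff_t *ppos)
{
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct iovec local_iov = { .iov_base = (void __user *)buf,
				   .iov_len = count };
	ssize_t ret;

	down(&inode->i_sem);
	ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
	up(&inode->i_sem);

	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
		ssize_t err;

		err = sync_page_range(inode, mapping, *ppos - ret, ret);
		if (err < 0)
			ret = err;
	}
	return ret;
}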