#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/security.h>
+#include <linux/syscalls.h>
/*
* This is needed for the following functions:
* - try_to_release_page
spin_unlock_irq(&mapping->tree_lock);
}
-static inline int sync_page(struct page *page)
+static int sync_page(void *word)
{
struct address_space *mapping;
+ struct page *page;
+
+ page = container_of((page_flags_t *)word, struct page, flags);
/*
* FIXME, fercrissake. What is this barrier here for?
smp_mb();
mapping = page_mapping(page);
if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
- return mapping->a_ops->sync_page(page);
+ mapping->a_ops->sync_page(page);
+ io_schedule();
return 0;
}
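/*
 * Illustrative sketch, not part of this patch: the contract sync_page()
 * now implements.  __wait_on_bit() hands the action the address of the
 * word containing the bit (here &page->flags); returning 0 means "re-check
 * the bit and keep waiting", while a nonzero return aborts the wait and is
 * propagated to the caller.  The helper below is hypothetical and would
 * suit an interruptible wait.
 */
static int example_interruptible_action(void *word)
{
	if (signal_pending(current))
		return -ERESTARTSYS;	/* abort the wait */
	schedule();			/* block until the waker runs */
	return 0;			/* loop and re-test the bit */
}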
}
EXPORT_SYMBOL(filemap_fdatawrite);
-int filemap_fdatawrite_range(struct address_space *mapping,
+static int filemap_fdatawrite_range(struct address_space *mapping,
loff_t start, loff_t end)
{
return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
-EXPORT_SYMBOL(filemap_fdatawrite_range);
/*
* This is a mostly non-blocking flush. Not suitable for data-integrity
}
EXPORT_SYMBOL(sync_page_range);
+/*
+ * Note: Holding i_sem across sync_page_range_nolock is not a good idea
+ * as it forces O_SYNC writers to different parts of the same file
+ * to be serialised until I/O completion.
+ */
+int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
+ loff_t pos, size_t count)
+{
+ pgoff_t start = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
+ int ret;
+
+ if (mapping->backing_dev_info->memory_backed || !count)
+ return 0;
+ ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
+ if (ret == 0)
+ ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
+ if (ret == 0)
+ ret = wait_on_page_writeback_range(mapping, start, end);
+ return ret;
+}
+EXPORT_SYMBOL(sync_page_range_nolock);
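/*
 * Illustrative note, not part of this patch: filemap_fdatawrite_range()
 * takes byte offsets while wait_on_page_writeback_range() takes page
 * indices, hence the two forms of the same range above.  For example,
 * with 4096-byte pages, pos = 5000 and count = 10000 cover bytes
 * 5000..14999, i.e. start = 1 and end = 3.
 */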
+
/**
* filemap_fdatawait - walk the list of under-writeback pages of the given
* address space and wait for all of them.
* at a cost of "thundering herd" phenomena during rare hash
* collisions.
*/
-struct page_wait_queue {
- struct page *page;
- int bit;
- wait_queue_t wait;
-};
-
-static int page_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
-{
- struct page *page = key;
- struct page_wait_queue *wq;
-
- wq = container_of(wait, struct page_wait_queue, wait);
- if (wq->page != page || test_bit(wq->bit, &page->flags))
- return 0;
- else
- return autoremove_wake_function(wait, mode, sync, NULL);
-}
-
-#define __DEFINE_PAGE_WAIT(name, p, b, f) \
- struct page_wait_queue name = { \
- .page = p, \
- .bit = b, \
- .wait = { \
- .task = current, \
- .func = page_wake_function, \
- .flags = f, \
- .task_list = LIST_HEAD_INIT(name.wait.task_list),\
- }, \
- }
-
-#define DEFINE_PAGE_WAIT(name, p, b) __DEFINE_PAGE_WAIT(name, p, b, 0)
-#define DEFINE_PAGE_WAIT_EXCLUSIVE(name, p, b) \
- __DEFINE_PAGE_WAIT(name, p, b, WQ_FLAG_EXCLUSIVE)
-
static wait_queue_head_t *page_waitqueue(struct page *page)
{
const struct zone *zone = page_zone(page);
return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
}
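/*
 * Illustrative note, not part of this patch: page_waitqueue() hashes the
 * page pointer into a small per-zone wait table, so unrelated pages (and
 * different bits of the same page) can share one wait queue head.  The
 * wait-bit entries used below carry a (&page->flags, bit_nr) key, so a
 * wakeup only disturbs waiters whose key matches; a hash collision costs
 * no more than scanning a few extra queue entries.
 */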
-static void wake_up_page(struct page *page)
+static inline void wake_up_page(struct page *page, int bit)
{
- const unsigned int mode = TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE;
- wait_queue_head_t *waitqueue = page_waitqueue(page);
-
- if (waitqueue_active(waitqueue))
- __wake_up(waitqueue, mode, 1, page);
+ __wake_up_bit(page_waitqueue(page), &page->flags, bit);
}
void fastcall wait_on_page_bit(struct page *page, int bit_nr)
{
- wait_queue_head_t *waitqueue = page_waitqueue(page);
- DEFINE_PAGE_WAIT(wait, page, bit_nr);
+ DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
- do {
- prepare_to_wait(waitqueue, &wait.wait, TASK_UNINTERRUPTIBLE);
- if (test_bit(bit_nr, &page->flags)) {
- sync_page(page);
- io_schedule();
- }
- } while (test_bit(bit_nr, &page->flags));
- finish_wait(waitqueue, &wait.wait);
+ if (test_bit(bit_nr, &page->flags))
+ __wait_on_bit(page_waitqueue(page), &wait, sync_page,
+ TASK_UNINTERRUPTIBLE);
}
-
EXPORT_SYMBOL(wait_on_page_bit);
/**
if (!TestClearPageLocked(page))
BUG();
smp_mb__after_clear_bit();
- wake_up_page(page);
+ wake_up_page(page, PG_locked);
}
-
EXPORT_SYMBOL(unlock_page);
-EXPORT_SYMBOL(lock_page);
/*
* End writeback against a page.
if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
if (!test_clear_page_writeback(page))
BUG();
- smp_mb__after_clear_bit();
}
- wake_up_page(page);
+ smp_mb__after_clear_bit();
+ wake_up_page(page, PG_writeback);
}
-
EXPORT_SYMBOL(end_page_writeback);
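/*
 * Illustrative sketch, not part of this patch: the waiter/waker pairing
 * that wait_on_page_bit(), unlock_page() and end_page_writeback() now
 * follow, shown on a hypothetical flags word ('my_flags', 'my_wqh' and
 * MY_BIT are not real kernel symbols).  The waker clears the bit, orders
 * the clear with smp_mb__after_clear_bit(), and only then issues the
 * keyed wakeup, mirroring the page paths above.
 */
#define MY_BIT	0

static unsigned long my_flags;
static DECLARE_WAIT_QUEUE_HEAD(my_wqh);

static int my_action(void *word)
{
	schedule();		/* TASK_UNINTERRUPTIBLE: just block */
	return 0;
}

static void example_waiter(void)
{
	DEFINE_WAIT_BIT(wait, &my_flags, MY_BIT);

	if (test_bit(MY_BIT, &my_flags))
		__wait_on_bit(&my_wqh, &wait, my_action,
				TASK_UNINTERRUPTIBLE);
}

static void example_waker(void)
{
	clear_bit(MY_BIT, &my_flags);
	smp_mb__after_clear_bit();	/* clear visible before waitqueue scan */
	__wake_up_bit(&my_wqh, &my_flags, MY_BIT);
}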
/*
*/
void fastcall __lock_page(struct page *page)
{
- wait_queue_head_t *wqh = page_waitqueue(page);
- DEFINE_PAGE_WAIT_EXCLUSIVE(wait, page, PG_locked);
+ DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
- while (TestSetPageLocked(page)) {
- prepare_to_wait_exclusive(wqh, &wait.wait, TASK_UNINTERRUPTIBLE);
- if (PageLocked(page)) {
- sync_page(page);
- io_schedule();
- }
- }
- finish_wait(wqh, &wait.wait);
+ __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
+ TASK_UNINTERRUPTIBLE);
}
-
EXPORT_SYMBOL(__lock_page);
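/*
 * Illustrative note, not part of this patch: __lock_page() is only the
 * contended slow path.  The inline lock_page() wrapper (in
 * linux/pagemap.h) first tries an atomic test-and-set and falls back to
 * the exclusive wait-bit path only when the lock is already held,
 * roughly:
 */
static inline void example_lock_page(struct page *page)
{
	might_sleep();
	if (TestSetPageLocked(page))
		__lock_page(page);
}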
/*
read_actor_t actor)
{
struct inode *inode = mapping->host;
- unsigned long index, end_index, offset;
+ unsigned long index;
+ unsigned long end_index;
+ unsigned long offset;
+ unsigned long req_size;
+ unsigned long next_index;
+ unsigned long prev_index;
loff_t isize;
struct page *cached_page;
int error;
cached_page = NULL;
index = *ppos >> PAGE_CACHE_SHIFT;
+ next_index = index;
+ prev_index = ra.prev_page;
+ req_size = (desc->count + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;
isize = i_size_read(inode);
end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
for (;;) {
struct page *page;
- unsigned long nr, ret;
+ unsigned long ret_size, nr, ret;
/* nr is the maximum number of bytes to copy from this page */
nr = PAGE_CACHE_SIZE;
nr = nr - offset;
cond_resched();
- page_cache_readahead(mapping, &ra, filp, index);
+ if (index == next_index && req_size) {
+ ret_size = page_cache_readahead(mapping, &ra,
+ filp, index, req_size);
+ next_index += ret_size;
+ req_size -= ret_size;
+ }
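		/*
		 * Worked example (illustrative, not part of this patch):
		 * a 128KiB read with 4KiB pages starts with req_size = 32.
		 * Whatever span page_cache_readahead() reports back both
		 * advances next_index and shrinks req_size, so readahead is
		 * consulted once per submitted span rather than once per
		 * page as before.
		 */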
find_page:
page = find_get_page(mapping, index);
flush_dcache_page(page);
/*
- * Mark the page accessed if we read the beginning.
+ * When (part of) the same page is read multiple times
+ * in succession, only mark it as accessed the first time.
*/
- if (!offset)
+ if (prev_index != index)
mark_page_accessed(page);
+ prev_index = index;
/*
* Ok, we have the page, and it's up-to-date, so
goto readpage_error;
if (!PageUptodate(page)) {
- wait_on_page_locked(page);
+ lock_page(page);
if (!PageUptodate(page)) {
+ if (page->mapping == NULL) {
+ /*
+ * invalidate_inode_pages got it
+ */
+ unlock_page(page);
+ page_cache_release(page);
+ goto find_page;
+ }
+ unlock_page(page);
error = -EIO;
goto readpage_error;
}
+ unlock_page(page);
}
/*
* For sequential accesses, we use the generic readahead logic.
*/
if (VM_SequentialReadHint(area))
- page_cache_readahead(mapping, ra, file, pgoff);
+ page_cache_readahead(mapping, ra, file, pgoff, 1);
/*
* Do we have something in the page cache already?
return NULL;
}
-static int filemap_populate(struct vm_area_struct *vma,
- unsigned long addr,
- unsigned long len,
- pgprot_t prot,
- unsigned long pgoff,
- int nonblock)
+int filemap_populate(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long len, pgprot_t prot, unsigned long pgoff,
+ int nonblock)
{
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
vma->vm_ops = &generic_file_vm_ops;
return 0;
}
+EXPORT_SYMBOL(filemap_populate);
/*
* This is for filesystems which do not implement ->writepage.
inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
{
struct inode *inode = file->f_mapping->host;
- unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
+ unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
if (unlikely(*pos < 0))
return -EINVAL;
}
return 0;
}
-
EXPORT_SYMBOL(generic_write_checks);
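/*
 * Illustrative sketch, not part of this patch: how an RLIMIT_FSIZE-style
 * cap is typically applied to a write.  Note the limit now lives in the
 * shared signal_struct, so every thread of a process sees the same value.
 * This helper is hypothetical and simplified; the real checks in
 * generic_write_checks() also cover block devices, s_maxbytes and the
 * non-LFS offset limit.
 */
static int example_apply_fsize_limit(loff_t pos, size_t *count)
{
	unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;

	if (limit == RLIM_INFINITY)
		return 0;
	if (pos >= limit) {
		send_sig(SIGXFSZ, current, 0);	/* writer is signalled */
		return -EFBIG;
	}
	if (*count > limit - (unsigned long)pos)
		*count = limit - (unsigned long)pos;	/* clamp the write */
	return 0;
}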
ssize_t
written = -EIOCBQUEUED;
return written;
}
-
EXPORT_SYMBOL(generic_file_direct_write);
ssize_t
pagevec_init(&lru_pvec, 0);
- buf = iov->iov_base + written; /* handle partial DIO write */
+ /*
+ * handle partial DIO write. Adjust cur_iov if needed.
+ */
+ if (likely(nr_segs == 1))
+ buf = iov->iov_base + written;
+ else {
+ filemap_set_next_iovec(&cur_iov, &iov_base, written);
+		buf = cur_iov->iov_base + iov_base;	/* offset is within cur_iov */
+ }
+
do {
unsigned long index;
unsigned long offset;
pagevec_lru_add(&lru_pvec);
return written ? written : status;
}
-
EXPORT_SYMBOL(generic_file_buffered_write);
ssize_t
-generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
+__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos)
{
struct file *file = iocb->ki_filp;
count = ocount;
pos = *ppos;
+ vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
written = 0;
current->backing_dev_info = NULL;
return written ? written : err;
}
-
EXPORT_SYMBOL(generic_file_aio_write_nolock);
+ssize_t
+generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t *ppos)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ ssize_t ret;
+ loff_t pos = *ppos;
+
+ ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos);
+
+ if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+ int err;
+
+ err = sync_page_range_nolock(inode, mapping, pos, ret);
+ if (err < 0)
+ ret = err;
+ }
+ return ret;
+}
+
+ssize_t
+__generic_file_write_nolock(struct file *file, const struct iovec *iov,
+ unsigned long nr_segs, loff_t *ppos)
+{
+ struct kiocb kiocb;
+ ssize_t ret;
+
+ init_sync_kiocb(&kiocb, file);
+ ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
+ if (ret == -EIOCBQUEUED)
+ ret = wait_on_sync_kiocb(&kiocb);
+ return ret;
+}
+
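/*
 * Illustrative sketch, not part of this patch: how the split helpers are
 * meant to be combined by a caller such as generic_file_write() (updated
 * further down).  The pagecache copy runs under i_sem, but the O_SYNC
 * wait happens after i_sem is dropped, exactly as the comment above
 * sync_page_range_nolock() recommends.  'example_osync_write' is a
 * hypothetical wrapper.
 */
static ssize_t example_osync_write(struct file *file, const char __user *ubuf,
				   size_t count, loff_t *ppos)
{
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct iovec local_iov = { .iov_base = (void __user *)ubuf,
				   .iov_len = count };
	ssize_t ret;

	down(&inode->i_sem);
	ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
	up(&inode->i_sem);

	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
		ssize_t err;

		/* wait for the just-written range to reach disk */
		err = sync_page_range(inode, mapping, *ppos - ret, ret);
		if (err < 0)
			ret = err;
	}
	return ret;
}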
ssize_t
generic_file_write_nolock(struct file *file, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos)
ret = wait_on_sync_kiocb(&kiocb);
return ret;
}
-
EXPORT_SYMBOL(generic_file_write_nolock);
ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
BUG_ON(iocb->ki_pos != pos);
down(&inode->i_sem);
- ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
+ ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
&iocb->ki_pos);
up(&inode->i_sem);
.iov_len = count };
down(&inode->i_sem);
- ret = generic_file_write_nolock(file, &local_iov, 1, ppos);
+ ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
up(&inode->i_sem);
if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
ret = wait_on_sync_kiocb(&kiocb);
return ret;
}
-
EXPORT_SYMBOL(generic_file_readv);
ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
ssize_t ret;
down(&inode->i_sem);
- ret = generic_file_write_nolock(file, iov, nr_segs, ppos);
+ ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
up(&inode->i_sem);
if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
}
return ret;
}
-
EXPORT_SYMBOL(generic_file_writev);
/*
- * Called under i_sem for writes to S_ISREG files
+ * Called under i_sem for writes to S_ISREG files. Returns -EIO if something
+ * went wrong during pagecache shootdown.
*/
ssize_t
generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
struct address_space *mapping = file->f_mapping;
ssize_t retval;
+ /*
+ * If it's a write, unmap all mmappings of the file up-front. This
+ * will cause any pte dirty bits to be propagated into the pageframes
+ * for the subsequent filemap_write_and_wait().
+ */
+ if (rw == WRITE && mapping_mapped(mapping))
+ unmap_mapping_range(mapping, 0, -1, 0);
+
retval = filemap_write_and_wait(mapping);
if (retval == 0) {
retval = mapping->a_ops->direct_IO(rw, iocb, iov,
offset, nr_segs);
- if (rw == WRITE && mapping->nrpages)
- invalidate_inode_pages2(mapping);
+ if (rw == WRITE && mapping->nrpages) {
+ int err = invalidate_inode_pages2(mapping);
+ if (err)
+ retval = err;
+ }
}
return retval;
}
-
EXPORT_SYMBOL_GPL(generic_file_direct_IO);
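/*
 * Illustrative note, not part of this patch: the ordering above is what
 * keeps direct writes coherent with the pagecache:
 *
 *   unmap_mapping_range()      - pull mapped ptes so dirty bits reach the
 *                                pageframes
 *   filemap_write_and_wait()   - flush that dirty pagecache to disk
 *   ->direct_IO()              - perform the write itself
 *   invalidate_inode_pages2()  - drop now-stale cached pages; if a page
 *                                cannot be dropped the cache no longer
 *                                matches disk, so the error is returned
 */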