struct backing_dev_info default_backing_dev_info = {
.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
.state = 0,
+ .capabilities = BDI_CAP_MAP_COPY,
.unplug_io_fn = default_unplug_io_fn,
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);
return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
}
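For reference, these expressions just convert the kilobyte tunables into page counts. A quick userspace check of the arithmetic (the values 128 and 16 for VM_MAX_READAHEAD and VM_MIN_READAHEAD, and the 4K page size, are assumptions for illustration, not taken from this patch):

#include <stdio.h>

#define VM_MAX_READAHEAD	128	/* kbytes, assumed */
#define VM_MIN_READAHEAD	16	/* kbytes, assumed */
#define PAGE_CACHE_SIZE		4096	/* bytes, assumed 4K pages */

int main(void)
{
	/* 128K / 4K = 32 pages, 16K / 4K = 4 pages */
	printf("max: %lu pages, min: %lu pages\n",
	       (VM_MAX_READAHEAD * 1024UL) / PAGE_CACHE_SIZE,
	       (VM_MIN_READAHEAD * 1024UL) / PAGE_CACHE_SIZE);
	return 0;
}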
+static inline void reset_ahead_window(struct file_ra_state *ra)
+{
+ /*
+ * ... but preserve the ahead_start + ahead_size value;
+ * see the 'recheck:' label in page_cache_readahead().
+ * Note: we never use ->ahead_size as an rvalue without
+ * checking ->ahead_start != 0 first.
+ */
+ ra->ahead_size += ra->ahead_start;
+ ra->ahead_start = 0;
+}
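The invariant documented above matters at the 'recheck:' label in page_cache_readahead() below: after a reset, ahead_start + ahead_size still names the end of the window that was just torn down, so the prev_page clamp keeps working. A minimal userspace sketch of that interaction (the struct is reduced to the fields used here; illustrative only):

#include <stdio.h>

struct file_ra_state {		/* reduced to the fields used here */
	unsigned long ahead_start;
	unsigned long ahead_size;
	unsigned long prev_page;
};

static void reset_ahead_window(struct file_ra_state *ra)
{
	ra->ahead_size += ra->ahead_start;
	ra->ahead_start = 0;
}

int main(void)
{
	struct file_ra_state ra = {
		.ahead_start = 100, .ahead_size = 32, .prev_page = 140,
	};

	reset_ahead_window(&ra);
	/* ahead_start == 0 now marks the window closed, yet
	 * ahead_start + ahead_size - 1 == 131 still names the last page
	 * of the old window, so the clamp at 'recheck:' stays correct.
	 */
	if (ra.prev_page > ra.ahead_start + ra.ahead_size - 1)
		ra.prev_page = ra.ahead_start + ra.ahead_size - 1;
	printf("prev_page clamped to %lu\n", ra.prev_page);	/* 131 */
	return 0;
}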
+
static inline void ra_off(struct file_ra_state *ra)
{
ra->start = 0;
ra->flags = 0;
- ra->size = -1;
- ra->ahead_start = 0;
- ra->ahead_size = 0;
+ ra->size = 0;
+ reset_ahead_window(ra);
return;
}
{
unsigned long newsize = roundup_pow_of_two(size);
- if (newsize <= max / 64)
- newsize = newsize * newsize;
+ if (newsize <= max / 32)
+ newsize = newsize * 4;
else if (newsize <= max / 4)
- newsize = max / 4;
+ newsize = newsize * 2;
else
newsize = max;
return newsize;
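The new ramp can be checked in isolation: with a 32-page max, a 3-page first read rounds up to 4 pages, which is above max/32 but within max/4, so the initial window becomes 8 pages. A minimal userspace sketch (roundup_pow_of_two() is a trivial stand-in for the kernel helper; the 32-page max corresponds to the default 128K readahead with 4K pages, an assumption for illustration):

#include <stdio.h>

/* Trivial stand-in for the kernel's roundup_pow_of_two(). */
static unsigned long roundup_pow_of_two(unsigned long n)
{
	unsigned long p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = roundup_pow_of_two(size);

	if (newsize <= max / 32)
		newsize = newsize * 4;
	else if (newsize <= max / 4)
		newsize = newsize * 2;
	else
		newsize = max;
	return newsize;
}

int main(void)
{
	unsigned long max = 32;			/* 128K max with 4K pages */
	unsigned long sizes[] = { 1, 3, 8, 20 };
	int i;

	for (i = 0; i < 4; i++)			/* prints 4, 8, 16, 32 */
		printf("first read of %2lu pages -> initial window %lu\n",
		       sizes[i], get_init_ra_size(sizes[i], max));
	return 0;
}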
* not for each call to readahead. If a cache miss occurred, reduce the next I/O
* size, else increase it depending on how close to max we are.
*/
-static unsigned long get_next_ra_size(unsigned long cur, unsigned long max,
- unsigned long min, unsigned long * flags)
+static inline unsigned long get_next_ra_size(struct file_ra_state *ra)
{
+ unsigned long max = get_max_readahead(ra);
+ unsigned long min = get_min_readahead(ra);
+ unsigned long cur = ra->size;
unsigned long newsize;
- if (*flags & RA_FLAG_MISS) {
+ if (ra->flags & RA_FLAG_MISS) {
+ ra->flags &= ~RA_FLAG_MISS;
newsize = max((cur - 2), min);
- *flags &= ~RA_FLAG_MISS;
} else if (cur < max / 16) {
newsize = 4 * cur;
} else {
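The tail of get_next_ra_size() is elided from this hunk; the sketch below assumes the final branch saturates at max, matching mainline of this period, and the RA_FLAG_MISS value is likewise an assumption. A minimal userspace check of the sizing behaviour (the open-coded max() avoids the unsigned underflow that a literal cur - 2 would produce for tiny windows; illustrative only):

#include <stdio.h>

#define RA_FLAG_MISS	0x01	/* value assumed for this sketch */

static unsigned long next_ra_size(unsigned long cur, unsigned long max,
				  unsigned long min, unsigned long *flags)
{
	if (*flags & RA_FLAG_MISS) {
		*flags &= ~RA_FLAG_MISS;
		return cur > min + 2 ? cur - 2 : min;	/* shrink on a miss */
	}
	if (cur < max / 16)
		return 4 * cur;		/* ramp up aggressively at first */
	return max;			/* assumed: saturate at max */
}

int main(void)
{
	unsigned long flags = 0, cur = 1, max = 256, min = 4;

	while (cur < max) {		/* prints 4, 16, 256 */
		cur = next_ra_size(cur, max, min, &flags);
		printf("next window: %lu pages\n", cur);
	}

	flags |= RA_FLAG_MISS;		/* a cache miss shrinks the window */
	printf("after a miss: %lu pages\n",
	       next_ra_size(cur, max, min, &flags));	/* 254 */
	return 0;
}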
{
unsigned page_idx;
struct pagevec lru_pvec;
- int ret = 0;
+ int ret;
if (mapping->a_ops->readpages) {
ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
list_del(&page->lru);
if (!add_to_page_cache(page, mapping,
page->index, GFP_KERNEL)) {
- mapping->a_ops->readpage(filp, page);
- if (!pagevec_add(&lru_pvec, page))
- __pagevec_lru_add(&lru_pvec);
- } else {
- page_cache_release(page);
+ ret = mapping->a_ops->readpage(filp, page);
+ if (ret != AOP_TRUNCATED_PAGE) {
+ if (!pagevec_add(&lru_pvec, page))
+ __pagevec_lru_add(&lru_pvec);
+ continue;
+ } /* else fall through to release */
}
+ page_cache_release(page);
}
pagevec_lru_add(&lru_pvec);
+ ret = 0;
out:
return ret;
}
* size: Number of pages in that read
* Together, start and size form the "current window".
- * next_size: The number of pages to read on the next readahead miss.
- * Has the magical value -1UL if readahead has been disabled.
* prev_page: The page which the readahead algorithm most-recently inspected.
- * prev_page is mainly an optimisation: if page_cache_readahead
- * sees that it is again being called for a page which it just
- * looked at, it can return immediately without making any state
- * changes.
+ * It is mainly used to detect sequential file reading.
+ * If page_cache_readahead sees that it is again being called for
+ * a page which it just looked at, it can return immediately without
+ * making any state changes.
* ahead_start,
* ahead_size: Together, these form the "ahead window".
* ra_pages: The externally controlled max readahead for this fd.
*
- * When readahead is in the off state (size == -1UL), readahead is disabled.
+ * When readahead is in the off state (size == 0), readahead is disabled.
* In this state, prev_page is used to detect the resumption of sequential I/O.
*
* The readahead code manages two windows - the "current" and the "ahead"
* ahead window.
*
* A `readahead hit' occurs when a read request is made against a page which is
- * the next sequential page. Ahead windowe calculations are done only when it
+ * the next sequential page. Ahead window calculations are done only when it
* is time to submit a new IO. The code ramps up the size aggressively at first,
* but slows down as it approaches max_readahead.
*
* read happens to be the first page of the file, it is assumed that a linear
* read is about to happen and the window is immediately set to the initial size
* based on I/O request size and the max_readahead.
- *
- * A page request at (start + size) is not a miss at all - it's just a part of
- * sequential file reading.
*
* This function is to be called for every read request, rather than when
- * it is time to perform readahead. It is called only oce for the entire I/O
+ * it is time to perform readahead. It is called only once for the entire I/O
* regardless of size unless readahead is unable to start enough I/O to satisfy
* the request (I/O request > max_readahead).
*/
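To make the two-window scheme concrete, here is a minimal userspace sketch of the state transitions for a purely sequential reader. All I/O submission is stripped out, the doubling in next_size() is a stand-in for get_next_ra_size(), and unlike the real code, which shifts and reopens in a single call, this sketch reopens the ahead window on the next iteration:

#include <stdio.h>

struct ra_state {				/* window fields of file_ra_state */
	unsigned long start, size;		/* "current window" */
	unsigned long ahead_start, ahead_size;	/* "ahead window" */
};

/* Stand-in for get_next_ra_size(): double, capped at max. */
static unsigned long next_size(unsigned long cur, unsigned long max)
{
	return 2 * cur > max ? max : 2 * cur;
}

int main(void)
{
	struct ra_state ra = { .start = 0, .size = 8 };	/* after first read */
	unsigned long offset, max = 32;

	for (offset = 1; offset < 96; offset++) {
		if (ra.ahead_start == 0) {
			/* no ahead window yet: open one past the current */
			ra.ahead_size = next_size(ra.size, max);
			ra.ahead_start = ra.start + ra.size;
			printf("%3lu: open ahead [%lu, +%lu)\n",
			       offset, ra.ahead_start, ra.ahead_size);
		} else if (offset >= ra.ahead_start) {
			/* crossed into the ahead window: shift windows */
			ra.start = ra.ahead_start;
			ra.size = ra.ahead_size;
			ra.ahead_start = 0;	/* reopened next iteration */
			printf("%3lu: shift to   [%lu, +%lu)\n",
			       offset, ra.start, ra.size);
		}
	}
	return 0;
}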
static int
__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
- unsigned long offset, unsigned long nr_to_read)
+ pgoff_t offset, unsigned long nr_to_read)
{
struct inode *inode = mapping->host;
struct page *page;
/*
* Preallocate as many pages as we will need.
*/
- spin_lock_irq(&mapping->tree_lock);
+ read_lock_irq(&mapping->tree_lock);
for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
- unsigned long page_offset = offset + page_idx;
+ pgoff_t page_offset = offset + page_idx;
if (page_offset > end_index)
break;
if (page)
continue;
- spin_unlock_irq(&mapping->tree_lock);
+ read_unlock_irq(&mapping->tree_lock);
page = page_cache_alloc_cold(mapping);
- spin_lock_irq(&mapping->tree_lock);
+ read_lock_irq(&mapping->tree_lock);
if (!page)
break;
page->index = page_offset;
list_add(&page->lru, &page_pool);
ret++;
}
- spin_unlock_irq(&mapping->tree_lock);
+ read_unlock_irq(&mapping->tree_lock);
/*
* Now start the IO. We ignore I/O errors - if the page is not
* memory at once.
*/
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
- unsigned long offset, unsigned long nr_to_read)
+ pgoff_t offset, unsigned long nr_to_read)
{
int ret = 0;
* readahead isn't helping.
*
*/
-int check_ra_success(struct file_ra_state *ra, unsigned long nr_to_read,
- unsigned long actual)
+static inline int check_ra_success(struct file_ra_state *ra,
+ unsigned long nr_to_read, unsigned long actual)
{
if (actual == 0) {
ra->cache_hit += nr_to_read;
* request queues.
*/
int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
- unsigned long offset, unsigned long nr_to_read)
+ pgoff_t offset, unsigned long nr_to_read)
{
if (bdi_read_congested(mapping->backing_dev_info))
return -1;
*/
static int
blockable_page_cache_readahead(struct address_space *mapping, struct file *filp,
- unsigned long offset, unsigned long nr_to_read,
+ pgoff_t offset, unsigned long nr_to_read,
struct file_ra_state *ra, int block)
{
int actual;
- if (block) {
- actual = __do_page_cache_readahead(mapping, filp,
- offset, nr_to_read);
- } else {
- actual = do_page_cache_readahead(mapping, filp,
- offset, nr_to_read);
- if (actual == -1)
- return 0;
- }
+ if (!block && bdi_read_congested(mapping->backing_dev_info))
+ return 0;
+
+ actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
+
return check_ra_success(ra, nr_to_read, actual);
}
-/*
- * page_cache_readahead is the main function. If performs the adaptive
+static int make_ahead_window(struct address_space *mapping, struct file *filp,
+ struct file_ra_state *ra, int force)
+{
+ int block, ret;
+
+ ra->ahead_size = get_next_ra_size(ra);
+ ra->ahead_start = ra->start + ra->size;
+
+ block = force || (ra->prev_page >= ra->ahead_start);
+ ret = blockable_page_cache_readahead(mapping, filp,
+ ra->ahead_start, ra->ahead_size, ra, block);
+
+ if (!ret && !force) {
+ /* A read failure in blocking mode implies that the pages
+ * are all cached, so we can safely assume we have taken
+ * care of all the pages requested in this call.
+ * A read failure in non-blocking mode implies that we are
+ * reading more pages than requested in this call, so again
+ * we can safely assume we have taken care of all the pages
+ * requested in this call.
+ *
+ * Just reset the ahead window in case we failed due to
+ * congestion. The ahead window will be closed anyway if
+ * we failed due to excessive page cache hits.
+ */
+ reset_ahead_window(ra);
+ }
+
+ return ret;
+}
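The force/block interplay above is the subtle part: the read blocks either when the caller insists (force) or when the reader has already overtaken the window being opened, since returning without those pages would only trigger an immediate re-entry. A small truth-table sketch (illustrative; the field values are made up):

#include <stdio.h>

int main(void)
{
	unsigned long ahead_start = 100;	/* made-up window start */
	struct { int force; unsigned long prev_page; } cases[] = {
		{ 1,  50 },	/* forced: block regardless of position    */
		{ 0,  50 },	/* reader behind the window: may be async  */
		{ 0, 100 },	/* reader already inside the window: block */
	};
	int i;

	for (i = 0; i < 3; i++) {
		int block = cases[i].force ||
			    cases[i].prev_page >= ahead_start;
		printf("force=%d prev_page=%3lu -> %s readahead\n",
		       cases[i].force, cases[i].prev_page,
		       block ? "blocking" : "non-blocking");
	}
	return 0;
}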
+
+/**
+ * page_cache_readahead - generic adaptive readahead
+ * @mapping: address_space which holds the pagecache and I/O vectors
+ * @ra: file_ra_state which holds the readahead state
+ * @filp: passed on to ->readpage() and ->readpages()
+ * @offset: start offset into @mapping, in PAGE_CACHE_SIZE units
+ * @req_size: hint: total size of the read which the caller is performing in
+ * PAGE_CACHE_SIZE units
+ *
+ * page_cache_readahead() is the main function. It performs the adaptive
* readahead window size management and submits the readahead I/O.
+ *
+ * Note that @filp is purely used for passing on to the ->readpage[s]()
+ * handler: it may refer to a different file from @mapping (so we may not use
+ * @filp->f_mapping or @filp->f_dentry->d_inode here).
+ * Also, @ra may not be equal to &@filp->f_ra.
+ *
*/
unsigned long
page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
- struct file *filp, unsigned long offset,
- unsigned long req_size)
+ struct file *filp, pgoff_t offset, unsigned long req_size)
{
- unsigned long max, min;
- unsigned long newsize = req_size;
- unsigned long block;
+ unsigned long max, newsize;
+ int sequential;
/*
- * Here we detect the case where the application is performing
- * sub-page sized reads. We avoid doing extra work and bogusly
- * perturbing the readahead window expansion logic.
- * If size is zero, there is no read ahead window so we need one
+ * If the request overlaps the page we inspected last time, step past
+ * that page: we avoid doing extra work and bogusly perturbing the
+ * readahead window expansion logic.
*/
- if (offset == ra->prev_page && req_size == 1 && ra->size != 0)
- goto out;
+ if (offset == ra->prev_page && --req_size)
+ ++offset;
+
+ /* Note that prev_page == -1 on the first read */
+ sequential = (offset == ra->prev_page + 1);
+ ra->prev_page = offset;
max = get_max_readahead(ra);
- min = get_min_readahead(ra);
newsize = min(req_size, max);
- if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE)) {
- newsize = 1;
- ra->prev_page = offset;
- goto out; /* No readahead or file already in cache */
- }
+ /* No readahead or sub-page sized read or file already in cache */
+ if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE))
+ goto out;
+
+ ra->prev_page += newsize - 1;
+
/*
- * Special case - first read. We'll assume it's a whole-file read if
- * at start of file, and grow the window fast. Or detect first
+ * Special case - first read at start of file. We'll assume it's
+ * a whole-file read and grow the window fast. Or detect first
* sequential access
*/
- if ((ra->size == 0 && offset == 0) /* first io and start of file */
- || (ra->size == -1 && ra->prev_page == offset - 1)) {
- /* First sequential */
- ra->prev_page = offset + newsize - 1;
+ if (sequential && ra->size == 0) {
ra->size = get_init_ra_size(newsize, max);
ra->start = offset;
if (!blockable_page_cache_readahead(mapping, filp, offset,
* IOs, thus preventing stalls. So issue the ahead window
* immediately.
*/
- if (req_size >= max) {
- ra->ahead_size = get_next_ra_size(ra->size, max, min,
- &ra->flags);
- ra->ahead_start = ra->start + ra->size;
- blockable_page_cache_readahead(mapping, filp,
- ra->ahead_start, ra->ahead_size, ra, 1);
- }
+ if (req_size >= max)
+ make_ahead_window(mapping, filp, ra, 1);
+
goto out;
}
* partial page reads and first access were handled above,
* so this must be the next page; otherwise it is a random read
*/
- if ((offset != (ra->prev_page+1) || (ra->size == 0))) {
+ if (!sequential) {
ra_off(ra);
- ra->prev_page = offset + newsize - 1;
blockable_page_cache_readahead(mapping, filp, offset,
newsize, ra, 1);
goto out;
* If we get here we are doing sequential IO and this was not the first
* occurrence (i.e. we have an existing window)
*/
-
if (ra->ahead_start == 0) { /* no ahead window yet */
- ra->ahead_size = get_next_ra_size(ra->size, max, min,
- &ra->flags);
- ra->ahead_start = ra->start + ra->size;
- block = ((offset + newsize -1) >= ra->ahead_start);
- if (!blockable_page_cache_readahead(mapping, filp,
- ra->ahead_start, ra->ahead_size, ra, block)) {
- /* A read failure in blocking mode, implies pages are
- * all cached. So we can safely assume we have taken
- * care of all the pages requested in this call. A read
- * failure in non-blocking mode, implies we are reading
- * more pages than requested in this call. So we safely
- * assume we have taken care of all the pages requested
- * in this call.
- *
- * Just reset the ahead window in case we failed due to
- * congestion. The ahead window will any way be closed
- * in case we failed due to exessive page cache hits.
- */
- ra->ahead_start = 0;
- ra->ahead_size = 0;
- goto out;
- }
+ if (!make_ahead_window(mapping, filp, ra, 0))
+ goto recheck;
}
+
/*
* Already have an ahead window, check if we crossed into it.
* If so, shift windows and issue a new ahead window.
* we get called back on the first page of the ahead window which
* will allow us to submit more IO.
*/
- if ((offset + newsize - 1) >= ra->ahead_start) {
+ if (ra->prev_page >= ra->ahead_start) {
ra->start = ra->ahead_start;
ra->size = ra->ahead_size;
- ra->ahead_start = ra->ahead_start + ra->ahead_size;
- ra->ahead_size = get_next_ra_size(ra->ahead_size,
- max, min, &ra->flags);
- block = ((offset + newsize - 1) >= ra->ahead_start);
- if (!blockable_page_cache_readahead(mapping, filp,
- ra->ahead_start, ra->ahead_size, ra, block)) {
- /* A read failure in blocking mode, implies pages are
- * all cached. So we can safely assume we have taken
- * care of all the pages requested in this call.
- * A read failure in non-blocking mode, implies we are
- * reading more pages than requested in this call. So
- * we safely assume we have taken care of all the pages
- * requested in this call.
- *
- * Just reset the ahead window in case we failed due to
- * congestion. The ahead window will any way be closed
- * in case we failed due to excessive page cache hits.
- */
- ra->ahead_start = 0;
- ra->ahead_size = 0;
- }
+ make_ahead_window(mapping, filp, ra, 0);
+recheck:
+ /* prev_page shouldn't overrun the ahead window */
+ ra->prev_page = min(ra->prev_page,
+ ra->ahead_start + ra->ahead_size - 1);
}
out:
- ra->prev_page = offset + newsize - 1;
- return(newsize);
+ return ra->prev_page + 1;
}
+EXPORT_SYMBOL_GPL(page_cache_readahead);
/*
* handle_ra_miss() is called when it is known that a page which should have
{
ra->flags |= RA_FLAG_MISS;
ra->flags &= ~RA_FLAG_INCACHE;
+ ra->cache_hit = 0;
}
/*