X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=mm%2Freadahead.c;h=a5e6906a01e03073ff399da20f75821efb47e2dc;hb=f1227cd3e0e73c48b93368800aa89f4341103a00;hp=b840e7c6ea740433d1851c379705efe191259e1d;hpb=340e2b1a4c74f653454348914c408420d5d3c28a;p=linux-2.6.git diff --git a/mm/readahead.c b/mm/readahead.c index b840e7c6e..a5e6906a0 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -23,7 +23,6 @@ EXPORT_SYMBOL(default_unplug_io_fn); struct backing_dev_info default_backing_dev_info = { .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE, .state = 0, - .capabilities = BDI_CAP_MAP_COPY, .unplug_io_fn = default_unplug_io_fn, }; EXPORT_SYMBOL_GPL(default_backing_dev_info); @@ -36,7 +35,7 @@ void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) { ra->ra_pages = mapping->backing_dev_info->ra_pages; - ra->prev_page = -1; + ra->average = ra->ra_pages / 2; } /* @@ -52,58 +51,6 @@ static inline unsigned long get_min_readahead(struct file_ra_state *ra) return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; } -static inline void ra_off(struct file_ra_state *ra) -{ - ra->start = 0; - ra->flags = 0; - ra->size = 0; - ra->ahead_start = 0; - ra->ahead_size = 0; - return; -} - -/* - * Set the initial window size, round to next power of 2 and square - * for small size, x 4 for medium, and x 2 for large - * for 128k (32 page) max ra - * 1-8 page = 32k initial, > 8 page = 128k initial - */ -static unsigned long get_init_ra_size(unsigned long size, unsigned long max) -{ - unsigned long newsize = roundup_pow_of_two(size); - - if (newsize <= max / 64) - newsize = newsize * newsize; - else if (newsize <= max / 4) - newsize = max / 4; - else - newsize = max; - return newsize; -} - -/* - * Set the new window size, this is called only when I/O is to be submitted, - * not for each call to readahead. If a cache miss occured, reduce next I/O - * size, else increase depending on how close to max we are. - */ -static inline unsigned long get_next_ra_size(struct file_ra_state *ra) -{ - unsigned long max = get_max_readahead(ra); - unsigned long min = get_min_readahead(ra); - unsigned long cur = ra->size; - unsigned long newsize; - - if (ra->flags & RA_FLAG_MISS) { - ra->flags &= ~RA_FLAG_MISS; - newsize = max((cur - 2), min); - } else if (cur < max / 16) { - newsize = 4 * cur; - } else { - newsize = 2 * cur; - } - return min(newsize, max); -} - #define list_to_page(head) (list_entry((head)->prev, struct page, lru)) /** @@ -118,7 +65,7 @@ static inline unsigned long get_next_ra_size(struct file_ra_state *ra) * Hides the details of the LRU cache etc from the filesystems. */ int read_cache_pages(struct address_space *mapping, struct list_head *pages, - int (*filler)(void *, struct page *), void *data) + int (*filler)(void *, struct page *), void *data) { struct page *page; struct pagevec lru_pvec; @@ -193,25 +140,30 @@ out: * size: Number of pages in that read * Together, these form the "current window". * Together, start and size represent the `readahead window'. + * next_size: The number of pages to read on the next readahead miss. + * Has the magical value -1UL if readahead has been disabled. * prev_page: The page which the readahead algorithm most-recently inspected. - * It is mainly used to detect sequential file reading. - * If page_cache_readahead sees that it is again being called for - * a page which it just looked at, it can return immediately without - * making any state changes. 
+ * prev_page is mainly an optimisation: if page_cache_readahead + * sees that it is again being called for a page which it just + * looked at, it can return immediately without making any state + * changes. * ahead_start, * ahead_size: Together, these form the "ahead window". * ra_pages: The externally controlled max readahead for this fd. * - * When readahead is in the off state (size == 0), readahead is disabled. - * In this state, prev_page is used to detect the resumption of sequential I/O. + * When readahead is in the "maximally shrunk" state (next_size == -1UL), + * readahead is disabled. In this state, prev_page and size are used, inside + * handle_ra_miss(), to detect the resumption of sequential I/O. Once there + * has been a decent run of sequential I/O (defined by get_min_readahead), + * readahead is reenabled. * * The readahead code manages two windows - the "current" and the "ahead" * windows. The intent is that while the application is walking the pages * in the current window, I/O is underway on the ahead window. When the * current window is fully traversed, it is replaced by the ahead window * and the ahead window is invalidated. When this copying happens, the - * new current window's pages are probably still locked. So - * we submit a new batch of I/O immediately, creating a new ahead window. + * new current window's pages are probably still locked. When I/O has + * completed, we submit a new batch of I/O, creating a new ahead window. * * So: * @@ -223,22 +175,34 @@ out: * ahead window. * * A `readahead hit' occurs when a read request is made against a page which is - * the next sequential page. Ahead window calculations are done only when it - * is time to submit a new IO. The code ramps up the size agressively at first, - * but slow down as it approaches max_readhead. + * inside the current window. Hits are good, and the window size (next_size) + * is grown aggressively when hits occur. Two pages are added to the next + * window size on each hit, which will end up doubling the next window size by + * the time I/O is submitted for it. * - * Any seek/ramdom IO will result in readahead being turned off. It will resume - * at the first sequential access. + * If readahead hits are more sparse (say, the application is only reading + * every second page) then the window will build more slowly. + * + * On a readahead miss (the application seeked away) the readahead window is + * shrunk by 25%. We don't want to drop it too aggressively, because it is a + * good assumption that an application which has built a good readahead window + * will continue to perform linear reads. Either at the new file position, or + * at the old one after another seek. + * + * After enough misses, readahead is fully disabled. (next_size = -1UL). * * There is a special-case: if the first page which the application tries to * read happens to be the first page of the file, it is assumed that a linear - * read is about to happen and the window is immediately set to the initial size - * based on I/O request size and the max_readahead. + * read is about to happen and the window is immediately set to half of the + * device maximum. + * + * A page request at (start + size) is not a miss at all - it's just a part of + * sequential file reading. * - * This function is to be called for every read request, rather than when - * it is time to perform readahead. 
It is called only once for the entire I/O - * regardless of size unless readahead is unable to start enough I/O to satisfy - * the request (I/O request > max_readahead). + * This function is to be called for every page which is read, rather than when + * it is time to perform readahead. This is so the readahead algorithm can + * centrally work out the access patterns. This could be costly with many tiny + * read()s, so we specifically optimise for that case with prev_page. */ /* @@ -247,12 +211,9 @@ out: * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. * - * Returns the number of pages requested, or the maximum amount of I/O allowed. - * - * do_page_cache_readahead() returns -1 if it encountered request queue - * congestion. + * Returns the number of pages which actually had IO started against them. */ -static int +static inline int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, unsigned long offset, unsigned long nr_to_read) { @@ -272,7 +233,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, /* * Preallocate as many pages as we will need. */ - read_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); for (page_idx = 0; page_idx < nr_to_read; page_idx++) { unsigned long page_offset = offset + page_idx; @@ -283,16 +244,16 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, if (page) continue; - read_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); page = page_cache_alloc_cold(mapping); - read_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); if (!page) break; page->index = page_offset; list_add(&page->lru, &page_pool); ret++; } - read_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); /* * Now start the IO. We ignore I/O errors - if the page is not @@ -338,28 +299,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, return ret; } -/* - * Check how effective readahead is being. If the amount of started IO is - * less than expected then the file is partly or fully in pagecache and - * readahead isn't helping. - * - */ -static inline int check_ra_success(struct file_ra_state *ra, - unsigned long nr_to_read, unsigned long actual) -{ - if (actual == 0) { - ra->cache_hit += nr_to_read; - if (ra->cache_hit >= VM_MAX_CACHE_HIT) { - ra_off(ra); - ra->flags |= RA_FLAG_INCACHE; - return 0; - } - } else { - ra->cache_hit=0; - } - return 1; -} - /* * This version skips the IO if the queue is read-congested, and will tell the * block layer to abandon the readahead if request allocation would block. @@ -370,176 +309,256 @@ static inline int check_ra_success(struct file_ra_state *ra, int do_page_cache_readahead(struct address_space *mapping, struct file *filp, unsigned long offset, unsigned long nr_to_read) { - if (bdi_read_congested(mapping->backing_dev_info)) - return -1; - - return __do_page_cache_readahead(mapping, filp, offset, nr_to_read); + if (!bdi_read_congested(mapping->backing_dev_info)) + return __do_page_cache_readahead(mapping, filp, + offset, nr_to_read); + return 0; } /* - * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block' - * is set wait till the read completes. Otherwise attempt to read without - * blocking. - * Returns 1 meaning 'success' if read is succesfull without switching off - * readhaead mode. Otherwise return failure. + * Check how effective readahead is being. 
If the amount of started IO is + * less than expected then the file is partly or fully in pagecache and + * readahead isn't helping. Shrink the window. + * + * But don't shrink it too much - the application may read the same page + * occasionally. */ -static int -blockable_page_cache_readahead(struct address_space *mapping, struct file *filp, - unsigned long offset, unsigned long nr_to_read, - struct file_ra_state *ra, int block) +static inline void +check_ra_success(struct file_ra_state *ra, pgoff_t attempt, + pgoff_t actual, pgoff_t orig_next_size) { - int actual; - - if (!block && bdi_read_congested(mapping->backing_dev_info)) - return 0; - - actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read); - - return check_ra_success(ra, nr_to_read, actual); -} - -static int make_ahead_window(struct address_space *mapping, struct file *filp, - struct file_ra_state *ra, int force) -{ - int block, ret; - - ra->ahead_size = get_next_ra_size(ra); - ra->ahead_start = ra->start + ra->size; - - block = force || (ra->prev_page >= ra->ahead_start); - ret = blockable_page_cache_readahead(mapping, filp, - ra->ahead_start, ra->ahead_size, ra, block); - - if (!ret && !force) { - /* A read failure in blocking mode, implies pages are - * all cached. So we can safely assume we have taken - * care of all the pages requested in this call. - * A read failure in non-blocking mode, implies we are - * reading more pages than requested in this call. So - * we safely assume we have taken care of all the pages - * requested in this call. - * - * Just reset the ahead window in case we failed due to - * congestion. The ahead window will any way be closed - * in case we failed due to excessive page cache hits. - */ - ra->ahead_start = 0; - ra->ahead_size = 0; + if (actual == 0) { + if (orig_next_size > 1) { + ra->next_size = orig_next_size - 1; + if (ra->ahead_size) + ra->ahead_size = ra->next_size; + } else { + ra->next_size = -1UL; + ra->size = 0; + } } - - return ret; } /* * page_cache_readahead is the main function. If performs the adaptive * readahead window size management and submits the readahead I/O. */ -unsigned long +void page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, - struct file *filp, unsigned long offset, - unsigned long req_size) + struct file *filp, unsigned long offset) { - unsigned long max, newsize; - int sequential; + unsigned max; + unsigned orig_next_size; + unsigned actual; + int first_access=0; + unsigned long average; /* - * We avoid doing extra work and bogusly perturbing the readahead - * window expansion logic. + * Here we detect the case where the application is performing + * sub-page sized reads. We avoid doing extra work and bogusly + * perturbing the readahead window expansion logic. + * If next_size is zero, this is the very first read for this + * file handle, or the window is maximally shrunk. 
*/ - if (offset == ra->prev_page && --req_size) - ++offset; + if (offset == ra->prev_page) { + if (ra->next_size != 0) + goto out; + } - /* Note that prev_page == -1 if it is a first read */ - sequential = (offset == ra->prev_page + 1); - ra->prev_page = offset; + if (ra->next_size == -1UL) + goto out; /* Maximally shrunk */ max = get_max_readahead(ra); - newsize = min(req_size, max); + if (max == 0) + goto out; /* No readahead */ - /* No readahead or sub-page sized read or file already in cache */ - if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE)) - goto out; + orig_next_size = ra->next_size; - ra->prev_page += newsize - 1; + if (ra->next_size == 0) { + /* + * Special case - first read. + * We'll assume it's a whole-file read, and + * grow the window fast. + */ + first_access=1; + ra->next_size = max / 2; + ra->prev_page = offset; + ra->currnt_wnd_hit++; + goto do_io; + } - /* - * Special case - first read at start of file. We'll assume it's - * a whole-file read and grow the window fast. Or detect first - * sequential access - */ - if (sequential && ra->size == 0) { - ra->size = get_init_ra_size(newsize, max); - ra->start = offset; - if (!blockable_page_cache_readahead(mapping, filp, offset, - ra->size, ra, 1)) - goto out; + ra->prev_page = offset; + if (offset >= ra->start && offset <= (ra->start + ra->size)) { /* - * If the request size is larger than our max readahead, we - * at least want to be sure that we get 2 IOs in flight and - * we know that we will definitly need the new I/O. - * once we do this, subsequent calls should be able to overlap - * IOs,* thus preventing stalls. so issue the ahead window - * immediately. + * A readahead hit. Either inside the window, or one + * page beyond the end. Expand the next readahead size. */ - if (req_size >= max) - make_ahead_window(mapping, filp, ra, 1); + ra->next_size += 2; - goto out; + if (ra->currnt_wnd_hit <= (max * 2)) + ra->currnt_wnd_hit++; + } else { + /* + * A miss - lseek, pagefault, pread, etc. Shrink the readahead + * window. + */ + ra->next_size -= 2; + + average = ra->average; + if (average < ra->currnt_wnd_hit) { + average++; + } + ra->average = (average + ra->currnt_wnd_hit) / 2; + ra->currnt_wnd_hit = 1; } - /* - * Now handle the random case: - * partial page reads and first access were handled above, - * so this must be the next page otherwise it is random - */ - if (!sequential) { - ra_off(ra); - blockable_page_cache_readahead(mapping, filp, offset, - newsize, ra, 1); - goto out; + if ((long)ra->next_size > (long)max) + ra->next_size = max; + if ((long)ra->next_size <= 0L) { + ra->next_size = -1UL; + ra->size = 0; + goto out; /* Readahead is off */ } /* - * If we get here we are doing sequential IO and this was not the first - * occurence (ie we have an existing window) + * Is this request outside the current window? */ - - if (ra->ahead_start == 0) { /* no ahead window yet */ - if (!make_ahead_window(mapping, filp, ra, 0)) + if (offset < ra->start || offset >= (ra->start + ra->size)) { + /* + * A miss against the current window. Have we merely + * advanced into the ahead window? + */ + if (offset == ra->ahead_start) { + /* + * Yes, we have. The ahead window now becomes + * the current window. + */ + ra->start = ra->ahead_start; + ra->size = ra->ahead_size; + ra->prev_page = ra->start; + ra->ahead_start = 0; + ra->ahead_size = 0; + + /* + * Control now returns, probably to sleep until I/O + * completes against the first ahead page. 
+ * When the second page in the old ahead window is + * requested, control will return here and more I/O + * will be submitted to build the new ahead window. + */ goto out; + } +do_io: + /* + * This is the "unusual" path. We come here during + * startup or after an lseek. We invalidate the + * ahead window and get some I/O underway for the new + * current window. + */ + if (!first_access) { + /* Heuristic: there is a high probability + * that around ra->average number of + * pages shall be accessed in the next + * current window. + */ + average = ra->average; + if (ra->currnt_wnd_hit > average) + average = (ra->currnt_wnd_hit + ra->average + 1) / 2; + + ra->next_size = min(average , (unsigned long)max); + } + ra->start = offset; + ra->size = ra->next_size; + ra->ahead_start = 0; /* Invalidate these */ + ra->ahead_size = 0; + actual = do_page_cache_readahead(mapping, filp, offset, + ra->size); + if(!first_access) { + /* + * do not adjust the readahead window size the first + * time, the ahead window might get closed if all + * the pages are already in the cache. + */ + check_ra_success(ra, ra->size, actual, orig_next_size); + } + } else { + /* + * This read request is within the current window. It may be + * time to submit I/O for the ahead window while the + * application is about to step into the ahead window. + */ + if (ra->ahead_start == 0) { + /* + * If the average io-size is more than maximum + * readahead size of the file the io pattern is + * sequential. Hence bring in the readahead window + * immediately. + * If the average io-size is less than maximum + * readahead size of the file the io pattern is + * random. Hence don't bother to readahead. + */ + average = ra->average; + if (ra->currnt_wnd_hit > average) + average = (ra->currnt_wnd_hit + ra->average + 1) / 2; + + if (average > max) { + ra->ahead_start = ra->start + ra->size; + ra->ahead_size = ra->next_size; + actual = do_page_cache_readahead(mapping, filp, + ra->ahead_start, ra->ahead_size); + check_ra_success(ra, ra->ahead_size, + actual, orig_next_size); + } + } } - /* - * Already have an ahead window, check if we crossed into it. - * If so, shift windows and issue a new ahead window. - * Only return the #pages that are in the current window, so that - * we get called back on the first page of the ahead window which - * will allow us to submit more IO. - */ - if (ra->prev_page >= ra->ahead_start) { - ra->start = ra->ahead_start; - ra->size = ra->ahead_size; - make_ahead_window(mapping, filp, ra, 0); - } - out: - return ra->prev_page + 1; + return; } + /* * handle_ra_miss() is called when it is known that a page which should have * been present in the pagecache (we just did some readahead there) was in fact * not found. This will happen if it was evicted by the VM (readahead - * thrashing) + * thrashing) or if the readahead window is maximally shrunk. + * + * If the window has been maximally shrunk (next_size == -1UL) then look to see + * if we are getting misses against sequential file offsets. If so, and this + * persists then resume readahead. * - * Turn on the cache miss flag in the RA struct, this will cause the RA code - * to reduce the RA size on the next read. + * Otherwise we're thrashing, so shrink the readahead window by three pages. + * This is because it is grown by two pages on a readahead hit. Theory being + * that the readahead window size will stabilise around the maximum level at + * which there is no thrashing. 
*/ void handle_ra_miss(struct address_space *mapping, struct file_ra_state *ra, pgoff_t offset) { - ra->flags |= RA_FLAG_MISS; - ra->flags &= ~RA_FLAG_INCACHE; + if (ra->next_size == -1UL) { + const unsigned long max = get_max_readahead(ra); + + if (offset != ra->prev_page + 1) { + ra->size = ra->size?ra->size-1:0; /* Not sequential */ + } else { + ra->size++; /* A sequential read */ + if (ra->size >= max) { /* Resume readahead */ + ra->start = offset - max; + ra->next_size = max; + ra->size = max; + ra->ahead_start = 0; + ra->ahead_size = 0; + ra->average = max / 2; + } + } + ra->prev_page = offset; + } else { + const unsigned long min = get_min_readahead(ra); + + ra->next_size -= 3; + if (ra->next_size < min) + ra->next_size = min; + } } /*
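
For illustration only (not part of the patch): the window-sizing arithmetic that
the new page_cache_readahead()/handle_ra_miss() code implements can be reduced
to a few lines. The sketch below uses simplified stand-in names (struct
ra_sketch, ra_hit, ra_miss, ra_thrash) rather than the kernel's struct
file_ra_state, and only mirrors the grow-by-2 / shrink-by-2 / shrink-by-3
arithmetic described in the comments above; it is a standalone user-space
program, not kernel code.

#include <stdio.h>

/* Simplified stand-in for the readahead size fields of struct file_ra_state. */
struct ra_sketch {
	unsigned long next_size;	/* pages to read next time; -1UL means "readahead off" */
	unsigned long max;		/* per-file maximum (ra_pages) */
	unsigned long min;		/* floor used when thrashing (cf. get_min_readahead) */
};

/* Readahead hit: grow the next window by two pages, clamped at max. */
static void ra_hit(struct ra_sketch *ra)
{
	if (ra->next_size == -1UL)
		return;			/* readahead is disabled */
	ra->next_size += 2;
	if (ra->next_size > ra->max)
		ra->next_size = ra->max;
}

/* Readahead miss (a seek): shrink by two pages; turn readahead off once the
 * window would drop to zero. */
static void ra_miss(struct ra_sketch *ra)
{
	if (ra->next_size == -1UL)
		return;
	if (ra->next_size <= 2)
		ra->next_size = -1UL;	/* "maximally shrunk": readahead off */
	else
		ra->next_size -= 2;
}

/* Pagecache miss on a readahead page (thrashing): shrink by three, floor at min. */
static void ra_thrash(struct ra_sketch *ra)
{
	if (ra->next_size == -1UL)
		return;			/* the resume-detection path applies instead */
	if (ra->next_size < ra->min + 3)
		ra->next_size = ra->min;
	else
		ra->next_size -= 3;
}

int main(void)
{
	struct ra_sketch ra = { .next_size = 16, .max = 32, .min = 4 };
	int i;

	for (i = 0; i < 10; i++)
		ra_hit(&ra);		/* sequential hits ramp the window up to max */
	printf("after 10 hits:    %lu pages\n", ra.next_size);	/* prints 32 */

	for (i = 0; i < 5; i++)
		ra_thrash(&ra);		/* thrashing walks the window back down */
	printf("after 5 thrashes: %lu pages\n", ra.next_size);	/* prints 17 */

	for (i = 0; i < 20; i++)
		ra_miss(&ra);		/* repeated seeks eventually disable readahead */
	printf("after 20 misses:  %s\n",
	       ra.next_size == -1UL ? "off" : "still on");
	return 0;
}

As the handle_ra_miss() comment in the patch notes, growing by two pages on a
hit while shrinking by three on a thrash is what lets the window size settle
just below the level at which thrashing begins.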