diff --git a/mm/readahead.c b/mm/readahead.c
index a5e6906a0..b7c3b746d 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -35,7 +35,7 @@
 void
 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
 {
 	ra->ra_pages = mapping->backing_dev_info->ra_pages;
-	ra->average = ra->ra_pages / 2;
+	ra->prev_page = -1;
 }
 
 /*
@@ -51,6 +51,56 @@ static inline unsigned long get_min_readahead(struct file_ra_state *ra)
 	return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 }
 
+static inline void ra_off(struct file_ra_state *ra)
+{
+	ra->start = 0;
+	ra->flags = 0;
+	ra->size = -1;
+	ra->ahead_start = 0;
+	ra->ahead_size = 0;
+	return;
+}
+
+/*
+ * Set the initial window size: round the request up to the next power of 2,
+ * then square it for small sizes, use max/4 for medium sizes and max for
+ * large ones. For a 128k (32 page) max readahead:
+ * a 1-8 page first read gets a 32k initial window, > 8 pages gets 128k.
+ */
+static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
+{
+	unsigned long newsize = roundup_pow_of_two(size);
+
+	if (newsize <= max / 64)
+		newsize = newsize * newsize;
+	else if (newsize <= max / 4)
+		newsize = max / 4;
+	else
+		newsize = max;
+	return newsize;
+}
+
+/*
+ * Set the new window size. This is called only when I/O is to be submitted,
+ * not for each call to readahead. If a cache miss occurred, reduce the next
+ * I/O size, else increase it depending on how close to max we are.
+ */
+static unsigned long get_next_ra_size(unsigned long cur, unsigned long max,
+				unsigned long min, unsigned long *flags)
+{
+	unsigned long newsize;
+
+	if (*flags & RA_FLAG_MISS) {
+		newsize = max((cur - 2), min);
+		*flags &= ~RA_FLAG_MISS;
+	} else if (cur < max / 16) {
+		newsize = 4 * cur;
+	} else {
+		newsize = 2 * cur;
+	}
+	return min(newsize, max);
+}
+
 #define list_to_page(head) (list_entry((head)->prev, struct page, lru))
 
 /**
@@ -65,7 +115,7 @@ static inline unsigned long get_min_readahead(struct file_ra_state *ra)
  * Hides the details of the LRU cache etc from the filesystems.
  */
 int read_cache_pages(struct address_space *mapping, struct list_head *pages,
-		int (*filler)(void *, struct page *), void *data)
+			int (*filler)(void *, struct page *), void *data)
 {
 	struct page *page;
 	struct pagevec lru_pvec;
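To make the sizing policy concrete: below is a standalone userspace sketch (not part of the patch) that mirrors get_init_ra_size() and the growth step of get_next_ra_size() for a 128k (32 page) max readahead with 4k pages. roundup_pow_of_two() is open-coded and the cache-miss/min handling is omitted.

/* sizing_sketch.c - approximate readahead window ramp-up (illustrative) */
#include <stdio.h>

static unsigned long pow2_roundup(unsigned long n)
{
	unsigned long p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

/* mirrors get_init_ra_size(): square small sizes, max/4 medium, max large */
static unsigned long init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = pow2_roundup(size);

	if (newsize <= max / 64)
		newsize = newsize * newsize;
	else if (newsize <= max / 4)
		newsize = max / 4;
	else
		newsize = max;
	return newsize;
}

/* mirrors get_next_ra_size() without the cache-miss branch */
static unsigned long next_ra_size(unsigned long cur, unsigned long max)
{
	unsigned long newsize = (cur < max / 16) ? 4 * cur : 2 * cur;

	return newsize < max ? newsize : max;
}

int main(void)
{
	unsigned long max = 32;				/* 128k of 4k pages */
	unsigned long size = init_ra_size(2, max);	/* 2-page first read */

	while (size < max) {
		printf("window = %lu pages\n", size);
		size = next_ra_size(size, max);
	}
	printf("window = %lu pages (capped at max)\n", size);
	return 0;
}

A 2-page first read starts with an 8-page (32k) window, then doubles through 16 pages to the 32-page cap, matching the "1-8 page read gets 32k" case in the comment above.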
@@ -151,19 +201,16 @@ out:
  * ahead_size:  Together, these form the "ahead window".
  * ra_pages:    The externally controlled max readahead for this fd.
  *
- * When readahead is in the "maximally shrunk" state (next_size == -1UL),
- * readahead is disabled. In this state, prev_page and size are used, inside
- * handle_ra_miss(), to detect the resumption of sequential I/O. Once there
- * has been a decent run of sequential I/O (defined by get_min_readahead),
- * readahead is reenabled.
+ * Readahead is disabled ("off state") when size == -1UL. In this state,
+ * prev_page is used to detect the resumption of sequential I/O.
  *
  * The readahead code manages two windows - the "current" and the "ahead"
  * windows. The intent is that while the application is walking the pages
  * in the current window, I/O is underway on the ahead window. When the
  * current window is fully traversed, it is replaced by the ahead window
  * and the ahead window is invalidated. When this copying happens, the
- * new current window's pages are probably still locked. When I/O has
- * completed, we submit a new batch of I/O, creating a new ahead window.
+ * new current window's pages are probably still locked. So
+ * we submit a new batch of I/O immediately, creating a new ahead window.
  *
  * So:
  *
@@ -175,34 +222,25 @@ out:
  *                        ahead window.
  *
  * A `readahead hit' occurs when a read request is made against a page which is
- * inside the current window. Hits are good, and the window size (next_size)
- * is grown aggressively when hits occur. Two pages are added to the next
- * window size on each hit, which will end up doubling the next window size by
- * the time I/O is submitted for it.
- *
- * If readahead hits are more sparse (say, the application is only reading
- * every second page) then the window will build more slowly.
- *
- * On a readahead miss (the application seeked away) the readahead window is
- * shrunk by 25%. We don't want to drop it too aggressively, because it is a
- * good assumption that an application which has built a good readahead window
- * will continue to perform linear reads. Either at the new file position, or
- * at the old one after another seek.
+ * the next sequential page. Ahead window calculations are done only when it
+ * is time to submit a new I/O. The code ramps the size up aggressively at
+ * first, but slows down as it approaches max_readahead.
  *
- * After enough misses, readahead is fully disabled. (next_size = -1UL).
+ * Any seek/random I/O will result in readahead being turned off. It will
+ * resume at the first sequential access.
  *
  * There is a special-case: if the first page which the application tries to
  * read happens to be the first page of the file, it is assumed that a linear
- * read is about to happen and the window is immediately set to half of the
- * device maximum.
+ * read is about to happen and the window is immediately set to the initial
+ * size based on the I/O request size and max_readahead.
  *
  * A page request at (start + size) is not a miss at all - it's just a part of
  * sequential file reading.
  *
- * This function is to be called for every page which is read, rather than when
- * it is time to perform readahead. This is so the readahead algorithm can
- * centrally work out the access patterns. This could be costly with many tiny
- * read()s, so we specifically optimise for that case with prev_page.
+ * This function is to be called for every read request, rather than when
+ * it is time to perform readahead. It is called only once for the entire I/O,
+ * regardless of size, unless readahead is unable to start enough I/O to
+ * satisfy the request (I/O request > max_readahead).
  */
 
 /*
@@ -211,9 +249,12 @@ out:
  * behaviour which would occur if page allocations are causing VM writeback.
  * We really don't want to intermingle reads and writes like that.
  *
- * Returns the number of pages which actually had IO started against them.
+ * Returns the number of pages requested, or the maximum amount of I/O allowed.
+ *
+ * do_page_cache_readahead() returns -1 if it encountered request queue
+ * congestion.
  */
-static inline int
+static int
 __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 			unsigned long offset, unsigned long nr_to_read)
 {
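The changed return-value contract matters to the caller (do_generic_mapping_read() in mm/filemap.c): page_cache_readahead() now reports how many pages it has accounted for, and the caller should not call back until the read moves past that point. A minimal userspace sketch of that contract, with a hypothetical stub_readahead() standing in for the real function and purely illustrative numbers:

/* caller_sketch.c - when the caller re-enters readahead (illustrative) */
#include <stdio.h>

/* hypothetical stand-in: mirrors "newsize = min(req_size, max)" */
static unsigned long stub_readahead(unsigned long req_size, unsigned long max)
{
	return req_size < max ? req_size : max;
}

int main(void)
{
	unsigned long max = 32;			/* max_readahead, in pages */
	unsigned long offset = 0, end = 100;	/* a 100-page read() */

	while (offset < end) {
		unsigned long done = stub_readahead(end - offset, max);

		printf("readahead accounts for pages %lu-%lu\n",
		       offset, offset + done - 1);
		offset += done;	/* call back only past the covered range */
	}
	return 0;
}

For a 100-page read with a 32-page max_readahead this prints four ranges (0-31, 32-63, 64-95, 96-99): the function is re-entered only because the request exceeds what one readahead pass may cover.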
@@ -299,6 +340,28 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 	return ret;
 }
 
+/*
+ * Check how effective readahead is being. If the amount of started I/O is
+ * less than expected then the file is partly or fully in pagecache and
+ * readahead isn't helping.
+ *
+ */
+int check_ra_success(struct file_ra_state *ra, unsigned long nr_to_read,
+			unsigned long actual)
+{
+	if (actual == 0) {
+		ra->cache_hit += nr_to_read;
+		if (ra->cache_hit >= VM_MAX_CACHE_HIT) {
+			ra_off(ra);
+			ra->flags |= RA_FLAG_INCACHE;
+			return 0;
+		}
+	} else {
+		ra->cache_hit = 0;
+	}
+	return 1;
+}
+
 /*
  * This version skips the IO if the queue is read-congested, and will tell the
  * block layer to abandon the readahead if request allocation would block.
@@ -309,256 +372,196 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 			unsigned long offset, unsigned long nr_to_read)
 {
-	if (!bdi_read_congested(mapping->backing_dev_info))
-		return __do_page_cache_readahead(mapping, filp,
-						offset, nr_to_read);
-	return 0;
+	if (bdi_read_congested(mapping->backing_dev_info))
+		return -1;
+
+	return __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
 }
 
 /*
- * Check how effective readahead is being. If the amount of started IO is
- * less than expected then the file is partly or fully in pagecache and
- * readahead isn't helping. Shrink the window.
- *
- * But don't shrink it too much - the application may read the same page
- * occasionally.
+ * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block'
+ * is set, wait until the read completes. Otherwise attempt to read without
+ * blocking.
+ * Returns 1 ('success') if the read was successful without switching off
+ * readahead mode. Otherwise returns failure (0).
  */
-static inline void
-check_ra_success(struct file_ra_state *ra, pgoff_t attempt,
-			pgoff_t actual, pgoff_t orig_next_size)
+static int
+blockable_page_cache_readahead(struct address_space *mapping, struct file *filp,
+			unsigned long offset, unsigned long nr_to_read,
+			struct file_ra_state *ra, int block)
 {
-	if (actual == 0) {
-		if (orig_next_size > 1) {
-			ra->next_size = orig_next_size - 1;
-			if (ra->ahead_size)
-				ra->ahead_size = ra->next_size;
-		} else {
-			ra->next_size = -1UL;
-			ra->size = 0;
-		}
+	int actual;
+
+	if (block) {
+		actual = __do_page_cache_readahead(mapping, filp,
+						offset, nr_to_read);
+	} else {
+		actual = do_page_cache_readahead(mapping, filp,
+						offset, nr_to_read);
+		if (actual == -1)
+			return 0;
 	}
+	return check_ra_success(ra, nr_to_read, actual);
 }
 
 /*
- * page_cache_readahead is the main function. If performs the adaptive
+ * page_cache_readahead is the main function. It performs the adaptive
  * readahead window size management and submits the readahead I/O.
  */
-void
+unsigned long
 page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
-		struct file *filp, unsigned long offset)
+			struct file *filp, unsigned long offset,
+			unsigned long req_size)
 {
-	unsigned max;
-	unsigned orig_next_size;
-	unsigned actual;
-	int first_access=0;
-	unsigned long average;
+	unsigned long max, min;
+	unsigned long newsize = req_size;
+	unsigned long block;
 
 	/*
 	 * Here we detect the case where the application is performing
 	 * sub-page sized reads. We avoid doing extra work and bogusly
 	 * perturbing the readahead window expansion logic.
-	 * If next_size is zero, this is the very first read for this
-	 * file handle, or the window is maximally shrunk.
+ * If size is zero, there is no read ahead window so we need one */ - if (offset == ra->prev_page) { - if (ra->next_size != 0) - goto out; - } - - if (ra->next_size == -1UL) - goto out; /* Maximally shrunk */ + if (offset == ra->prev_page && req_size == 1 && ra->size != 0) + goto out; max = get_max_readahead(ra); - if (max == 0) - goto out; /* No readahead */ + min = get_min_readahead(ra); + newsize = min(req_size, max); - orig_next_size = ra->next_size; - - if (ra->next_size == 0) { - /* - * Special case - first read. - * We'll assume it's a whole-file read, and - * grow the window fast. - */ - first_access=1; - ra->next_size = max / 2; + if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE)) { + newsize = 1; ra->prev_page = offset; - ra->currnt_wnd_hit++; - goto do_io; + goto out; /* No readahead or file already in cache */ } + /* + * Special case - first read. We'll assume it's a whole-file read if + * at start of file, and grow the window fast. Or detect first + * sequential access + */ + if ((ra->size == 0 && offset == 0) /* first io and start of file */ + || (ra->size == -1 && ra->prev_page == offset - 1)) { + /* First sequential */ + ra->prev_page = offset + newsize - 1; + ra->size = get_init_ra_size(newsize, max); + ra->start = offset; + if (!blockable_page_cache_readahead(mapping, filp, offset, + ra->size, ra, 1)) + goto out; - ra->prev_page = offset; - - if (offset >= ra->start && offset <= (ra->start + ra->size)) { - /* - * A readahead hit. Either inside the window, or one - * page beyond the end. Expand the next readahead size. - */ - ra->next_size += 2; - - if (ra->currnt_wnd_hit <= (max * 2)) - ra->currnt_wnd_hit++; - } else { /* - * A miss - lseek, pagefault, pread, etc. Shrink the readahead - * window. + * If the request size is larger than our max readahead, we + * at least want to be sure that we get 2 IOs in flight and + * we know that we will definitly need the new I/O. + * once we do this, subsequent calls should be able to overlap + * IOs,* thus preventing stalls. so issue the ahead window + * immediately. */ - ra->next_size -= 2; - - average = ra->average; - if (average < ra->currnt_wnd_hit) { - average++; + if (req_size >= max) { + ra->ahead_size = get_next_ra_size(ra->size, max, min, + &ra->flags); + ra->ahead_start = ra->start + ra->size; + blockable_page_cache_readahead(mapping, filp, + ra->ahead_start, ra->ahead_size, ra, 1); } - ra->average = (average + ra->currnt_wnd_hit) / 2; - ra->currnt_wnd_hit = 1; + goto out; } - if ((long)ra->next_size > (long)max) - ra->next_size = max; - if ((long)ra->next_size <= 0L) { - ra->next_size = -1UL; - ra->size = 0; - goto out; /* Readahead is off */ + /* + * Now handle the random case: + * partial page reads and first access were handled above, + * so this must be the next page otherwise it is random + */ + if ((offset != (ra->prev_page+1) || (ra->size == 0))) { + ra_off(ra); + ra->prev_page = offset + newsize - 1; + blockable_page_cache_readahead(mapping, filp, offset, + newsize, ra, 1); + goto out; } /* - * Is this request outside the current window? + * If we get here we are doing sequential IO and this was not the first + * occurence (ie we have an existing window) */ - if (offset < ra->start || offset >= (ra->start + ra->size)) { - /* - * A miss against the current window. Have we merely - * advanced into the ahead window? - */ - if (offset == ra->ahead_start) { - /* - * Yes, we have. The ahead window now becomes - * the current window. 
+
+	if (ra->ahead_start == 0) {	/* no ahead window yet */
+		ra->ahead_size = get_next_ra_size(ra->size, max, min,
+						&ra->flags);
+		ra->ahead_start = ra->start + ra->size;
+		block = ((offset + newsize - 1) >= ra->ahead_start);
+		if (!blockable_page_cache_readahead(mapping, filp,
+			ra->ahead_start, ra->ahead_size, ra, block)) {
+			/* A read failure in blocking mode implies pages are
+			 * all cached. So we can safely assume we have taken
+			 * care of all the pages requested in this call. A read
+			 * failure in non-blocking mode implies we are reading
+			 * more pages than requested in this call. So we safely
+			 * assume we have taken care of all the pages requested
+			 * in this call.
+			 *
+			 * Just reset the ahead window in case we failed due to
+			 * congestion. The ahead window will be closed anyway
+			 * in case we failed due to excessive page cache hits.
 			 */
-			ra->start = ra->ahead_start;
-			ra->size = ra->ahead_size;
-			ra->prev_page = ra->start;
 			ra->ahead_start = 0;
 			ra->ahead_size = 0;
-
-			/*
-			 * Control now returns, probably to sleep until I/O
-			 * completes against the first ahead page.
-			 * When the second page in the old ahead window is
-			 * requested, control will return here and more I/O
-			 * will be submitted to build the new ahead window.
-			 */
 			goto out;
 		}
-do_io:
-		/*
-		 * This is the "unusual" path. We come here during
-		 * startup or after an lseek. We invalidate the
-		 * ahead window and get some I/O underway for the new
-		 * current window.
-		 */
-		if (!first_access) {
-			/* Heuristic: there is a high probability
-			 * that around ra->average number of
-			 * pages shall be accessed in the next
-			 * current window.
-			 */
-			average = ra->average;
-			if (ra->currnt_wnd_hit > average)
-				average = (ra->currnt_wnd_hit + ra->average + 1) / 2;
-
-			ra->next_size = min(average, (unsigned long)max);
-		}
-		ra->start = offset;
-		ra->size = ra->next_size;
-		ra->ahead_start = 0;	/* Invalidate these */
-		ra->ahead_size = 0;
-		actual = do_page_cache_readahead(mapping, filp, offset,
-						ra->size);
-		if(!first_access) {
-			/*
-			 * do not adjust the readahead window size the first
-			 * time, the ahead window might get closed if all
-			 * the pages are already in the cache.
-			 */
-			check_ra_success(ra, ra->size, actual, orig_next_size);
-		}
-	} else {
-		/*
-		 * This read request is within the current window. It may be
-		 * time to submit I/O for the ahead window while the
-		 * application is about to step into the ahead window.
-		 */
-		if (ra->ahead_start == 0) {
-			/*
-			 * If the average io-size is more than maximum
-			 * readahead size of the file the io pattern is
-			 * sequential. Hence bring in the readahead window
-			 * immediately.
-			 * If the average io-size is less than maximum
-			 * readahead size of the file the io pattern is
-			 * random. Hence don't bother to readahead.
+	}
+	/*
+	 * Already have an ahead window, check if we crossed into it.
+	 * If so, shift windows and issue a new ahead window.
+	 * Only return the #pages that are in the current window, so that
+	 * we get called back on the first page of the ahead window which
+	 * will allow us to submit more I/O.
+	 */
+	if ((offset + newsize - 1) >= ra->ahead_start) {
+		ra->start = ra->ahead_start;
+		ra->size = ra->ahead_size;
+		ra->ahead_start = ra->ahead_start + ra->ahead_size;
+		ra->ahead_size = get_next_ra_size(ra->ahead_size,
+						max, min, &ra->flags);
+		block = ((offset + newsize - 1) >= ra->ahead_start);
+		if (!blockable_page_cache_readahead(mapping, filp,
+			ra->ahead_start, ra->ahead_size, ra, block)) {
+			/* A read failure in blocking mode implies pages are
+			 * all cached. So we can safely assume we have taken
+			 * care of all the pages requested in this call.
+			 * A read failure in non-blocking mode implies we are
+			 * reading more pages than requested in this call. So
+			 * we safely assume we have taken care of all the pages
+			 * requested in this call.
+			 *
+			 * Just reset the ahead window in case we failed due to
+			 * congestion. The ahead window will be closed anyway
+			 * in case we failed due to excessive page cache hits.
 			 */
-			average = ra->average;
-			if (ra->currnt_wnd_hit > average)
-				average = (ra->currnt_wnd_hit + ra->average + 1) / 2;
-
-			if (average > max) {
-				ra->ahead_start = ra->start + ra->size;
-				ra->ahead_size = ra->next_size;
-				actual = do_page_cache_readahead(mapping, filp,
-					ra->ahead_start, ra->ahead_size);
-				check_ra_success(ra, ra->ahead_size,
-					actual, orig_next_size);
-			}
+			ra->ahead_start = 0;
+			ra->ahead_size = 0;
 		}
 	}
 
 out:
-	return;
+	ra->prev_page = offset + newsize - 1;
+	return newsize;
 }
-
 
 /*
  * handle_ra_miss() is called when it is known that a page which should have
  * been present in the pagecache (we just did some readahead there) was in fact
  * not found. This will happen if it was evicted by the VM (readahead
- * thrashing) or if the readahead window is maximally shrunk.
+ * thrashing).
  *
- * If the window has been maximally shrunk (next_size == -1UL) then look to see
- * if we are getting misses against sequential file offsets. If so, and this
- * persists then resume readahead.
- *
- * Otherwise we're thrashing, so shrink the readahead window by three pages.
- * This is because it is grown by two pages on a readahead hit. Theory being
- * that the readahead window size will stabilise around the maximum level at
- * which there is no thrashing.
+ * Turn on the cache miss flag in the RA struct; this will cause the RA code
+ * to reduce the RA size on the next read.
  */
 void handle_ra_miss(struct address_space *mapping,
 		struct file_ra_state *ra, pgoff_t offset)
 {
-	if (ra->next_size == -1UL) {
-		const unsigned long max = get_max_readahead(ra);
-
-		if (offset != ra->prev_page + 1) {
-			ra->size = ra->size?ra->size-1:0; /* Not sequential */
-		} else {
-			ra->size++;		/* A sequential read */
-			if (ra->size >= max) {	/* Resume readahead */
-				ra->start = offset - max;
-				ra->next_size = max;
-				ra->size = max;
-				ra->ahead_start = 0;
-				ra->ahead_size = 0;
-				ra->average = max / 2;
-			}
-		}
-		ra->prev_page = offset;
-	} else {
-		const unsigned long min = get_min_readahead(ra);
-
-		ra->next_size -= 3;
-		if (ra->next_size < min)
-			ra->next_size = min;
-	}
+	ra->flags |= RA_FLAG_MISS;
+	ra->flags &= ~RA_FLAG_INCACHE;
 }
 
 /*
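For reference, a standalone userspace sketch (not part of the patch) of the current/ahead window pipeline that the reworked page_cache_readahead() maintains for a purely sequential single-page reader. The sizing mimics get_next_ra_size() without the miss handling, and the starting window sizes are illustrative:

/* window_sketch.c - current/ahead window pipeline (illustrative) */
#include <stdio.h>

static unsigned long next_ra_size(unsigned long cur, unsigned long max)
{
	unsigned long newsize = (cur < max / 16) ? 4 * cur : 2 * cur;

	return newsize < max ? newsize : max;
}

int main(void)
{
	unsigned long max = 32;
	unsigned long start = 0, size = 8;		/* current window */
	unsigned long ahead_start = 8, ahead_size = 16;	/* ahead window */
	unsigned long offset;

	for (offset = 0; offset < 128; offset++) {
		if (offset < ahead_start)
			continue;	/* still inside the current window */

		/*
		 * Crossed into the ahead window: it becomes the current
		 * window, and I/O is submitted for a fresh ahead window.
		 */
		start = ahead_start;
		size = ahead_size;
		ahead_start = start + size;
		ahead_size = next_ra_size(ahead_size, max);
		printf("page %3lu: current [%3lu..%3lu)  ahead I/O [%3lu..%3lu)\n",
		       offset, start, start + size,
		       ahead_start, ahead_start + ahead_size);
	}
	return 0;
}

Each time the reader crosses ahead_start, the ahead window becomes the current window and exactly one new ahead I/O is issued, so I/O for window N+1 proceeds while window N is being consumed.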