diff --git a/mm/readahead.c b/mm/readahead.c
index a5e6906a0..b7c3b746d 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -35,7 +35,7 @@
 void
 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
 {
 	ra->ra_pages = mapping->backing_dev_info->ra_pages;
-	ra->average = ra->ra_pages / 2;
+	ra->prev_page = -1;
 }
 
 /*
@@ -51,6 +51,56 @@ static inline unsigned long get_min_readahead(struct file_ra_state *ra)
 	return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 }
 
+static inline void ra_off(struct file_ra_state *ra)
+{
+	ra->start = 0;
+	ra->flags = 0;
+	ra->size = -1;
+	ra->ahead_start = 0;
+	ra->ahead_size = 0;
+	return;
+}
+
+/*
+ * Set the initial window size: round the request up to the next power of 2,
+ * then square it for small sizes, use max/4 for medium sizes and max for
+ * large ones. For a 128k (32 page) max readahead:
+ * a 1-8 page first read gets a 32k initial window, > 8 pages gets 128k.
+ */
+static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
+{
+	unsigned long newsize = roundup_pow_of_two(size);
+
+	if (newsize <= max / 64)
+		newsize = newsize * newsize;
+	else if (newsize <= max / 4)
+		newsize = max / 4;
+	else
+		newsize = max;
+	return newsize;
+}
+
+/*
+ * Set the new window size. This is called only when I/O is to be submitted,
+ * not for each call to readahead. If a cache miss occurred, reduce the next
+ * I/O size, else increase it depending on how close to max we are.
+ */
+static unsigned long get_next_ra_size(unsigned long cur, unsigned long max,
+				unsigned long min, unsigned long *flags)
+{
+	unsigned long newsize;
+
+	if (*flags & RA_FLAG_MISS) {
+		newsize = max((cur - 2), min);
+		*flags &= ~RA_FLAG_MISS;
+	} else if (cur < max / 16) {
+		newsize = 4 * cur;
+	} else {
+		newsize = 2 * cur;
+	}
+	return min(newsize, max);
+}
+
 #define list_to_page(head) (list_entry((head)->prev, struct page, lru))
 
 /**
@@ -65,7 +115,7 @@ static inline unsigned long get_min_readahead(struct file_ra_state *ra)
  * Hides the details of the LRU cache etc from the filesystems.
  */
 int read_cache_pages(struct address_space *mapping, struct list_head *pages,
-		int (*filler)(void *, struct page *), void *data)
+			int (*filler)(void *, struct page *), void *data)
 {
 	struct page *page;
 	struct pagevec lru_pvec;
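To make the sizing policy concrete: below is a standalone userspace sketch (not part of the patch) that mirrors get_init_ra_size() and the growth step of get_next_ra_size() for a 128k (32 page) max readahead with 4k pages. roundup_pow_of_two() is open-coded and the cache-miss/min handling is omitted.

/* sizing_sketch.c - approximate readahead window ramp-up (illustrative) */
#include <stdio.h>

static unsigned long pow2_roundup(unsigned long n)
{
	unsigned long p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

/* mirrors get_init_ra_size(): square small sizes, max/4 medium, max large */
static unsigned long init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = pow2_roundup(size);

	if (newsize <= max / 64)
		newsize = newsize * newsize;
	else if (newsize <= max / 4)
		newsize = max / 4;
	else
		newsize = max;
	return newsize;
}

/* mirrors get_next_ra_size() without the cache-miss branch */
static unsigned long next_ra_size(unsigned long cur, unsigned long max)
{
	unsigned long newsize = (cur < max / 16) ? 4 * cur : 2 * cur;

	return newsize < max ? newsize : max;
}

int main(void)
{
	unsigned long max = 32;				/* 128k of 4k pages */
	unsigned long size = init_ra_size(2, max);	/* 2-page first read */

	while (size < max) {
		printf("window = %lu pages\n", size);
		size = next_ra_size(size, max);
	}
	printf("window = %lu pages (capped at max)\n", size);
	return 0;
}

A 2-page first read starts with an 8-page (32k) window, then doubles through 16 pages to the 32-page cap, matching the "1-8 page read gets 32k" case in the comment above.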
@@ -151,19 +201,16 @@ out:
  * ahead_size:  Together, these form the "ahead window".
  * ra_pages:    The externally controlled max readahead for this fd.
  *
- * When readahead is in the "maximally shrunk" state (next_size == -1UL),
- * readahead is disabled. In this state, prev_page and size are used, inside
- * handle_ra_miss(), to detect the resumption of sequential I/O. Once there
- * has been a decent run of sequential I/O (defined by get_min_readahead),
- * readahead is reenabled.
+ * Readahead is disabled ("off state") when size == -1UL. In this state,
+ * prev_page is used to detect the resumption of sequential I/O.
  *
  * The readahead code manages two windows - the "current" and the "ahead"
  * windows. The intent is that while the application is walking the pages
  * in the current window, I/O is underway on the ahead window. When the
  * current window is fully traversed, it is replaced by the ahead window
  * and the ahead window is invalidated. When this copying happens, the
- * new current window's pages are probably still locked. When I/O has
- * completed, we submit a new batch of I/O, creating a new ahead window.
+ * new current window's pages are probably still locked. So
+ * we submit a new batch of I/O immediately, creating a new ahead window.
  *
  * So:
  *
@@ -175,34 +222,25 @@ out:
  *                        ahead window.
  *
  * A `readahead hit' occurs when a read request is made against a page which is
- * inside the current window. Hits are good, and the window size (next_size)
- * is grown aggressively when hits occur. Two pages are added to the next
- * window size on each hit, which will end up doubling the next window size by
- * the time I/O is submitted for it.
- *
- * If readahead hits are more sparse (say, the application is only reading
- * every second page) then the window will build more slowly.
- *
- * On a readahead miss (the application seeked away) the readahead window is
- * shrunk by 25%. We don't want to drop it too aggressively, because it is a
- * good assumption that an application which has built a good readahead window
- * will continue to perform linear reads. Either at the new file position, or
- * at the old one after another seek.
+ * the next sequential page. Ahead window calculations are done only when it
+ * is time to submit a new I/O. The code ramps the size up aggressively at
+ * first, but slows down as it approaches max_readahead.
  *
- * After enough misses, readahead is fully disabled. (next_size = -1UL).
+ * Any seek/random I/O will result in readahead being turned off. It will
+ * resume at the first sequential access.
  *
  * There is a special-case: if the first page which the application tries to
  * read happens to be the first page of the file, it is assumed that a linear
- * read is about to happen and the window is immediately set to half of the
- * device maximum.
+ * read is about to happen and the window is immediately set to the initial
+ * size based on the I/O request size and max_readahead.
  *
  * A page request at (start + size) is not a miss at all - it's just a part of
  * sequential file reading.
  *
- * This function is to be called for every page which is read, rather than when
- * it is time to perform readahead. This is so the readahead algorithm can
- * centrally work out the access patterns. This could be costly with many tiny
- * read()s, so we specifically optimise for that case with prev_page.
+ * This function is to be called for every read request, rather than when
+ * it is time to perform readahead. It is called only once for the entire I/O,
+ * regardless of size, unless readahead is unable to start enough I/O to
+ * satisfy the request (I/O request > max_readahead).
  */
 
 /*
@@ -211,9 +249,12 @@ out:
  * behaviour which would occur if page allocations are causing VM writeback.
  * We really don't want to intermingle reads and writes like that.
  *
- * Returns the number of pages which actually had IO started against them.
+ * Returns the number of pages requested, or the maximum amount of I/O allowed.
+ *
+ * do_page_cache_readahead() returns -1 if it encountered request queue
+ * congestion.
  */
-static inline int
+static int
 __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 			unsigned long offset, unsigned long nr_to_read)
 {
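The changed return-value contract matters to the caller (do_generic_mapping_read() in mm/filemap.c): page_cache_readahead() now reports how many pages it has accounted for, and the caller should not call back until the read moves past that point. A minimal userspace sketch of that contract, with a hypothetical stub_readahead() standing in for the real function and purely illustrative numbers:

/* caller_sketch.c - when the caller re-enters readahead (illustrative) */
#include <stdio.h>

/* hypothetical stand-in: mirrors "newsize = min(req_size, max)" */
static unsigned long stub_readahead(unsigned long req_size, unsigned long max)
{
	return req_size < max ? req_size : max;
}

int main(void)
{
	unsigned long max = 32;			/* max_readahead, in pages */
	unsigned long offset = 0, end = 100;	/* a 100-page read() */

	while (offset < end) {
		unsigned long done = stub_readahead(end - offset, max);

		printf("readahead accounts for pages %lu-%lu\n",
		       offset, offset + done - 1);
		offset += done;	/* call back only past the covered range */
	}
	return 0;
}

For a 100-page read with a 32-page max_readahead this prints four ranges (0-31, 32-63, 64-95, 96-99): the function is re-entered only because the request exceeds what one readahead pass may cover.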
@@ -299,6 +340,28 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 	return ret;
 }
 
+/*
+ * Check how effective readahead is being. If the amount of started I/O is
+ * less than expected then the file is partly or fully in pagecache and
+ * readahead isn't helping.
+ *
+ */
+int check_ra_success(struct file_ra_state *ra, unsigned long nr_to_read,
+			unsigned long actual)
+{
+	if (actual == 0) {
+		ra->cache_hit += nr_to_read;
+		if (ra->cache_hit >= VM_MAX_CACHE_HIT) {
+			ra_off(ra);
+			ra->flags |= RA_FLAG_INCACHE;
+			return 0;
+		}
+	} else {
+		ra->cache_hit = 0;
+	}
+	return 1;
+}
+
 /*
  * This version skips the IO if the queue is read-congested, and will tell the
  * block layer to abandon the readahead if request allocation would block.
@@ -309,256 +372,196 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 			unsigned long offset, unsigned long nr_to_read)
 {
-	if (!bdi_read_congested(mapping->backing_dev_info))
-		return __do_page_cache_readahead(mapping, filp,
-						offset, nr_to_read);
-	return 0;
+	if (bdi_read_congested(mapping->backing_dev_info))
+		return -1;
+
+	return __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
 }
 
 /*
- * Check how effective readahead is being. If the amount of started IO is
- * less than expected then the file is partly or fully in pagecache and
- * readahead isn't helping. Shrink the window.
- *
- * But don't shrink it too much - the application may read the same page
- * occasionally.
+ * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block'
+ * is set, wait until the read completes. Otherwise attempt to read without
+ * blocking.
+ * Returns 1 ('success') if the read was successful without switching off
+ * readahead mode. Otherwise returns failure (0).
  */
-static inline void
-check_ra_success(struct file_ra_state *ra, pgoff_t attempt,
-			pgoff_t actual, pgoff_t orig_next_size)
+static int
+blockable_page_cache_readahead(struct address_space *mapping, struct file *filp,
+			unsigned long offset, unsigned long nr_to_read,
+			struct file_ra_state *ra, int block)
 {
-	if (actual == 0) {
-		if (orig_next_size > 1) {
-			ra->next_size = orig_next_size - 1;
-			if (ra->ahead_size)
-				ra->ahead_size = ra->next_size;
-		} else {
-			ra->next_size = -1UL;
-			ra->size = 0;
-		}
+	int actual;
+
+	if (block) {
+		actual = __do_page_cache_readahead(mapping, filp,
+						offset, nr_to_read);
+	} else {
+		actual = do_page_cache_readahead(mapping, filp,
+						offset, nr_to_read);
+		if (actual == -1)
+			return 0;
 	}
+	return check_ra_success(ra, nr_to_read, actual);
 }
 
 /*
- * page_cache_readahead is the main function. If performs the adaptive
+ * page_cache_readahead is the main function. It performs the adaptive
  * readahead window size management and submits the readahead I/O.
  */
-void
+unsigned long
 page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
-		struct file *filp, unsigned long offset)
+			struct file *filp, unsigned long offset,
+			unsigned long req_size)
 {
-	unsigned max;
-	unsigned orig_next_size;
-	unsigned actual;
-	int first_access=0;
-	unsigned long average;
+	unsigned long max, min;
+	unsigned long newsize = req_size;
+	unsigned long block;
 
 	/*
 	 * Here we detect the case where the application is performing
 	 * sub-page sized reads. We avoid doing extra work and bogusly
 	 * perturbing the readahead window expansion logic.
-	 * If next_size is zero, this is the very first read for this
-	 * file handle, or the window is maximally shrunk.
+ * If size is zero, there is no read ahead window so we need one */ - if (offset == ra->prev_page) { - if (ra->next_size != 0) - goto out; - } - - if (ra->next_size == -1UL) - goto out; /* Maximally shrunk */ + if (offset == ra->prev_page && req_size == 1 && ra->size != 0) + goto out; max = get_max_readahead(ra); - if (max == 0) - goto out; /* No readahead */ + min = get_min_readahead(ra); + newsize = min(req_size, max); - orig_next_size = ra->next_size; - - if (ra->next_size == 0) { - /* - * Special case - first read. - * We'll assume it's a whole-file read, and - * grow the window fast. - */ - first_access=1; - ra->next_size = max / 2; + if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE)) { + newsize = 1; ra->prev_page = offset; - ra->currnt_wnd_hit++; - goto do_io; + goto out; /* No readahead or file already in cache */ } + /* + * Special case - first read. We'll assume it's a whole-file read if + * at start of file, and grow the window fast. Or detect first + * sequential access + */ + if ((ra->size == 0 && offset == 0) /* first io and start of file */ + || (ra->size == -1 && ra->prev_page == offset - 1)) { + /* First sequential */ + ra->prev_page = offset + newsize - 1; + ra->size = get_init_ra_size(newsize, max); + ra->start = offset; + if (!blockable_page_cache_readahead(mapping, filp, offset, + ra->size, ra, 1)) + goto out; - ra->prev_page = offset; - - if (offset >= ra->start && offset <= (ra->start + ra->size)) { - /* - * A readahead hit. Either inside the window, or one - * page beyond the end. Expand the next readahead size. - */ - ra->next_size += 2; - - if (ra->currnt_wnd_hit <= (max * 2)) - ra->currnt_wnd_hit++; - } else { /* - * A miss - lseek, pagefault, pread, etc. Shrink the readahead - * window. + * If the request size is larger than our max readahead, we + * at least want to be sure that we get 2 IOs in flight and + * we know that we will definitly need the new I/O. + * once we do this, subsequent calls should be able to overlap + * IOs,* thus preventing stalls. so issue the ahead window + * immediately. */ - ra->next_size -= 2; - - average = ra->average; - if (average < ra->currnt_wnd_hit) { - average++; + if (req_size >= max) { + ra->ahead_size = get_next_ra_size(ra->size, max, min, + &ra->flags); + ra->ahead_start = ra->start + ra->size; + blockable_page_cache_readahead(mapping, filp, + ra->ahead_start, ra->ahead_size, ra, 1); } - ra->average = (average + ra->currnt_wnd_hit) / 2; - ra->currnt_wnd_hit = 1; + goto out; } - if ((long)ra->next_size > (long)max) - ra->next_size = max; - if ((long)ra->next_size <= 0L) { - ra->next_size = -1UL; - ra->size = 0; - goto out; /* Readahead is off */ + /* + * Now handle the random case: + * partial page reads and first access were handled above, + * so this must be the next page otherwise it is random + */ + if ((offset != (ra->prev_page+1) || (ra->size == 0))) { + ra_off(ra); + ra->prev_page = offset + newsize - 1; + blockable_page_cache_readahead(mapping, filp, offset, + newsize, ra, 1); + goto out; } /* - * Is this request outside the current window? + * If we get here we are doing sequential IO and this was not the first + * occurence (ie we have an existing window) */ - if (offset < ra->start || offset >= (ra->start + ra->size)) { - /* - * A miss against the current window. Have we merely - * advanced into the ahead window? - */ - if (offset == ra->ahead_start) { - /* - * Yes, we have. The ahead window now becomes - * the current window. 
+
+	if (ra->ahead_start == 0) {	/* no ahead window yet */
+		ra->ahead_size = get_next_ra_size(ra->size, max, min,
+						&ra->flags);
+		ra->ahead_start = ra->start + ra->size;
+		block = ((offset + newsize - 1) >= ra->ahead_start);
+		if (!blockable_page_cache_readahead(mapping, filp,
+			ra->ahead_start, ra->ahead_size, ra, block)) {
+			/* A read failure in blocking mode implies pages are
+			 * all cached. So we can safely assume we have taken
+			 * care of all the pages requested in this call. A read
+			 * failure in non-blocking mode implies we are reading
+			 * more pages than requested in this call. So we safely
+			 * assume we have taken care of all the pages requested
+			 * in this call.
+			 *
+			 * Just reset the ahead window in case we failed due to
+			 * congestion. The ahead window will be closed anyway
+			 * in case we failed due to excessive page cache hits.
 			 */
-			ra->start = ra->ahead_start;
-			ra->size = ra->ahead_size;
-			ra->prev_page = ra->start;
 			ra->ahead_start = 0;
 			ra->ahead_size = 0;
-
-			/*
-			 * Control now returns, probably to sleep until I/O
-			 * completes against the first ahead page.
-			 * When the second page in the old ahead window is
-			 * requested, control will return here and more I/O
-			 * will be submitted to build the new ahead window.
-			 */
 			goto out;
 		}
-do_io:
-		/*
-		 * This is the "unusual" path. We come here during
-		 * startup or after an lseek. We invalidate the
-		 * ahead window and get some I/O underway for the new
-		 * current window.
-		 */
-		if (!first_access) {
-			/* Heuristic: there is a high probability
-			 * that around ra->average number of
-			 * pages shall be accessed in the next
-			 * current window.
-			 */
-			average = ra->average;
-			if (ra->currnt_wnd_hit > average)
-				average = (ra->currnt_wnd_hit + ra->average + 1) / 2;
-
-			ra->next_size = min(average, (unsigned long)max);
-		}
-		ra->start = offset;
-		ra->size = ra->next_size;
-		ra->ahead_start = 0;	/* Invalidate these */
-		ra->ahead_size = 0;
-		actual = do_page_cache_readahead(mapping, filp, offset,
-						ra->size);
-		if(!first_access) {
-			/*
-			 * do not adjust the readahead window size the first
-			 * time, the ahead window might get closed if all
-			 * the pages are already in the cache.
-			 */
-			check_ra_success(ra, ra->size, actual, orig_next_size);
-		}
-	} else {
-		/*
-		 * This read request is within the current window. It may be
-		 * time to submit I/O for the ahead window while the
-		 * application is about to step into the ahead window.
-		 */
-		if (ra->ahead_start == 0) {
-			/*
-			 * If the average io-size is more than maximum
-			 * readahead size of the file the io pattern is
-			 * sequential. Hence bring in the readahead window
-			 * immediately.
-			 * If the average io-size is less than maximum
-			 * readahead size of the file the io pattern is
-			 * random. Hence don't bother to readahead.
+	}
+	/*
+	 * Already have an ahead window, check if we crossed into it.
+	 * If so, shift windows and issue a new ahead window.
+	 * Only return the #pages that are in the current window, so that
+	 * we get called back on the first page of the ahead window which
+	 * will allow us to submit more I/O.
+	 */
+	if ((offset + newsize - 1) >= ra->ahead_start) {
+		ra->start = ra->ahead_start;
+		ra->size = ra->ahead_size;
+		ra->ahead_start = ra->ahead_start + ra->ahead_size;
+		ra->ahead_size = get_next_ra_size(ra->ahead_size,
+						max, min, &ra->flags);
+		block = ((offset + newsize - 1) >= ra->ahead_start);
+		if (!blockable_page_cache_readahead(mapping, filp,
+			ra->ahead_start, ra->ahead_size, ra, block)) {
+			/* A read failure in blocking mode implies pages are
+			 * all cached. So we can safely assume we have taken
+			 * care of all the pages requested in this call.
+			 * A read failure in non-blocking mode implies we are
+			 * reading more pages than requested in this call. So
+			 * we safely assume we have taken care of all the pages
+			 * requested in this call.
+			 *
+			 * Just reset the ahead window in case we failed due to
+			 * congestion. The ahead window will be closed anyway
+			 * in case we failed due to excessive page cache hits.
 			 */
-			average = ra->average;
-			if (ra->currnt_wnd_hit > average)
-				average = (ra->currnt_wnd_hit + ra->average + 1) / 2;
-
-			if (average > max) {
-				ra->ahead_start = ra->start + ra->size;
-				ra->ahead_size = ra->next_size;
-				actual = do_page_cache_readahead(mapping, filp,
-					ra->ahead_start, ra->ahead_size);
-				check_ra_success(ra, ra->ahead_size,
-					actual, orig_next_size);
-			}
+			ra->ahead_start = 0;
+			ra->ahead_size = 0;
 		}
 	}
 
 out:
-	return;
+	ra->prev_page = offset + newsize - 1;
+	return newsize;
 }
-
 
 /*
  * handle_ra_miss() is called when it is known that a page which should have
  * been present in the pagecache (we just did some readahead there) was in fact
  * not found. This will happen if it was evicted by the VM (readahead
- * thrashing) or if the readahead window is maximally shrunk.
+ * thrashing).
  *
- * If the window has been maximally shrunk (next_size == -1UL) then look to see
- * if we are getting misses against sequential file offsets. If so, and this
- * persists then resume readahead.
- *
- * Otherwise we're thrashing, so shrink the readahead window by three pages.
- * This is because it is grown by two pages on a readahead hit. Theory being
- * that the readahead window size will stabilise around the maximum level at
- * which there is no thrashing.
+ * Turn on the cache miss flag in the RA struct; this will cause the RA code
+ * to reduce the RA size on the next read.
  */
 void handle_ra_miss(struct address_space *mapping,
 		struct file_ra_state *ra, pgoff_t offset)
 {
-	if (ra->next_size == -1UL) {
-		const unsigned long max = get_max_readahead(ra);
-
-		if (offset != ra->prev_page + 1) {
-			ra->size = ra->size?ra->size-1:0; /* Not sequential */
-		} else {
-			ra->size++;		/* A sequential read */
-			if (ra->size >= max) {	/* Resume readahead */
-				ra->start = offset - max;
-				ra->next_size = max;
-				ra->size = max;
-				ra->ahead_start = 0;
-				ra->ahead_size = 0;
-				ra->average = max / 2;
-			}
-		}
-		ra->prev_page = offset;
-	} else {
-		const unsigned long min = get_min_readahead(ra);
-
-		ra->next_size -= 3;
-		if (ra->next_size < min)
-			ra->next_size = min;
-	}
+	ra->flags |= RA_FLAG_MISS;
+	ra->flags &= ~RA_FLAG_INCACHE;
 }
 
 /*
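For reference, a standalone userspace sketch (not part of the patch) of the current/ahead window pipeline that the reworked page_cache_readahead() maintains for a purely sequential single-page reader. The sizing mimics get_next_ra_size() without the miss handling, and the starting window sizes are illustrative:

/* window_sketch.c - current/ahead window pipeline (illustrative) */
#include <stdio.h>

static unsigned long next_ra_size(unsigned long cur, unsigned long max)
{
	unsigned long newsize = (cur < max / 16) ? 4 * cur : 2 * cur;

	return newsize < max ? newsize : max;
}

int main(void)
{
	unsigned long max = 32;
	unsigned long start = 0, size = 8;		/* current window */
	unsigned long ahead_start = 8, ahead_size = 16;	/* ahead window */
	unsigned long offset;

	for (offset = 0; offset < 128; offset++) {
		if (offset < ahead_start)
			continue;	/* still inside the current window */

		/*
		 * Crossed into the ahead window: it becomes the current
		 * window, and I/O is submitted for a fresh ahead window.
		 */
		start = ahead_start;
		size = ahead_size;
		ahead_start = start + size;
		ahead_size = next_ra_size(ahead_size, max);
		printf("page %3lu: current [%3lu..%3lu)  ahead I/O [%3lu..%3lu)\n",
		       offset, start, start + size,
		       ahead_start, ahead_start + ahead_size);
	}
	return 0;
}

Each time the reader crosses ahead_start, the ahead window becomes the current window and exactly one new ahead I/O is issued, so I/O for window N+1 proceeds while window N is being consumed.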