#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/writeback.h>
-#include <linux/suspend.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h> /* for try_to_release_page(),
buffer_heads_over_limit */
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
+#include <linux/cpuset.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
+#include <linux/delay.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include <linux/swapops.h>
-/* possible outcome of pageout() */
-typedef enum {
- /* failed to write page out, page is locked */
- PAGE_KEEP,
- /* move page to the active list, page is locked */
- PAGE_ACTIVATE,
- /* page has been sent to the disk successfully, page is unlocked */
- PAGE_SUCCESS,
- /* page is clean and locked */
- PAGE_CLEAN,
-} pageout_t;
+#include "internal.h"
struct scan_control {
- /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */
- unsigned long nr_to_scan;
-
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
- /* Incremented by the number of pages reclaimed */
- unsigned long nr_reclaimed;
-
unsigned long nr_mapped; /* From page_state */
- /* How many pages shrink_cache() should reclaim */
- int nr_to_reclaim;
-
- /* Ask shrink_caches, or shrink_zone to scan at this priority */
- unsigned int priority;
-
/* This context's GFP mask */
- unsigned int gfp_mask;
+ gfp_t gfp_mask;
int may_writepage;
+
+ /* Can pages be swapped as part of reclaim? */
+ int may_swap;
+
+ /* This context's SWAP_CLUSTER_MAX. If freeing memory for
+ * suspend, we effectively ignore SWAP_CLUSTER_MAX.
+ * In this context, it doesn't matter that we scan the
+ * whole list at once. */
+ int swap_cluster_max;
};
/*
shrinker->seeks = seeks;
shrinker->nr = 0;
down_write(&shrinker_rwsem);
- list_add(&shrinker->list, &shrinker_list);
+ list_add_tail(&shrinker->list, &shrinker_list);
up_write(&shrinker_rwsem);
}
return shrinker;
* `lru_pages' represents the number of on-LRU pages in all the zones which
* are eligible for the caller's allocation attempt. It is used for balancing
* slab reclaim versus page reclaim.
+ *
+ * Returns the number of slab objects which we shrunk.
*/
-static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
+unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
unsigned long lru_pages)
{
struct shrinker *shrinker;
+ unsigned long ret = 0;
if (scanned == 0)
scanned = SWAP_CLUSTER_MAX;
if (!down_read_trylock(&shrinker_rwsem))
- return 0;
+ return 1; /* Assume we'll be able to shrink next time */
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
unsigned long total_scan;
+ unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);
delta = (4 * scanned) / shrinker->seeks;
- delta *= (*shrinker->shrinker)(0, gfp_mask);
+ delta *= max_pass;
do_div(delta, lru_pages + 1);
shrinker->nr += delta;
- if (shrinker->nr < 0)
- shrinker->nr = LONG_MAX; /* It wrapped! */
+ if (shrinker->nr < 0) {
+ printk(KERN_ERR "%s: nr=%ld\n",
+ __FUNCTION__, shrinker->nr);
+ shrinker->nr = max_pass;
+ }
+
+ /*
+ * Avoid the risk of looping forever due to an overly large nr
+ * value: never try to free more than twice the estimated number
+ * of freeable entries.
+ */
+ if (shrinker->nr > max_pass * 2)
+ shrinker->nr = max_pass * 2;
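+
+ /*
+ * Worked example (illustrative numbers only, assuming the common
+ * DEFAULT_SEEKS value of 2): with scanned = 1024 LRU pages,
+ * lru_pages = 100000, shrinker->seeks = 2 and max_pass = 50000
+ * freeable objects,
+ *
+ * delta = (4 * 1024 / 2) * 50000 / 100001 ~= 1024
+ *
+ * i.e. about 2% of the cache gets scanned while about 1% of the LRU
+ * was scanned, so a DEFAULT_SEEKS cache is shrunk at roughly twice
+ * the rate the page LRU was scanned.
+ */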
total_scan = shrinker->nr;
shrinker->nr = 0;
while (total_scan >= SHRINK_BATCH) {
long this_scan = SHRINK_BATCH;
int shrink_ret;
+ int nr_before;
+ nr_before = (*shrinker->shrinker)(0, gfp_mask);
shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
if (shrink_ret == -1)
break;
+ if (shrink_ret < nr_before)
+ ret += nr_before - shrink_ret;
mod_page_state(slabs_scanned, this_scan);
total_scan -= this_scan;
shrinker->nr += total_scan;
}
up_read(&shrinker_rwsem);
- return 0;
+ return ret;
}
/* Called without lock on whether page is mapped, so answer is unstable */
static int may_write_to_queue(struct backing_dev_info *bdi)
{
- if (current_is_kswapd())
- return 1;
- if (current_is_pdflush()) /* This is unlikely, but why not... */
+ if (current->flags & PF_SWAPWRITE)
return 1;
if (!bdi_write_congested(bdi))
return 1;
}
/*
- * pageout is called by shrink_list() for each dirty page. Calls ->writepage().
+ * pageout is called by shrink_page_list() for each dirty page.
+ * Calls ->writepage().
*/
-static pageout_t pageout(struct page *page, struct address_space *mapping)
+pageout_t pageout(struct page *page, struct address_space *mapping)
{
/*
* If the page is dirty, only perform writeback if that write
*/
if (!is_page_cache_freeable(page))
return PAGE_KEEP;
- if (!mapping)
+ if (!mapping) {
+ /*
+ * Some data-journaling filesystems can leave orphaned pages
+ * with page->mapping == NULL that are dirty but have clean buffers.
+ */
+ if (PagePrivate(page)) {
+ if (try_to_free_buffers(page)) {
+ ClearPageDirty(page);
+ printk("%s: orphaned page\n", __FUNCTION__);
+ return PAGE_CLEAN;
+ }
+ }
return PAGE_KEEP;
+ }
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
if (!may_write_to_queue(mapping->backing_dev_info))
res = mapping->a_ops->writepage(page, &wbc);
if (res < 0)
handle_write_error(mapping, page, res);
- if (res == WRITEPAGE_ACTIVATE) {
+ if (res == AOP_WRITEPAGE_ACTIVATE) {
ClearPageReclaim(page);
return PAGE_ACTIVATE;
}
return PAGE_CLEAN;
}
+int remove_mapping(struct address_space *mapping, struct page *page)
+{
+ if (!mapping)
+ return 0; /* truncate got there first */
+
+ write_lock_irq(&mapping->tree_lock);
+
+ /*
+ * The non-racy check for busy page. It is critical to check
+ * PageDirty _after_ making sure that the page is freeable and
+ * not in use by anybody. (pagecache + us == 2)
+ */
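+ /*
+ * Concretely: the caller owns one reference from isolating the page
+ * off the LRU and the page cache (or swap cache) owns the other, so
+ * a count of exactly 2 means no one else can be using the page.
+ */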
+ if (unlikely(page_count(page) != 2))
+ goto cannot_free;
+ smp_rmb();
+ if (unlikely(PageDirty(page)))
+ goto cannot_free;
+
+ if (PageSwapCache(page)) {
+ swp_entry_t swap = { .val = page_private(page) };
+ __delete_from_swap_cache(page);
+ write_unlock_irq(&mapping->tree_lock);
+ swap_free(swap);
+ __put_page(page); /* The pagecache ref */
+ return 1;
+ }
+
+ __remove_from_page_cache(page);
+ write_unlock_irq(&mapping->tree_lock);
+ __put_page(page);
+ return 1;
+
+cannot_free:
+ write_unlock_irq(&mapping->tree_lock);
+ return 0;
+}
+
/*
- * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
+ * shrink_page_list() returns the number of reclaimed pages
*/
-static int shrink_list(struct list_head *page_list, struct scan_control *sc)
+static unsigned long shrink_page_list(struct list_head *page_list,
+ struct scan_control *sc)
{
LIST_HEAD(ret_pages);
struct pagevec freed_pvec;
int pgactivate = 0;
- int reclaimed = 0;
+ unsigned long nr_reclaimed = 0;
cond_resched();
int may_enter_fs;
int referenced;
+ cond_resched();
+
page = lru_to_page(page_list);
list_del(&page->lru);
BUG_ON(PageActive(page));
- if (PageWriteback(page))
+ sc->nr_scanned++;
+
+ if (!sc->may_swap && page_mapped(page))
goto keep_locked;
- sc->nr_scanned++;
/* Double the slab pressure for mapped and swapcache pages */
if (page_mapped(page) || PageSwapCache(page))
sc->nr_scanned++;
+ if (PageWriteback(page))
+ goto keep_locked;
+
referenced = page_referenced(page, 1);
/* In active use or really unfreeable? Activate it. */
if (referenced && page_mapping_inuse(page))
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
*/
- if (PageAnon(page) && !PageSwapCache(page)) {
- if (!add_to_swap(page))
+ if (PageAnon(page) && !PageSwapCache(page))
+ if (!add_to_swap(page, GFP_ATOMIC))
goto activate_locked;
- }
#endif /* CONFIG_SWAP */
mapping = page_mapping(page);
* processes. Try to unmap it here.
*/
if (page_mapped(page) && mapping) {
- switch (try_to_unmap(page)) {
+ switch (try_to_unmap(page, 0)) {
case SWAP_FAIL:
goto activate_locked;
case SWAP_AGAIN:
goto keep_locked;
if (!may_enter_fs)
goto keep_locked;
- if (laptop_mode && !sc->may_writepage)
+ if (!sc->may_writepage)
goto keep_locked;
/* Page is dirty, try to write it out here */
goto free_it;
}
- if (!mapping)
- goto keep_locked; /* truncate got there first */
-
- spin_lock_irq(&mapping->tree_lock);
-
- /*
- * The non-racy check for busy page. It is critical to check
- * PageDirty _after_ making sure that the page is freeable and
- * not in use by anybody. (pagecache + us == 2)
- */
- if (page_count(page) != 2 || PageDirty(page)) {
- spin_unlock_irq(&mapping->tree_lock);
+ if (!remove_mapping(mapping, page))
goto keep_locked;
- }
-
-#ifdef CONFIG_SWAP
- if (PageSwapCache(page)) {
- swp_entry_t swap = { .val = page->private };
- __delete_from_swap_cache(page);
- spin_unlock_irq(&mapping->tree_lock);
- swap_free(swap);
- __put_page(page); /* The pagecache ref */
- goto free_it;
- }
-#endif /* CONFIG_SWAP */
-
- __remove_from_page_cache(page);
- spin_unlock_irq(&mapping->tree_lock);
- __put_page(page);
free_it:
unlock_page(page);
- reclaimed++;
+ nr_reclaimed++;
if (!pagevec_add(&freed_pvec, page))
__pagevec_release_nonlru(&freed_pvec);
continue;
if (pagevec_count(&freed_pvec))
__pagevec_release_nonlru(&freed_pvec);
mod_page_state(pgactivate, pgactivate);
- sc->nr_reclaimed += reclaimed;
- return reclaimed;
+ return nr_reclaimed;
}
/*
- * zone->lru_lock is heavily contented. We relieve it by quickly privatising
- * a batch of pages and working on them outside the lock. Any pages which were
- * not freed will be added back to the LRU.
+ * zone->lru_lock is heavily contended. Some of the functions that
+ * shrink the lists perform better by taking out a batch of pages
+ * and working on them outside the LRU lock.
*
- * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
+ * For pagecache intensive workloads, this function is the hottest
+ * spot in the kernel (apart from copy_*_user functions).
*
- * For pagecache intensive workloads, the first loop here is the hottest spot
- * in the kernel (apart from the copy_*_user functions).
+ * Appropriate locks must be held before calling this function.
+ *
+ * @nr_to_scan: The number of pages to look through on the list.
+ * @src: The LRU list to pull pages off.
+ * @dst: The temp list to put pages on to.
+ * @scanned: The number of pages that were scanned.
+ *
+ * returns how many pages were moved onto *@dst.
*/
-static void shrink_cache(struct zone *zone, struct scan_control *sc)
+static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
+ struct list_head *src, struct list_head *dst,
+ unsigned long *scanned)
+{
+ unsigned long nr_taken = 0;
+ struct page *page;
+ unsigned long scan;
+
+ for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
+ struct list_head *target;
+ page = lru_to_page(src);
+ prefetchw_prev_lru_page(page, src, flags);
+
+ BUG_ON(!PageLRU(page));
+
+ list_del(&page->lru);
+ target = src;
+ if (likely(get_page_unless_zero(page))) {
+ /*
+ * Be careful not to clear PageLRU until after we're
+ * sure the page is not being freed elsewhere -- the
+ * page release code relies on it.
+ */
+ ClearPageLRU(page);
+ target = dst;
+ nr_taken++;
+ } /* else it is being freed elsewhere */
+
+ list_add(&page->lru, target);
+ }
+
+ *scanned = scan;
+ return nr_taken;
+}
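+
+/*
+ * Typical use, as in shrink_inactive_list() below: pull up to
+ * sc->swap_cluster_max pages off zone->inactive_list while holding
+ * zone->lru_lock, then drop the lock and run shrink_page_list() on the
+ * private list, putting any survivors back afterwards.
+ */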
+
+/*
+ * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
+ * of reclaimed pages
+ */
+static unsigned long shrink_inactive_list(unsigned long max_scan,
+ struct zone *zone, struct scan_control *sc)
{
LIST_HEAD(page_list);
struct pagevec pvec;
- int max_scan = sc->nr_to_scan;
+ unsigned long nr_scanned = 0;
+ unsigned long nr_reclaimed = 0;
pagevec_init(&pvec, 1);
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
- while (max_scan > 0) {
+ do {
struct page *page;
- int nr_taken = 0;
- int nr_scan = 0;
- int nr_freed;
-
- while (nr_scan++ < SWAP_CLUSTER_MAX &&
- !list_empty(&zone->inactive_list)) {
- page = lru_to_page(&zone->inactive_list);
+ unsigned long nr_taken;
+ unsigned long nr_scan;
+ unsigned long nr_freed;
- prefetchw_prev_lru_page(page,
- &zone->inactive_list, flags);
-
- if (!TestClearPageLRU(page))
- BUG();
- list_del(&page->lru);
- if (get_page_testone(page)) {
- /*
- * It is being freed elsewhere
- */
- __put_page(page);
- SetPageLRU(page);
- list_add(&page->lru, &zone->inactive_list);
- continue;
- }
- list_add(&page->lru, &page_list);
- nr_taken++;
- }
+ nr_taken = isolate_lru_pages(sc->swap_cluster_max,
+ &zone->inactive_list,
+ &page_list, &nr_scan);
zone->nr_inactive -= nr_taken;
- zone->pages_scanned += nr_taken;
+ zone->pages_scanned += nr_scan;
spin_unlock_irq(&zone->lru_lock);
+ nr_scanned += nr_scan;
+ nr_freed = shrink_page_list(&page_list, sc);
+ nr_reclaimed += nr_freed;
+ local_irq_disable();
+ if (current_is_kswapd()) {
+ __mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
+ __mod_page_state(kswapd_steal, nr_freed);
+ } else
+ __mod_page_state_zone(zone, pgscan_direct, nr_scan);
+ __mod_page_state_zone(zone, pgsteal, nr_freed);
+
if (nr_taken == 0)
goto done;
- max_scan -= nr_scan;
- if (current_is_kswapd())
- mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
- else
- mod_page_state_zone(zone, pgscan_direct, nr_scan);
- nr_freed = shrink_list(&page_list, sc);
- if (current_is_kswapd())
- mod_page_state(kswapd_steal, nr_freed);
- mod_page_state_zone(zone, pgsteal, nr_freed);
- sc->nr_to_reclaim -= nr_freed;
-
- spin_lock_irq(&zone->lru_lock);
+ spin_lock(&zone->lru_lock);
/*
* Put back any unfreeable pages.
*/
while (!list_empty(&page_list)) {
page = lru_to_page(&page_list);
- if (TestSetPageLRU(page))
- BUG();
+ BUG_ON(PageLRU(page));
+ SetPageLRU(page);
list_del(&page->lru);
if (PageActive(page))
add_page_to_active_list(zone, page);
spin_lock_irq(&zone->lru_lock);
}
}
- }
- spin_unlock_irq(&zone->lru_lock);
+ } while (nr_scanned < max_scan);
+ spin_unlock(&zone->lru_lock);
done:
+ local_irq_enable();
pagevec_release(&pvec);
+ return nr_reclaimed;
}
/*
* The downside is that we have to touch page->_count against each page.
* But we had to alter page->flags anyway.
*/
-static void
-refill_inactive_zone(struct zone *zone, struct scan_control *sc)
+static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
+ struct scan_control *sc)
{
- int pgmoved;
+ unsigned long pgmoved;
int pgdeactivate = 0;
- int pgscanned = 0;
- int nr_pages = sc->nr_to_scan;
+ unsigned long pgscanned;
LIST_HEAD(l_hold); /* The pages which were snipped off */
LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
LIST_HEAD(l_active); /* Pages to go onto the active_list */
struct page *page;
struct pagevec pvec;
int reclaim_mapped = 0;
- long mapped_ratio;
- long distress;
- long swap_tendency;
- lru_add_drain();
- pgmoved = 0;
- spin_lock_irq(&zone->lru_lock);
- while (pgscanned < nr_pages && !list_empty(&zone->active_list)) {
- page = lru_to_page(&zone->active_list);
- prefetchw_prev_lru_page(page, &zone->active_list, flags);
- if (!TestClearPageLRU(page))
- BUG();
- list_del(&page->lru);
- if (get_page_testone(page)) {
- /*
- * It was already free! release_pages() or put_page()
- * are about to remove it from the LRU and free it. So
- * put the refcount back and put the page back on the
- * LRU
- */
- __put_page(page);
- SetPageLRU(page);
- list_add(&page->lru, &zone->active_list);
- } else {
- list_add(&page->lru, &l_hold);
- pgmoved++;
- }
- pgscanned++;
- }
- zone->nr_active -= pgmoved;
- spin_unlock_irq(&zone->lru_lock);
+ if (sc->may_swap) {
+ long mapped_ratio;
+ long distress;
+ long swap_tendency;
- /*
- * `distress' is a measure of how much trouble we're having reclaiming
- * pages. 0 -> no problems. 100 -> great trouble.
- */
- distress = 100 >> zone->prev_priority;
+ /*
+ * `distress' is a measure of how much trouble we're having
+ * reclaiming pages. 0 -> no problems. 100 -> great trouble.
+ */
+ distress = 100 >> zone->prev_priority;
- /*
- * The point of this algorithm is to decide when to start reclaiming
- * mapped memory instead of just pagecache. Work out how much memory
- * is mapped.
- */
- mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+ /*
+ * The point of this algorithm is to decide when to start
+ * reclaiming mapped memory instead of just pagecache. Work out
+ * how much memory is mapped.
+ */
+ mapped_ratio = (sc->nr_mapped * 100) / total_memory;
- /*
- * Now decide how much we really want to unmap some pages. The mapped
- * ratio is downgraded - just because there's a lot of mapped memory
- * doesn't necessarily mean that page reclaim isn't succeeding.
- *
- * The distress ratio is important - we don't want to start going oom.
- *
- * A 100% value of vm_swappiness overrides this algorithm altogether.
- */
- swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+ /*
+ * Now decide how much we really want to unmap some pages. The
+ * mapped ratio is downgraded - just because there's a lot of
+ * mapped memory doesn't necessarily mean that page reclaim
+ * isn't succeeding.
+ *
+ * The distress ratio is important - we don't want to start
+ * going oom.
+ *
+ * A 100% value of vm_swappiness overrides this algorithm
+ * altogether.
+ */
+ swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
- /*
- * Now use this metric to decide whether to start moving mapped memory
- * onto the inactive list.
- */
- if (swap_tendency >= 100)
- reclaim_mapped = 1;
+ /*
+ * Now use this metric to decide whether to start moving mapped
+ * memory onto the inactive list.
+ */
+ if (swap_tendency >= 100)
+ reclaim_mapped = 1;
+ }
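+
+ /*
+ * Worked example (with the usual defaults of DEF_PRIORITY == 12 and
+ * vm_swappiness == 60): on the first pass prev_priority is 12, so
+ * distress = 100 >> 12 = 0; with 40% of memory mapped, mapped_ratio/2
+ * contributes 20, giving swap_tendency = 0 + 20 + 60 = 80 < 100 and
+ * mapped pages are left alone. Once reclaim struggles and
+ * prev_priority drops towards 0, distress approaches 100 and mapped
+ * memory is reclaimed regardless of the other terms.
+ */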
+
+ lru_add_drain();
+ spin_lock_irq(&zone->lru_lock);
+ pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
+ &l_hold, &pgscanned);
+ zone->pages_scanned += pgscanned;
+ zone->nr_active -= pgmoved;
+ spin_unlock_irq(&zone->lru_lock);
while (!list_empty(&l_hold)) {
+ cond_resched();
page = lru_to_page(&l_hold);
list_del(&page->lru);
if (page_mapped(page)) {
while (!list_empty(&l_inactive)) {
page = lru_to_page(&l_inactive);
prefetchw_prev_lru_page(page, &l_inactive, flags);
- if (TestSetPageLRU(page))
- BUG();
- if (!TestClearPageActive(page))
- BUG();
+ BUG_ON(PageLRU(page));
+ SetPageLRU(page);
+ BUG_ON(!PageActive(page));
+ ClearPageActive(page);
+
list_move(&page->lru, &zone->inactive_list);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
while (!list_empty(&l_active)) {
page = lru_to_page(&l_active);
prefetchw_prev_lru_page(page, &l_active, flags);
- if (TestSetPageLRU(page))
- BUG();
+ BUG_ON(PageLRU(page));
+ SetPageLRU(page);
BUG_ON(!PageActive(page));
list_move(&page->lru, &zone->active_list);
pgmoved++;
}
}
zone->nr_active += pgmoved;
- spin_unlock_irq(&zone->lru_lock);
- pagevec_release(&pvec);
+ spin_unlock(&zone->lru_lock);
+
+ __mod_page_state_zone(zone, pgrefill, pgscanned);
+ __mod_page_state(pgdeactivate, pgdeactivate);
+ local_irq_enable();
- mod_page_state_zone(zone, pgrefill, pgscanned);
- mod_page_state(pgdeactivate, pgdeactivate);
+ pagevec_release(&pvec);
}
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
-static void
-shrink_zone(struct zone *zone, struct scan_control *sc)
+static unsigned long shrink_zone(int priority, struct zone *zone,
+ struct scan_control *sc)
{
unsigned long nr_active;
unsigned long nr_inactive;
+ unsigned long nr_to_scan;
+ unsigned long nr_reclaimed = 0;
+
+ atomic_inc(&zone->reclaim_in_progress);
/*
* Add one to `nr_to_scan' just to make sure that the kernel will
* slowly sift through the active list.
*/
- zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1;
+ zone->nr_scan_active += (zone->nr_active >> priority) + 1;
nr_active = zone->nr_scan_active;
- if (nr_active >= SWAP_CLUSTER_MAX)
+ if (nr_active >= sc->swap_cluster_max)
zone->nr_scan_active = 0;
else
nr_active = 0;
- zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1;
+ zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
nr_inactive = zone->nr_scan_inactive;
- if (nr_inactive >= SWAP_CLUSTER_MAX)
+ if (nr_inactive >= sc->swap_cluster_max)
zone->nr_scan_inactive = 0;
else
nr_inactive = 0;
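/*
* Worked example (illustrative numbers): at DEF_PRIORITY (12), a zone
* with 1,000,000 inactive pages has (1000000 >> 12) + 1 = 245 pages
* added to nr_scan_inactive per call; nothing is scanned until the
* accumulated count reaches sc->swap_cluster_max (normally
* SWAP_CLUSTER_MAX, i.e. 32), so small zones are still sifted through,
* just more slowly.
*/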
- sc->nr_to_reclaim = SWAP_CLUSTER_MAX;
-
while (nr_active || nr_inactive) {
if (nr_active) {
- sc->nr_to_scan = min(nr_active,
- (unsigned long)SWAP_CLUSTER_MAX);
- nr_active -= sc->nr_to_scan;
- refill_inactive_zone(zone, sc);
+ nr_to_scan = min(nr_active,
+ (unsigned long)sc->swap_cluster_max);
+ nr_active -= nr_to_scan;
+ shrink_active_list(nr_to_scan, zone, sc);
}
if (nr_inactive) {
- sc->nr_to_scan = min(nr_inactive,
- (unsigned long)SWAP_CLUSTER_MAX);
- nr_inactive -= sc->nr_to_scan;
- shrink_cache(zone, sc);
- if (sc->nr_to_reclaim <= 0)
- break;
+ nr_to_scan = min(nr_inactive,
+ (unsigned long)sc->swap_cluster_max);
+ nr_inactive -= nr_to_scan;
+ nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
+ sc);
}
}
+
+ throttle_vm_writeout();
+
+ atomic_dec(&zone->reclaim_in_progress);
+ return nr_reclaimed;
}
/*
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/
-static void
-shrink_caches(struct zone **zones, struct scan_control *sc)
+static unsigned long shrink_zones(int priority, struct zone **zones,
+ struct scan_control *sc)
{
+ unsigned long nr_reclaimed = 0;
int i;
for (i = 0; zones[i] != NULL; i++) {
struct zone *zone = zones[i];
- if (zone->present_pages == 0)
+ if (!populated_zone(zone))
continue;
- zone->temp_priority = sc->priority;
- if (zone->prev_priority > sc->priority)
- zone->prev_priority = sc->priority;
+ if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+ continue;
+
+ zone->temp_priority = priority;
+ if (zone->prev_priority > priority)
+ zone->prev_priority = priority;
- if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
+ if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
- shrink_zone(zone, sc);
+ nr_reclaimed += shrink_zone(priority, zone, sc);
}
+ return nr_reclaimed;
}
/*
* holds filesystem locks which prevent writeout this might not work, and the
* allocation attempt will fail.
*/
-int try_to_free_pages(struct zone **zones,
- unsigned int gfp_mask, unsigned int order)
+unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
{
int priority;
int ret = 0;
- int total_scanned = 0, total_reclaimed = 0;
+ unsigned long total_scanned = 0;
+ unsigned long nr_reclaimed = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
- struct scan_control sc;
unsigned long lru_pages = 0;
int i;
-
- sc.gfp_mask = gfp_mask;
- sc.may_writepage = 0;
+ struct scan_control sc = {
+ .gfp_mask = gfp_mask,
+ .may_writepage = !laptop_mode,
+ .swap_cluster_max = SWAP_CLUSTER_MAX,
+ .may_swap = 1,
+ };
inc_page_state(allocstall);
for (i = 0; zones[i] != NULL; i++) {
struct zone *zone = zones[i];
+ if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+ continue;
+
zone->temp_priority = DEF_PRIORITY;
lru_pages += zone->nr_active + zone->nr_inactive;
}
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
sc.nr_mapped = read_page_state(nr_mapped);
sc.nr_scanned = 0;
- sc.nr_reclaimed = 0;
- sc.priority = priority;
- shrink_caches(zones, &sc);
+ if (!priority)
+ disable_swap_token();
+ nr_reclaimed += shrink_zones(priority, zones, &sc);
shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
if (reclaim_state) {
- sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+ nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
}
- if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) {
+ total_scanned += sc.nr_scanned;
+ if (nr_reclaimed >= sc.swap_cluster_max) {
ret = 1;
goto out;
}
- total_scanned += sc.nr_scanned;
- total_reclaimed += sc.nr_reclaimed;
/*
* Try to write back as many pages as we just scanned. This
* that's undesirable in laptop mode, where we *want* lumpy
* writeout. So in laptop mode, write out the whole world.
*/
- if (total_scanned > SWAP_CLUSTER_MAX + SWAP_CLUSTER_MAX/2) {
- wakeup_bdflush(laptop_mode ? 0 : total_scanned);
+ if (total_scanned > sc.swap_cluster_max +
+ sc.swap_cluster_max / 2) {
+ wakeup_pdflush(laptop_mode ? 0 : total_scanned);
sc.may_writepage = 1;
}
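/*
* Illustration: with the default swap_cluster_max of SWAP_CLUSTER_MAX
* (32), pdflush is only kicked once more than 48 pages have been
* scanned in this invocation; in laptop_mode the 0 argument asks
* pdflush to write back everything, matching the "whole world"
* behaviour described above.
*/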
if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
blk_congestion_wait(WRITE, HZ/10);
}
- if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
- out_of_memory(gfp_mask);
out:
- for (i = 0; zones[i] != 0; i++)
- zones[i]->prev_priority = zones[i]->temp_priority;
+ for (i = 0; zones[i] != 0; i++) {
+ struct zone *zone = zones[i];
+
+ if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+ continue;
+
+ zone->prev_priority = zone->temp_priority;
+ }
return ret;
}
* the page allocator fallback scheme to ensure that aging of pages is balanced
* across the zones.
*/
-static int balance_pgdat(pg_data_t *pgdat, int nr_pages)
+static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
+ int order)
{
- int to_free = nr_pages;
+ unsigned long to_free = nr_pages;
int all_zones_ok;
int priority;
int i;
- int total_scanned, total_reclaimed;
+ unsigned long total_scanned;
+ unsigned long nr_reclaimed;
struct reclaim_state *reclaim_state = current->reclaim_state;
- struct scan_control sc;
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .may_swap = 1,
+ .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX,
+ };
loop_again:
total_scanned = 0;
- total_reclaimed = 0;
- sc.gfp_mask = GFP_KERNEL;
- sc.may_writepage = 0;
+ nr_reclaimed = 0;
+ sc.may_writepage = !laptop_mode;
sc.nr_mapped = read_page_state(nr_mapped);
inc_page_state(pageoutrun);
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long lru_pages = 0;
+ /* The swap token gets in the way of swapout... */
+ if (!priority)
+ disable_swap_token();
+
all_zones_ok = 1;
if (nr_pages == 0) {
for (i = pgdat->nr_zones - 1; i >= 0; i--) {
struct zone *zone = pgdat->node_zones + i;
- if (zone->present_pages == 0)
+ if (!populated_zone(zone))
continue;
if (zone->all_unreclaimable &&
priority != DEF_PRIORITY)
continue;
- if (zone->free_pages <= zone->pages_high) {
+ if (!zone_watermark_ok(zone, order,
+ zone->pages_high, 0, 0)) {
end_zone = i;
goto scan;
}
*/
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
+ int nr_slab;
- if (zone->present_pages == 0)
+ if (!populated_zone(zone))
continue;
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue;
if (nr_pages == 0) { /* Not software suspend */
- if (zone->free_pages <= zone->pages_high)
+ if (!zone_watermark_ok(zone, order,
+ zone->pages_high, end_zone, 0))
all_zones_ok = 0;
}
zone->temp_priority = priority;
if (zone->prev_priority > priority)
zone->prev_priority = priority;
sc.nr_scanned = 0;
- sc.nr_reclaimed = 0;
- sc.priority = priority;
- shrink_zone(zone, &sc);
+ nr_reclaimed += shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
- shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages);
- sc.nr_reclaimed += reclaim_state->reclaimed_slab;
- total_reclaimed += sc.nr_reclaimed;
+ nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
+ lru_pages);
+ nr_reclaimed += reclaim_state->reclaimed_slab;
+ total_scanned += sc.nr_scanned;
if (zone->all_unreclaimable)
continue;
- if (zone->pages_scanned >= (zone->nr_active +
- zone->nr_inactive) * 4)
+ if (nr_slab == 0 && zone->pages_scanned >=
+ (zone->nr_active + zone->nr_inactive) * 4)
zone->all_unreclaimable = 1;
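/*
* In other words, a zone is only marked all_unreclaimable once its
* pages_scanned counter has reached four times its total LRU size and
* the last shrink_slab() call made no progress (nr_slab == 0).
*/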
/*
* If we've done a decent amount of scanning and
* even in laptop mode
*/
if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
- total_scanned > total_reclaimed+total_reclaimed/2)
+ total_scanned > nr_reclaimed + nr_reclaimed / 2)
sc.may_writepage = 1;
}
- if (nr_pages && to_free > total_reclaimed)
+ if (nr_pages && to_free > nr_reclaimed)
continue; /* swsusp: need to do more work */
if (all_zones_ok)
break; /* kswapd: all done */
* matches the direct reclaim path behaviour in terms of impact
* on zone->*_priority.
*/
- if (total_reclaimed >= SWAP_CLUSTER_MAX)
+ if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages)
break;
}
out:
goto loop_again;
}
- return total_reclaimed;
+ return nr_reclaimed;
}
/*
*/
static int kswapd(void *p)
{
+ unsigned long order;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
DEFINE_WAIT(wait);
* us from recursively trying to free more memory as we're
* trying to free the first piece of memory in the first place).
*/
- tsk->flags |= PF_MEMALLOC|PF_KSWAPD;
+ tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+ order = 0;
for ( ; ; ) {
- if (current->flags & PF_FREEZE)
- refrigerator(PF_FREEZE);
+ unsigned long new_order;
+
+ try_to_freeze();
+
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
- schedule();
+ new_order = pgdat->kswapd_max_order;
+ pgdat->kswapd_max_order = 0;
+ if (order < new_order) {
+ /*
+ * Don't sleep if someone wants a larger 'order'
+ * allocation
+ */
+ order = new_order;
+ } else {
+ schedule();
+ order = pgdat->kswapd_max_order;
+ }
finish_wait(&pgdat->kswapd_wait, &wait);
- balance_pgdat(pgdat, 0);
+ balance_pgdat(pgdat, 0, order);
}
return 0;
}
/*
* A zone is low on free memory, so wake its kswapd task to service it.
*/
-void wakeup_kswapd(struct zone *zone)
+void wakeup_kswapd(struct zone *zone, int order)
{
- if (zone->present_pages == 0)
+ pg_data_t *pgdat;
+
+ if (!populated_zone(zone))
return;
- if (zone->free_pages > zone->pages_low)
+
+ pgdat = zone->zone_pgdat;
+ if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
+ return;
+ if (pgdat->kswapd_max_order < order)
+ pgdat->kswapd_max_order = order;
+ if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
return;
- if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
+ if (!waitqueue_active(&pgdat->kswapd_wait))
return;
- wake_up_interruptible(&zone->zone_pgdat->kswapd_wait);
+ wake_up_interruptible(&pgdat->kswapd_wait);
}
#ifdef CONFIG_PM
* Try to free `nr_pages' of memory, system-wide. Returns the number of freed
* pages.
*/
-int shrink_all_memory(int nr_pages)
+unsigned long shrink_all_memory(unsigned long nr_pages)
{
pg_data_t *pgdat;
- int nr_to_free = nr_pages;
- int ret = 0;
+ unsigned long nr_to_free = nr_pages;
+ unsigned long ret = 0;
+ unsigned retry = 2;
struct reclaim_state reclaim_state = {
.reclaimed_slab = 0,
};
current->reclaim_state = &reclaim_state;
- for_each_pgdat(pgdat) {
- int freed;
- freed = balance_pgdat(pgdat, nr_to_free);
+repeat:
+ for_each_online_pgdat(pgdat) {
+ unsigned long freed;
+
+ freed = balance_pgdat(pgdat, nr_to_free, 0);
ret += freed;
nr_to_free -= freed;
- if (nr_to_free <= 0)
+ if ((long)nr_to_free <= 0)
break;
}
+ if (retry-- && ret < nr_pages) {
+ blk_congestion_wait(WRITE, HZ/5);
+ goto repeat;
+ }
current->reclaim_state = NULL;
return ret;
}
not required for correctness. So if the last cpu in a node goes
away, we get changed to run anywhere: as the first one comes back,
restore their cpu bindings. */
-static int __devinit cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+static int cpu_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
{
pg_data_t *pgdat;
cpumask_t mask;
if (action == CPU_ONLINE) {
- for_each_pgdat(pgdat) {
+ for_each_online_pgdat(pgdat) {
mask = node_to_cpumask(pgdat->node_id);
if (any_online_cpu(mask) != NR_CPUS)
/* One of our CPUs online: restore mask */
static int __init kswapd_init(void)
{
pg_data_t *pgdat;
+
swap_setup();
- for_each_pgdat(pgdat)
- pgdat->kswapd
- = find_task_by_real_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL));
+ for_each_online_pgdat(pgdat) {
+ pid_t pid;
+
+ pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
+ BUG_ON(pid < 0);
+ read_lock(&tasklist_lock);
+ pgdat->kswapd = find_task_by_real_pid(pid);
+ read_unlock(&tasklist_lock);
+ }
total_memory = nr_free_pagecache_pages();
hotcpu_notifier(cpu_callback, 0);
return 0;
}
module_init(kswapd_init)
+
+#ifdef CONFIG_NUMA
+/*
+ * Zone reclaim mode
+ *
+ * If non-zero, call zone_reclaim() when the number of free pages falls below
+ * the watermarks.
+ *
+ * In the future we may add flags to the mode. However, the page allocator
+ * should only have to check that zone_reclaim_mode != 0 before calling
+ * zone_reclaim().
+ */
+int zone_reclaim_mode __read_mostly;
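+
+/*
+ * Expected caller pattern (a sketch; the page allocator side lives in
+ * mm/page_alloc.c and is not part of this file):
+ *
+ * if (zone_reclaim_mode && zone_reclaim(zone, gfp_mask, order))
+ * retry the zone's watermark check before falling back
+ * to another (possibly off-node) zone;
+ */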
+
+#define RECLAIM_OFF 0
+#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */
+#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
+#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
+#define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */
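+
+/*
+ * The flags above combine as a bitmask, e.g. RECLAIM_ZONE | RECLAIM_WRITE
+ * (== 3) also allows dirty pages to be written back during zone reclaim,
+ * and adding RECLAIM_SWAP (== 7 in total) additionally permits swapping
+ * out mapped pages.
+ */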
+
+/*
+ * Minimum time between zone reclaim scans
+ */
+int zone_reclaim_interval __read_mostly = 30*HZ;
+
+/*
+ * Priority for ZONE_RECLAIM. This determines the fraction of pages
+ * of a node considered for each zone_reclaim. A priority of 4 scans
+ * 1/16th of a zone.
+ */
+#define ZONE_RECLAIM_PRIORITY 4
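+
+/*
+ * For example: shrink_zone() adds roughly zone->nr_inactive >> priority
+ * pages to its scan target on each pass, so starting __zone_reclaim()
+ * at priority 4 covers about 1/16th of the zone first, and each retry
+ * at a lower priority doubles that fraction.
+ */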
+
+/*
+ * Try to free up some pages from this zone through reclaim.
+ */
+static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+ /* Minimum pages needed in order to stay on node */
+ const unsigned long nr_pages = 1 << order;
+ struct task_struct *p = current;
+ struct reclaim_state reclaim_state;
+ int priority;
+ unsigned long nr_reclaimed = 0;
+ struct scan_control sc = {
+ .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
+ .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+ .nr_mapped = read_page_state(nr_mapped),
+ .swap_cluster_max = max_t(unsigned long, nr_pages,
+ SWAP_CLUSTER_MAX),
+ .gfp_mask = gfp_mask,
+ };
+
+ disable_swap_token();
+ cond_resched();
+ /*
+ * We need to be able to allocate from the reserves for RECLAIM_SWAP
+ * and we also need to be able to write out pages for RECLAIM_WRITE
+ * and RECLAIM_SWAP.
+ */
+ p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
+ reclaim_state.reclaimed_slab = 0;
+ p->reclaim_state = &reclaim_state;
+
+ /*
+ * Free memory by calling shrink zone with increasing priorities
+ * until we have enough memory freed.
+ */
+ priority = ZONE_RECLAIM_PRIORITY;
+ do {
+ nr_reclaimed += shrink_zone(priority, zone, &sc);
+ priority--;
+ } while (priority >= 0 && nr_reclaimed < nr_pages);
+
+ if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
+ /*
+ * shrink_slab() does not currently allow us to determine how
+ * many pages were freed in this zone. So we just shake the slab
+ * a bit and then go off node for this particular allocation
+ * despite possibly having freed enough memory to allocate in
+ * this zone. If we freed local memory then the next
+ * allocations will be local again.
+ *
+ * shrink_slab will free memory on all zones and may take
+ * a long time.
+ */
+ shrink_slab(sc.nr_scanned, gfp_mask, order);
+ }
+
+ p->reclaim_state = NULL;
+ current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
+
+ if (nr_reclaimed == 0) {
+ /*
+ * We were unable to reclaim enough pages to stay on node. We
+ * now allow off node accesses for a certain time period before
+ * trying again to reclaim pages from the local zone.
+ */
+ zone->last_unsuccessful_zone_reclaim = jiffies;
+ }
+
+ return nr_reclaimed >= nr_pages;
+}
+
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+ cpumask_t mask;
+ int node_id;
+
+ /*
+ * Do not reclaim if there was a recent unsuccessful attempt at zone
+ * reclaim. In that case we let allocations go off node for the
+ * zone_reclaim_interval. Otherwise we would scan for each off-node
+ * page allocation.
+ */
+ if (time_before(jiffies,
+ zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
+ return 0;
+
+ /*
+ * Avoid concurrent zone reclaims, do not reclaim in a zone that has
+ * no reclaimable pages, and do not scan at all if the allocation
+ * cannot be delayed.
+ */
+ if (!(gfp_mask & __GFP_WAIT) ||
+ zone->all_unreclaimable ||
+ atomic_read(&zone->reclaim_in_progress) > 0 ||
+ (current->flags & PF_MEMALLOC))
+ return 0;
+
+ /*
+ * Only run zone reclaim on the local zone or on zones that do not
+ * have associated processors. This will favor the local processor
+ * over remote processors and spread off-node memory allocations
+ * as widely as possible.
+ */
+ node_id = zone->zone_pgdat->node_id;
+ mask = node_to_cpumask(node_id);
+ if (!cpus_empty(mask) && node_id != numa_node_id())
+ return 0;
+ return __zone_reclaim(zone, gfp_mask, order);
+}
+#endif