#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/writeback.h>
-#include <linux/suspend.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h> /* for try_to_release_page(),
buffer_heads_over_limit */
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
+#include <linux/cpuset.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <asm/div64.h>
#include <linux/swapops.h>
-#include <linux/ckrm_mem.h>
-#include <linux/vs_cvirt.h>
-
/* possible outcome of pageout() */
typedef enum {
unsigned long nr_mapped; /* From page_state */
- /* How many pages shrink_cache() should reclaim */
- int nr_to_reclaim;
-
/* Ask shrink_caches, or shrink_zone to scan at this priority */
unsigned int priority;
/* This context's GFP mask */
- unsigned int gfp_mask;
+ gfp_t gfp_mask;
int may_writepage;
+
+ /* Can pages be swapped as part of reclaim? */
+ int may_swap;
+
+ /* This context's SWAP_CLUSTER_MAX. If freeing memory for
+ * suspend, we effectively ignore SWAP_CLUSTER_MAX.
+ * In this context, it doesn't matter that we scan the
+ * whole list at once. */
+ int swap_cluster_max;
};
/*
shrinker->seeks = seeks;
shrinker->nr = 0;
down_write(&shrinker_rwsem);
- list_add(&shrinker->list, &shrinker_list);
+ list_add_tail(&shrinker->list, &shrinker_list);
up_write(&shrinker_rwsem);
}
return shrinker;
* `lru_pages' represents the number of on-LRU pages in all the zones which
* are eligible for the caller's allocation attempt. It is used for balancing
* slab reclaim versus page reclaim.
+ *
+ * Returns the number of slab objects which we shrunk.
*/
-static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
- unsigned long lru_pages)
+int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages)
{
struct shrinker *shrinker;
+ int ret = 0;
if (scanned == 0)
scanned = SWAP_CLUSTER_MAX;
if (!down_read_trylock(&shrinker_rwsem))
- return 0;
+ return 1; /* Assume we'll be able to shrink next time */
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
unsigned long total_scan;
+ unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);
delta = (4 * scanned) / shrinker->seeks;
- delta *= (*shrinker->shrinker)(0, gfp_mask);
+ delta *= max_pass;
do_div(delta, lru_pages + 1);
shrinker->nr += delta;
- if (shrinker->nr < 0)
- shrinker->nr = LONG_MAX; /* It wrapped! */
+ if (shrinker->nr < 0) {
+ printk(KERN_ERR "%s: nr=%ld\n",
+ __FUNCTION__, shrinker->nr);
+ shrinker->nr = max_pass;
+ }
+
+ /*
+ * Avoid risking looping forever due to too large nr value:
+ * never try to free more than twice the estimate number of
+ * freeable entries.
+ */
+ if (shrinker->nr > max_pass * 2)
+ shrinker->nr = max_pass * 2;
total_scan = shrinker->nr;
shrinker->nr = 0;
while (total_scan >= SHRINK_BATCH) {
long this_scan = SHRINK_BATCH;
int shrink_ret;
+ int nr_before;
+ nr_before = (*shrinker->shrinker)(0, gfp_mask);
shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
if (shrink_ret == -1)
break;
+ if (shrink_ret < nr_before)
+ ret += nr_before - shrink_ret;
mod_page_state(slabs_scanned, this_scan);
total_scan -= this_scan;
shrinker->nr += total_scan;
}
up_read(&shrinker_rwsem);
- return 0;
+ return ret;
}
/* Called without lock on whether page is mapped, so answer is unstable */
static int may_write_to_queue(struct backing_dev_info *bdi)
{
- if (current_is_kswapd())
- return 1;
- if (current_is_pdflush()) /* This is unlikely, but why not... */
+ if (current->flags & PF_SWAPWRITE)
return 1;
if (!bdi_write_congested(bdi))
return 1;
*/
if (!is_page_cache_freeable(page))
return PAGE_KEEP;
- if (!mapping)
+ if (!mapping) {
+ /*
+ * Some data journaling orphaned pages can have
+ * page->mapping == NULL while being dirty with clean buffers.
+ */
+ if (PagePrivate(page)) {
+ if (try_to_free_buffers(page)) {
+ ClearPageDirty(page);
+ printk("%s: orphaned page\n", __FUNCTION__);
+ return PAGE_CLEAN;
+ }
+ }
return PAGE_KEEP;
+ }
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
if (!may_write_to_queue(mapping->backing_dev_info))
res = mapping->a_ops->writepage(page, &wbc);
if (res < 0)
handle_write_error(mapping, page, res);
- if (res == WRITEPAGE_ACTIVATE) {
+ if (res == AOP_WRITEPAGE_ACTIVATE) {
ClearPageReclaim(page);
return PAGE_ACTIVATE;
}
return PAGE_CLEAN;
}
+static int remove_mapping(struct address_space *mapping, struct page *page)
+{
+ if (!mapping)
+ return 0; /* truncate got there first */
+
+ write_lock_irq(&mapping->tree_lock);
+
+ /*
+ * The non-racy check for busy page. It is critical to check
+ * PageDirty _after_ making sure that the page is freeable and
+ * not in use by anybody. (pagecache + us == 2)
+ */
+ if (unlikely(page_count(page) != 2))
+ goto cannot_free;
+ smp_rmb();
+ if (unlikely(PageDirty(page)))
+ goto cannot_free;
+
+ if (PageSwapCache(page)) {
+ swp_entry_t swap = { .val = page_private(page) };
+ __delete_from_swap_cache(page);
+ write_unlock_irq(&mapping->tree_lock);
+ swap_free(swap);
+ __put_page(page); /* The pagecache ref */
+ return 1;
+ }
+
+ __remove_from_page_cache(page);
+ write_unlock_irq(&mapping->tree_lock);
+ __put_page(page);
+ return 1;
+
+cannot_free:
+ write_unlock_irq(&mapping->tree_lock);
+ return 0;
+}
+
/*
* shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
*/
int may_enter_fs;
int referenced;
+ cond_resched();
+
page = lru_to_page(page_list);
list_del(&page->lru);
BUG_ON(PageActive(page));
sc->nr_scanned++;
+
+ if (!sc->may_swap && page_mapped(page))
+ goto keep_locked;
+
/* Double the slab pressure for mapped and swapcache pages */
if (page_mapped(page) || PageSwapCache(page))
sc->nr_scanned++;
if (PageWriteback(page))
goto keep_locked;
- referenced = page_referenced(page, 1, sc->priority <= 0);
+ referenced = page_referenced(page, 1);
/* In active use or really unfreeable? Activate it. */
if (referenced && page_mapping_inuse(page))
goto activate_locked;
* Try to allocate it some swap space here.
*/
if (PageAnon(page) && !PageSwapCache(page)) {
- if (!add_to_swap(page))
+ if (!sc->may_swap)
+ goto keep_locked;
+ if (!add_to_swap(page, GFP_ATOMIC))
goto activate_locked;
}
#endif /* CONFIG_SWAP */
* processes. Try to unmap it here.
*/
if (page_mapped(page) && mapping) {
- switch (try_to_unmap(page)) {
+ /*
+ * No unmapping if we do not swap
+ */
+ if (!sc->may_swap)
+ goto keep_locked;
+
+ switch (try_to_unmap(page, 0)) {
case SWAP_FAIL:
goto activate_locked;
case SWAP_AGAIN:
goto keep_locked;
if (!may_enter_fs)
goto keep_locked;
- if (laptop_mode && !sc->may_writepage)
+ if (!sc->may_writepage)
goto keep_locked;
/* Page is dirty, try to write it out here */
goto free_it;
}
- if (!mapping)
- goto keep_locked; /* truncate got there first */
-
- spin_lock_irq(&mapping->tree_lock);
-
- /*
- * The non-racy check for busy page. It is critical to check
- * PageDirty _after_ making sure that the page is freeable and
- * not in use by anybody. (pagecache + us == 2)
- */
- if (page_count(page) != 2 || PageDirty(page)) {
- spin_unlock_irq(&mapping->tree_lock);
+ if (!remove_mapping(mapping, page))
goto keep_locked;
- }
-
-#ifdef CONFIG_SWAP
- if (PageSwapCache(page)) {
- swp_entry_t swap = { .val = page->private };
- __delete_from_swap_cache(page);
- spin_unlock_irq(&mapping->tree_lock);
- swap_free(swap);
- __put_page(page); /* The pagecache ref */
- goto free_it;
- }
-#endif /* CONFIG_SWAP */
-
- __remove_from_page_cache(page);
- spin_unlock_irq(&mapping->tree_lock);
- __put_page(page);
free_it:
unlock_page(page);
return reclaimed;
}
+#ifdef CONFIG_MIGRATION
+static inline void move_to_lru(struct page *page)
+{
+ list_del(&page->lru);
+ if (PageActive(page)) {
+ /*
+ * lru_cache_add_active checks that
+ * the PG_active bit is off.
+ */
+ ClearPageActive(page);
+ lru_cache_add_active(page);
+ } else {
+ lru_cache_add(page);
+ }
+ put_page(page);
+}
+
/*
- * zone->lru_lock is heavily contented. We relieve it by quickly privatising
- * a batch of pages and working on them outside the lock. Any pages which were
- * not freed will be added back to the LRU.
+ * Add isolated pages on the list back to the LRU.
*
- * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
+ * returns the number of pages put back.
+ */
+int putback_lru_pages(struct list_head *l)
+{
+ struct page *page;
+ struct page *page2;
+ int count = 0;
+
+ list_for_each_entry_safe(page, page2, l, lru) {
+ move_to_lru(page);
+ count++;
+ }
+ return count;
+}
+
+/*
+ * Non migratable page
+ */
+int fail_migrate_page(struct page *newpage, struct page *page)
+{
+ return -EIO;
+}
+EXPORT_SYMBOL(fail_migrate_page);
+
+/*
+ * swapout a single page
+ * page is locked upon entry, unlocked on exit
+ */
+static int swap_page(struct page *page)
+{
+ struct address_space *mapping = page_mapping(page);
+
+ if (page_mapped(page) && mapping)
+ if (try_to_unmap(page, 1) != SWAP_SUCCESS)
+ goto unlock_retry;
+
+ if (PageDirty(page)) {
+ /* Page is dirty, try to write it out here */
+ switch(pageout(page, mapping)) {
+ case PAGE_KEEP:
+ case PAGE_ACTIVATE:
+ goto unlock_retry;
+
+ case PAGE_SUCCESS:
+ goto retry;
+
+ case PAGE_CLEAN:
+ ; /* try to free the page below */
+ }
+ }
+
+ if (PagePrivate(page)) {
+ if (!try_to_release_page(page, GFP_KERNEL) ||
+ (!mapping && page_count(page) == 1))
+ goto unlock_retry;
+ }
+
+ if (remove_mapping(mapping, page)) {
+ /* Success */
+ unlock_page(page);
+ return 0;
+ }
+
+unlock_retry:
+ unlock_page(page);
+
+retry:
+ return -EAGAIN;
+}
+EXPORT_SYMBOL(swap_page);
+
+/*
+ * Page migration was first developed in the context of the memory hotplug
+ * project. The main authors of the migration code are:
*
- * For pagecache intensive workloads, the first loop here is the hottest spot
- * in the kernel (apart from the copy_*_user functions).
+ * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
+ * Hirokazu Takahashi <taka@valinux.co.jp>
+ * Dave Hansen <haveblue@us.ibm.com>
+ * Christoph Lameter <clameter@sgi.com>
*/
-#ifdef CONFIG_CKRM_RES_MEM
-static void shrink_cache(struct ckrm_zone *ckrm_zone, struct scan_control *sc)
-#else
-static void shrink_cache(struct zone *zone, struct scan_control *sc)
+
+/*
+ * Remove references for a page and establish the new page with the correct
+ * basic settings to be able to stop accesses to the page.
+ */
+int migrate_page_remove_references(struct page *newpage,
+ struct page *page, int nr_refs)
+{
+ struct address_space *mapping = page_mapping(page);
+ struct page **radix_pointer;
+
+ /*
+ * Avoid doing any of the following work if the page count
+ * indicates that the page is in use or truncate has removed
+ * the page.
+ */
+ if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
+ return -EAGAIN;
+
+ /*
+ * Establish swap ptes for anonymous pages or destroy pte
+ * maps for files.
+ *
+ * In order to reestablish file backed mappings the fault handlers
+ * will take the radix tree_lock which may then be used to stop
+ * processses from accessing this page until the new page is ready.
+ *
+ * A process accessing via a swap pte (an anonymous page) will take a
+ * page_lock on the old page which will block the process until the
+ * migration attempt is complete. At that time the PageSwapCache bit
+ * will be examined. If the page was migrated then the PageSwapCache
+ * bit will be clear and the operation to retrieve the page will be
+ * retried which will find the new page in the radix tree. Then a new
+ * direct mapping may be generated based on the radix tree contents.
+ *
+ * If the page was not migrated then the PageSwapCache bit
+ * is still set and the operation may continue.
+ */
+ if (try_to_unmap(page, 1) == SWAP_FAIL)
+ /* A vma has VM_LOCKED set -> Permanent failure */
+ return -EPERM;
+
+ /*
+ * Give up if we were unable to remove all mappings.
+ */
+ if (page_mapcount(page))
+ return -EAGAIN;
+
+ write_lock_irq(&mapping->tree_lock);
+
+ radix_pointer = (struct page **)radix_tree_lookup_slot(
+ &mapping->page_tree,
+ page_index(page));
+
+ if (!page_mapping(page) || page_count(page) != nr_refs ||
+ *radix_pointer != page) {
+ write_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
+
+ /*
+ * Now we know that no one else is looking at the page.
+ *
+ * Certain minimal information about a page must be available
+ * in order for other subsystems to properly handle the page if they
+ * find it through the radix tree update before we are finished
+ * copying the page.
+ */
+ get_page(newpage);
+ newpage->index = page->index;
+ newpage->mapping = page->mapping;
+ if (PageSwapCache(page)) {
+ SetPageSwapCache(newpage);
+ set_page_private(newpage, page_private(page));
+ }
+
+ *radix_pointer = newpage;
+ __put_page(page);
+ write_unlock_irq(&mapping->tree_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(migrate_page_remove_references);
+
+/*
+ * Copy the page to its new location
+ */
+void migrate_page_copy(struct page *newpage, struct page *page)
+{
+ copy_highpage(newpage, page);
+
+ if (PageError(page))
+ SetPageError(newpage);
+ if (PageReferenced(page))
+ SetPageReferenced(newpage);
+ if (PageUptodate(page))
+ SetPageUptodate(newpage);
+ if (PageActive(page))
+ SetPageActive(newpage);
+ if (PageChecked(page))
+ SetPageChecked(newpage);
+ if (PageMappedToDisk(page))
+ SetPageMappedToDisk(newpage);
+
+ if (PageDirty(page)) {
+ clear_page_dirty_for_io(page);
+ set_page_dirty(newpage);
+ }
+
+ ClearPageSwapCache(page);
+ ClearPageActive(page);
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ page->mapping = NULL;
+
+ /*
+ * If any waiters have accumulated on the new page then
+ * wake them up.
+ */
+ if (PageWriteback(newpage))
+ end_page_writeback(newpage);
+}
+EXPORT_SYMBOL(migrate_page_copy);
+
+/*
+ * Common logic to directly migrate a single page suitable for
+ * pages that do not use PagePrivate.
+ *
+ * Pages are locked upon entry and exit.
+ */
+int migrate_page(struct page *newpage, struct page *page)
+{
+ int rc;
+
+ BUG_ON(PageWriteback(page)); /* Writeback must be complete */
+
+ rc = migrate_page_remove_references(newpage, page, 2);
+
+ if (rc)
+ return rc;
+
+ migrate_page_copy(newpage, page);
+
+ /*
+ * Remove auxiliary swap entries and replace
+ * them with real ptes.
+ *
+ * Note that a real pte entry will allow processes that are not
+ * waiting on the page lock to use the new page via the page tables
+ * before the new page is unlocked.
+ */
+ remove_from_swap(newpage);
+ return 0;
+}
+EXPORT_SYMBOL(migrate_page);
+
+/*
+ * migrate_pages
+ *
+ * Two lists are passed to this function. The first list
+ * contains the pages isolated from the LRU to be migrated.
+ * The second list contains new pages that the pages isolated
+ * can be moved to. If the second list is NULL then all
+ * pages are swapped out.
+ *
+ * The function returns after 10 attempts or if no pages
+ * are movable anymore because to has become empty
+ * or no retryable pages exist anymore.
+ *
+ * Return: Number of pages not migrated when "to" ran empty.
+ */
+int migrate_pages(struct list_head *from, struct list_head *to,
+ struct list_head *moved, struct list_head *failed)
+{
+ int retry;
+ int nr_failed = 0;
+ int pass = 0;
+ struct page *page;
+ struct page *page2;
+ int swapwrite = current->flags & PF_SWAPWRITE;
+ int rc;
+
+ if (!swapwrite)
+ current->flags |= PF_SWAPWRITE;
+
+redo:
+ retry = 0;
+
+ list_for_each_entry_safe(page, page2, from, lru) {
+ struct page *newpage = NULL;
+ struct address_space *mapping;
+
+ cond_resched();
+
+ rc = 0;
+ if (page_count(page) == 1)
+ /* page was freed from under us. So we are done. */
+ goto next;
+
+ if (to && list_empty(to))
+ break;
+
+ /*
+ * Skip locked pages during the first two passes to give the
+ * functions holding the lock time to release the page. Later we
+ * use lock_page() to have a higher chance of acquiring the
+ * lock.
+ */
+ rc = -EAGAIN;
+ if (pass > 2)
+ lock_page(page);
+ else
+ if (TestSetPageLocked(page))
+ goto next;
+
+ /*
+ * Only wait on writeback if we have already done a pass where
+ * we we may have triggered writeouts for lots of pages.
+ */
+ if (pass > 0) {
+ wait_on_page_writeback(page);
+ } else {
+ if (PageWriteback(page))
+ goto unlock_page;
+ }
+
+ /*
+ * Anonymous pages must have swap cache references otherwise
+ * the information contained in the page maps cannot be
+ * preserved.
+ */
+ if (PageAnon(page) && !PageSwapCache(page)) {
+ if (!add_to_swap(page, GFP_KERNEL)) {
+ rc = -ENOMEM;
+ goto unlock_page;
+ }
+ }
+
+ if (!to) {
+ rc = swap_page(page);
+ goto next;
+ }
+
+ newpage = lru_to_page(to);
+ lock_page(newpage);
+
+ /*
+ * Pages are properly locked and writeback is complete.
+ * Try to migrate the page.
+ */
+ mapping = page_mapping(page);
+ if (!mapping)
+ goto unlock_both;
+
+ if (mapping->a_ops->migratepage) {
+ /*
+ * Most pages have a mapping and most filesystems
+ * should provide a migration function. Anonymous
+ * pages are part of swap space which also has its
+ * own migration function. This is the most common
+ * path for page migration.
+ */
+ rc = mapping->a_ops->migratepage(newpage, page);
+ goto unlock_both;
+ }
+
+ /* Make sure the dirty bit is up to date */
+ if (try_to_unmap(page, 1) == SWAP_FAIL) {
+ rc = -EPERM;
+ goto unlock_both;
+ }
+
+ if (page_mapcount(page)) {
+ rc = -EAGAIN;
+ goto unlock_both;
+ }
+
+ /*
+ * Default handling if a filesystem does not provide
+ * a migration function. We can only migrate clean
+ * pages so try to write out any dirty pages first.
+ */
+ if (PageDirty(page)) {
+ switch (pageout(page, mapping)) {
+ case PAGE_KEEP:
+ case PAGE_ACTIVATE:
+ goto unlock_both;
+
+ case PAGE_SUCCESS:
+ unlock_page(newpage);
+ goto next;
+
+ case PAGE_CLEAN:
+ ; /* try to migrate the page below */
+ }
+ }
+
+ /*
+ * Buffers are managed in a filesystem specific way.
+ * We must have no buffers or drop them.
+ */
+ if (!page_has_buffers(page) ||
+ try_to_release_page(page, GFP_KERNEL)) {
+ rc = migrate_page(newpage, page);
+ goto unlock_both;
+ }
+
+ /*
+ * On early passes with mapped pages simply
+ * retry. There may be a lock held for some
+ * buffers that may go away. Later
+ * swap them out.
+ */
+ if (pass > 4) {
+ /*
+ * Persistently unable to drop buffers..... As a
+ * measure of last resort we fall back to
+ * swap_page().
+ */
+ unlock_page(newpage);
+ newpage = NULL;
+ rc = swap_page(page);
+ goto next;
+ }
+
+unlock_both:
+ unlock_page(newpage);
+
+unlock_page:
+ unlock_page(page);
+
+next:
+ if (rc == -EAGAIN) {
+ retry++;
+ } else if (rc) {
+ /* Permanent failure */
+ list_move(&page->lru, failed);
+ nr_failed++;
+ } else {
+ if (newpage) {
+ /* Successful migration. Return page to LRU */
+ move_to_lru(newpage);
+ }
+ list_move(&page->lru, moved);
+ }
+ }
+ if (retry && pass++ < 10)
+ goto redo;
+
+ if (!swapwrite)
+ current->flags &= ~PF_SWAPWRITE;
+
+ return nr_failed + retry;
+}
+
+/*
+ * Isolate one page from the LRU lists and put it on the
+ * indicated list with elevated refcount.
+ *
+ * Result:
+ * 0 = page not on LRU list
+ * 1 = page removed from LRU list and added to the specified list.
+ */
+int isolate_lru_page(struct page *page)
+{
+ int ret = 0;
+
+ if (PageLRU(page)) {
+ struct zone *zone = page_zone(page);
+ spin_lock_irq(&zone->lru_lock);
+ if (TestClearPageLRU(page)) {
+ ret = 1;
+ get_page(page);
+ if (PageActive(page))
+ del_page_from_active_list(zone, page);
+ else
+ del_page_from_inactive_list(zone, page);
+ }
+ spin_unlock_irq(&zone->lru_lock);
+ }
+
+ return ret;
+}
#endif
+
+/*
+ * zone->lru_lock is heavily contended. Some of the functions that
+ * shrink the lists perform better by taking out a batch of pages
+ * and working on them outside the LRU lock.
+ *
+ * For pagecache intensive workloads, this function is the hottest
+ * spot in the kernel (apart from copy_*_user functions).
+ *
+ * Appropriate locks must be held before calling this function.
+ *
+ * @nr_to_scan: The number of pages to look through on the list.
+ * @src: The LRU list to pull pages off.
+ * @dst: The temp list to put pages on to.
+ * @scanned: The number of pages that were scanned.
+ *
+ * returns how many pages were moved onto *@dst.
+ */
+static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
+ struct list_head *dst, int *scanned)
+{
+ int nr_taken = 0;
+ struct page *page;
+ int scan = 0;
+
+ while (scan++ < nr_to_scan && !list_empty(src)) {
+ page = lru_to_page(src);
+ prefetchw_prev_lru_page(page, src, flags);
+
+ if (!TestClearPageLRU(page))
+ BUG();
+ list_del(&page->lru);
+ if (get_page_testone(page)) {
+ /*
+ * It is being freed elsewhere
+ */
+ __put_page(page);
+ SetPageLRU(page);
+ list_add(&page->lru, src);
+ continue;
+ } else {
+ list_add(&page->lru, dst);
+ nr_taken++;
+ }
+ }
+
+ *scanned = scan;
+ return nr_taken;
+}
+
+/*
+ * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
+ */
+static void shrink_cache(struct zone *zone, struct scan_control *sc)
{
LIST_HEAD(page_list);
struct pagevec pvec;
int max_scan = sc->nr_to_scan;
-#ifdef CONFIG_CKRM_RES_MEM
- struct zone *zone = ckrm_zone->zone;
- struct list_head *inactive_list = &ckrm_zone->inactive_list;
- struct list_head *active_list = &ckrm_zone->active_list;
-#else
- struct list_head *inactive_list = &zone->inactive_list;
- struct list_head *active_list = &zone->active_list;
-#endif
pagevec_init(&pvec, 1);
spin_lock_irq(&zone->lru_lock);
while (max_scan > 0) {
struct page *page;
- int nr_taken = 0;
- int nr_scan = 0;
+ int nr_taken;
+ int nr_scan;
int nr_freed;
- while (nr_scan++ < SWAP_CLUSTER_MAX &&
- !list_empty(inactive_list)) {
- page = lru_to_page(inactive_list);
-
- prefetchw_prev_lru_page(page,
- inactive_list, flags);
-
- if (!TestClearPageLRU(page))
- BUG();
- list_del(&page->lru);
- if (get_page_testone(page)) {
- /*
- * It is being freed elsewhere
- */
- __put_page(page);
- SetPageLRU(page);
- list_add(&page->lru, inactive_list);
- continue;
- }
- list_add(&page->lru, &page_list);
- nr_taken++;
- }
+ nr_taken = isolate_lru_pages(sc->swap_cluster_max,
+ &zone->inactive_list,
+ &page_list, &nr_scan);
zone->nr_inactive -= nr_taken;
- ckrm_zone_dec_inactive(ckrm_zone, nr_taken);
+ zone->pages_scanned += nr_scan;
spin_unlock_irq(&zone->lru_lock);
if (nr_taken == 0)
goto done;
max_scan -= nr_scan;
- if (current_is_kswapd())
- mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
- else
- mod_page_state_zone(zone, pgscan_direct, nr_scan);
nr_freed = shrink_list(&page_list, sc);
- if (current_is_kswapd())
- mod_page_state(kswapd_steal, nr_freed);
- mod_page_state_zone(zone, pgsteal, nr_freed);
- sc->nr_to_reclaim -= nr_freed;
- spin_lock_irq(&zone->lru_lock);
+ local_irq_disable();
+ if (current_is_kswapd()) {
+ __mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
+ __mod_page_state(kswapd_steal, nr_freed);
+ } else
+ __mod_page_state_zone(zone, pgscan_direct, nr_scan);
+ __mod_page_state_zone(zone, pgsteal, nr_freed);
+
+ spin_lock(&zone->lru_lock);
/*
* Put back any unfreeable pages.
*/
if (TestSetPageLRU(page))
BUG();
list_del(&page->lru);
- if (PageActive(page)) {
- ckrm_zone_inc_active(ckrm_zone, 1);
- zone->nr_active++;
- list_add(&page->lru, active_list);
- } else {
- ckrm_zone_inc_inactive(ckrm_zone, 1);
- zone->nr_inactive++;
- list_add(&page->lru, inactive_list);
- }
+ if (PageActive(page))
+ add_page_to_active_list(zone, page);
+ else
+ add_page_to_inactive_list(zone, page);
if (!pagevec_add(&pvec, page)) {
spin_unlock_irq(&zone->lru_lock);
__pagevec_release(&pvec);
* But we had to alter page->flags anyway.
*/
static void
-#ifdef CONFIG_CKRM_RES_MEM
-refill_inactive_zone(struct ckrm_zone *ckrm_zone, struct scan_control *sc)
-#else
refill_inactive_zone(struct zone *zone, struct scan_control *sc)
-#endif
{
int pgmoved;
int pgdeactivate = 0;
- int pgscanned = 0;
+ int pgscanned;
int nr_pages = sc->nr_to_scan;
LIST_HEAD(l_hold); /* The pages which were snipped off */
LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
struct page *page;
struct pagevec pvec;
int reclaim_mapped = 0;
- long mapped_ratio;
- long distress;
- long swap_tendency;
-#ifdef CONFIG_CKRM_RES_MEM
- struct zone *zone = ckrm_zone->zone;
- struct list_head *active_list = &ckrm_zone->active_list;
- struct list_head *inactive_list = &ckrm_zone->inactive_list;
-#else
- struct list_head *active_list = &zone->active_list;
- struct list_head *inactive_list = &zone->inactive_list;
-#endif
+
+ if (unlikely(sc->may_swap)) {
+ long mapped_ratio;
+ long distress;
+ long swap_tendency;
+
+ /*
+ * `distress' is a measure of how much trouble we're having
+ * reclaiming pages. 0 -> no problems. 100 -> great trouble.
+ */
+ distress = 100 >> zone->prev_priority;
+
+ /*
+ * The point of this algorithm is to decide when to start
+ * reclaiming mapped memory instead of just pagecache. Work out
+ * how much memory
+ * is mapped.
+ */
+ mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+
+ /*
+ * Now decide how much we really want to unmap some pages. The
+ * mapped ratio is downgraded - just because there's a lot of
+ * mapped memory doesn't necessarily mean that page reclaim
+ * isn't succeeding.
+ *
+ * The distress ratio is important - we don't want to start
+ * going oom.
+ *
+ * A 100% value of vm_swappiness overrides this algorithm
+ * altogether.
+ */
+ swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+
+ /*
+ * Now use this metric to decide whether to start moving mapped
+ * memory onto the inactive list.
+ */
+ if (swap_tendency >= 100)
+ reclaim_mapped = 1;
+ }
lru_add_drain();
- pgmoved = 0;
spin_lock_irq(&zone->lru_lock);
- while (pgscanned < nr_pages && !list_empty(active_list)) {
- page = lru_to_page(active_list);
- prefetchw_prev_lru_page(page, active_list, flags);
- if (!TestClearPageLRU(page))
- BUG();
- list_del(&page->lru);
- if (get_page_testone(page)) {
- /*
- * It was already free! release_pages() or put_page()
- * are about to remove it from the LRU and free it. So
- * put the refcount back and put the page back on the
- * LRU
- */
- __put_page(page);
- SetPageLRU(page);
- list_add(&page->lru, active_list);
- } else {
- list_add(&page->lru, &l_hold);
- pgmoved++;
- }
- pgscanned++;
- }
+ pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
+ &l_hold, &pgscanned);
zone->pages_scanned += pgscanned;
zone->nr_active -= pgmoved;
- ckrm_zone_dec_active(ckrm_zone, pgmoved);
spin_unlock_irq(&zone->lru_lock);
- /*
- * `distress' is a measure of how much trouble we're having reclaiming
- * pages. 0 -> no problems. 100 -> great trouble.
- */
- distress = 100 >> zone->prev_priority;
-
- /*
- * The point of this algorithm is to decide when to start reclaiming
- * mapped memory instead of just pagecache. Work out how much memory
- * is mapped.
- */
- mapped_ratio = (sc->nr_mapped * 100) / total_memory;
-
- /*
- * Now decide how much we really want to unmap some pages. The mapped
- * ratio is downgraded - just because there's a lot of mapped memory
- * doesn't necessarily mean that page reclaim isn't succeeding.
- *
- * The distress ratio is important - we don't want to start going oom.
- *
- * A 100% value of vm_swappiness overrides this algorithm altogether.
- */
- swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
-
- /*
- * Now use this metric to decide whether to start moving mapped memory
- * onto the inactive list.
- */
- if (swap_tendency >= 100)
- reclaim_mapped = 1;
-
while (!list_empty(&l_hold)) {
+ cond_resched();
page = lru_to_page(&l_hold);
list_del(&page->lru);
if (page_mapped(page)) {
if (!reclaim_mapped ||
(total_swap_pages == 0 && PageAnon(page)) ||
- page_referenced(page, 0, sc->priority <= 0)) {
+ page_referenced(page, 0)) {
list_add(&page->lru, &l_active);
continue;
}
BUG();
if (!TestClearPageActive(page))
BUG();
- list_move(&page->lru, inactive_list);
+ list_move(&page->lru, &zone->inactive_list);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
- ckrm_zone_inc_inactive(ckrm_zone, pgmoved);
zone->nr_inactive += pgmoved;
spin_unlock_irq(&zone->lru_lock);
pgdeactivate += pgmoved;
spin_lock_irq(&zone->lru_lock);
}
}
- ckrm_zone_inc_inactive(ckrm_zone, pgmoved);
zone->nr_inactive += pgmoved;
pgdeactivate += pgmoved;
if (buffer_heads_over_limit) {
if (TestSetPageLRU(page))
BUG();
BUG_ON(!PageActive(page));
- list_move(&page->lru, active_list);
+ list_move(&page->lru, &zone->active_list);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
- ckrm_zone_inc_active(ckrm_zone, pgmoved);
zone->nr_active += pgmoved;
pgmoved = 0;
spin_unlock_irq(&zone->lru_lock);
spin_lock_irq(&zone->lru_lock);
}
}
- ckrm_zone_inc_active(ckrm_zone, pgmoved);
zone->nr_active += pgmoved;
- spin_unlock_irq(&zone->lru_lock);
- pagevec_release(&pvec);
-
- mod_page_state_zone(zone, pgrefill, pgscanned);
- mod_page_state(pgdeactivate, pgdeactivate);
-}
-
-#ifdef CONFIG_CKRM_RES_MEM
-static int
-shrink_weight(struct ckrm_zone *czone)
-{
- u64 temp;
- struct zone *zone = czone->zone;
- struct ckrm_mem_res *cls = czone->memcls;
- int zone_usage, zone_guar, zone_total, guar, ret, cnt;
-
- zone_usage = czone->nr_active + czone->nr_inactive;
- czone->active_over = czone->inactive_over = 0;
-
- if (zone_usage < SWAP_CLUSTER_MAX * 4)
- return 0;
-
- if (cls->pg_guar == CKRM_SHARE_DONTCARE) {
- // no guarantee for this class. use implicit guarantee
- guar = cls->impl_guar / cls->nr_dontcare;
- } else {
- guar = cls->pg_unused / cls->nr_dontcare;
- }
- zone_total = zone->nr_active + zone->nr_inactive + zone->free_pages;
- temp = (u64) guar * zone_total;
- do_div(temp, ckrm_tot_lru_pages);
- zone_guar = (int) temp;
-
- ret = ((zone_usage - zone_guar) > SWAP_CLUSTER_MAX) ?
- (zone_usage - zone_guar) : 0;
- if (ret) {
- cnt = czone->nr_active - (2 * zone_guar / 3);
- if (cnt > 0)
- czone->active_over = cnt;
- cnt = czone->active_over + czone->nr_inactive
- - zone_guar / 3;
- if (cnt > 0)
- czone->inactive_over = cnt;
- }
- return ret;
-}
-
-static void
-shrink_ckrmzone(struct ckrm_zone *czone, struct scan_control *sc)
-{
- while (czone->shrink_active || czone->shrink_inactive) {
- if (czone->shrink_active) {
- sc->nr_to_scan = min(czone->shrink_active,
- (unsigned long)SWAP_CLUSTER_MAX);
- czone->shrink_active -= sc->nr_to_scan;
- refill_inactive_zone(czone, sc);
- }
- if (czone->shrink_inactive) {
- sc->nr_to_scan = min(czone->shrink_inactive,
- (unsigned long)SWAP_CLUSTER_MAX);
- czone->shrink_inactive -= sc->nr_to_scan;
- shrink_cache(czone, sc);
- if (sc->nr_to_reclaim <= 0) {
- czone->shrink_active = 0;
- czone->shrink_inactive = 0;
- break;
- }
- }
-
- throttle_vm_writeout();
- }
-}
-
-/* insert an entry to the list and sort decendently*/
-static void
-list_add_sort(struct list_head *entry, struct list_head *head)
-{
- struct ckrm_zone *czone, *new =
- list_entry(entry, struct ckrm_zone, victim_list);
- struct list_head* pos = head->next;
-
- while (pos != head) {
- czone = list_entry(pos, struct ckrm_zone, victim_list);
- if (new->shrink_weight > czone->shrink_weight) {
- __list_add(entry, pos->prev, pos);
- return;
- }
- pos = pos->next;
- }
- list_add_tail(entry, head);
- return;
-}
-
-static void
-shrink_choose_victims(struct list_head *victims,
- unsigned long nr_active, unsigned long nr_inactive)
-{
- unsigned long nr;
- struct ckrm_zone* czone;
- struct list_head *pos, *next;
-
- pos = victims->next;
- while ((pos != victims) && (nr_active || nr_inactive)) {
- czone = list_entry(pos, struct ckrm_zone, victim_list);
-
- if (nr_active && czone->active_over) {
- nr = min(nr_active, czone->active_over);
- czone->shrink_active += nr;
- czone->active_over -= nr;
- nr_active -= nr;
- }
-
- if (nr_inactive && czone->inactive_over) {
- nr = min(nr_inactive, czone->inactive_over);
- czone->shrink_inactive += nr;
- czone->inactive_over -= nr;
- nr_inactive -= nr;
- }
- pos = pos->next;
- }
-
- pos = victims->next;
- while (pos != victims) {
- czone = list_entry(pos, struct ckrm_zone, victim_list);
- next = pos->next;
- if (czone->shrink_active == 0 && czone->shrink_inactive == 0) {
- list_del_init(pos);
- ckrm_clear_shrink(czone);
- }
- pos = next;
- }
- return;
-}
+ spin_unlock(&zone->lru_lock);
-static void
-shrink_get_victims(struct zone *zone, unsigned long nr_active,
- unsigned long nr_inactive, struct list_head *victims)
-{
- struct list_head *pos;
- struct ckrm_mem_res *cls;
- struct ckrm_zone *czone;
- int zoneindex = zone_idx(zone);
-
- if (ckrm_nr_mem_classes <= 1) {
- if (ckrm_mem_root_class) {
- czone = ckrm_mem_root_class->ckrm_zone + zoneindex;
- if (!ckrm_test_set_shrink(czone)) {
- list_add(&czone->victim_list, victims);
- czone->shrink_active = nr_active;
- czone->shrink_inactive = nr_inactive;
- }
- }
- return;
- }
- spin_lock_irq(&ckrm_mem_lock);
- list_for_each_entry(cls, &ckrm_memclass_list, mcls_list) {
- czone = cls->ckrm_zone + zoneindex;
- if (ckrm_test_set_shrink(czone))
- continue;
+ __mod_page_state_zone(zone, pgrefill, pgscanned);
+ __mod_page_state(pgdeactivate, pgdeactivate);
+ local_irq_enable();
- czone->shrink_active = 0;
- czone->shrink_inactive = 0;
- czone->shrink_weight = shrink_weight(czone);
- if (czone->shrink_weight) {
- list_add_sort(&czone->victim_list, victims);
- } else {
- ckrm_clear_shrink(czone);
- }
- }
- pos = victims->next;
- while (pos != victims) {
- czone = list_entry(pos, struct ckrm_zone, victim_list);
- pos = pos->next;
- }
- shrink_choose_victims(victims, nr_active, nr_inactive);
- spin_unlock_irq(&ckrm_mem_lock);
- pos = victims->next;
- while (pos != victims) {
- czone = list_entry(pos, struct ckrm_zone, victim_list);
- pos = pos->next;
- }
+ pagevec_release(&pvec);
}
-#endif /* CONFIG_CKRM_RES_MEM */
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
{
unsigned long nr_active;
unsigned long nr_inactive;
-#ifdef CONFIG_CKRM_RES_MEM
- struct ckrm_zone *czone;
-#endif
+
+ atomic_inc(&zone->reclaim_in_progress);
/*
* Add one to `nr_to_scan' just to make sure that the kernel will
*/
zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1;
nr_active = zone->nr_scan_active;
- if (nr_active >= SWAP_CLUSTER_MAX)
+ if (nr_active >= sc->swap_cluster_max)
zone->nr_scan_active = 0;
else
nr_active = 0;
zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1;
nr_inactive = zone->nr_scan_inactive;
- if (nr_inactive >= SWAP_CLUSTER_MAX)
+ if (nr_inactive >= sc->swap_cluster_max)
zone->nr_scan_inactive = 0;
else
nr_inactive = 0;
- sc->nr_to_reclaim = SWAP_CLUSTER_MAX;
-
-#ifdef CONFIG_CKRM_RES_MEM
- if (nr_active || nr_inactive) {
- struct list_head *pos, *next;
- LIST_HEAD(victims);
-
- shrink_get_victims(zone, nr_active, nr_inactive, &victims);
- pos = victims.next;
- while (pos != &victims) {
- czone = list_entry(pos, struct ckrm_zone, victim_list);
- next = pos->next;
- list_del_init(pos);
- ckrm_clear_shrink(czone);
- sc->nr_to_reclaim = czone->shrink_inactive;
- shrink_ckrmzone(czone, sc);
- pos = next;
- }
- }
-#else
while (nr_active || nr_inactive) {
if (nr_active) {
sc->nr_to_scan = min(nr_active,
- (unsigned long)SWAP_CLUSTER_MAX);
+ (unsigned long)sc->swap_cluster_max);
nr_active -= sc->nr_to_scan;
refill_inactive_zone(zone, sc);
}
if (nr_inactive) {
sc->nr_to_scan = min(nr_inactive,
- (unsigned long)SWAP_CLUSTER_MAX);
+ (unsigned long)sc->swap_cluster_max);
nr_inactive -= sc->nr_to_scan;
shrink_cache(zone, sc);
- if (sc->nr_to_reclaim <= 0)
- break;
}
}
-#endif
-}
-
-#ifdef CONFIG_CKRM_RES_MEM
-// This function needs to be given more thought.
-// Shrink the class to be at shrink_to%" of its limit
-static void
-ckrm_shrink_class(struct ckrm_mem_res *cls)
-{
- struct scan_control sc;
- struct zone *zone;
- int zindex = 0, cnt, act_credit = 0, inact_credit = 0;
- int shrink_to = ckrm_mem_get_shrink_to();
-
- sc.nr_mapped = read_page_state(nr_mapped);
- sc.nr_scanned = 0;
- sc.nr_reclaimed = 0;
- sc.priority = 0; // always very high priority
-
- check_memclass(cls, "bef_shnk_cls");
- for_each_zone(zone) {
- int zone_total, zone_limit, active_limit,
- inactive_limit, clszone_limit;
- struct ckrm_zone *czone;
- u64 temp;
-
- czone = &cls->ckrm_zone[zindex];
- if (ckrm_test_set_shrink(czone))
- continue;
-
- zone->temp_priority = zone->prev_priority;
- zone->prev_priority = sc.priority;
- zone_total = zone->nr_active + zone->nr_inactive
- + zone->free_pages;
+ throttle_vm_writeout();
- temp = (u64) cls->pg_limit * zone_total;
- do_div(temp, ckrm_tot_lru_pages);
- zone_limit = (int) temp;
- clszone_limit = (shrink_to * zone_limit) / 100;
- active_limit = (2 * clszone_limit) / 3; // 2/3rd in active list
- inactive_limit = clszone_limit / 3; // 1/3rd in inactive list
-
- czone->shrink_active = 0;
- cnt = czone->nr_active + act_credit - active_limit;
- if (cnt > 0) {
- czone->shrink_active = (unsigned long) cnt;
- } else {
- act_credit += cnt;
- }
-
- czone->shrink_inactive = 0;
- cnt = czone->shrink_active + inact_credit +
- (czone->nr_inactive - inactive_limit);
- if (cnt > 0) {
- czone->shrink_inactive = (unsigned long) cnt;
- } else {
- inact_credit += cnt;
- }
-
-
- if (czone->shrink_active || czone->shrink_inactive) {
- sc.nr_to_reclaim = czone->shrink_inactive;
- shrink_ckrmzone(czone, &sc);
- }
- zone->prev_priority = zone->temp_priority;
- zindex++;
- ckrm_clear_shrink(czone);
- }
- check_memclass(cls, "aft_shnk_cls");
-}
-
-static void
-ckrm_shrink_classes(void)
-{
- struct ckrm_mem_res *cls;
-
- spin_lock_irq(&ckrm_mem_lock);
- while (!ckrm_shrink_list_empty()) {
- cls = list_entry(ckrm_shrink_list.next, struct ckrm_mem_res,
- shrink_list);
- list_del(&cls->shrink_list);
- cls->flags &= ~MEM_AT_LIMIT;
- spin_unlock_irq(&ckrm_mem_lock);
- ckrm_shrink_class(cls);
- spin_lock_irq(&ckrm_mem_lock);
- }
- spin_unlock_irq(&ckrm_mem_lock);
+ atomic_dec(&zone->reclaim_in_progress);
}
-#else
-#define ckrm_shrink_classes() do { } while(0)
-#endif
-
/*
* This is the direct reclaim path, for page-allocating processes. We only
* try to reclaim pages from zones which will satisfy the caller's allocation
for (i = 0; zones[i] != NULL; i++) {
struct zone *zone = zones[i];
- if (zone->present_pages == 0)
+ if (!populated_zone(zone))
+ continue;
+
+ if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
continue;
zone->temp_priority = sc->priority;
* holds filesystem locks which prevent writeout this might not work, and the
* allocation attempt will fail.
*/
-int try_to_free_pages(struct zone **zones,
- unsigned int gfp_mask, unsigned int order)
+int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
{
int priority;
int ret = 0;
int i;
sc.gfp_mask = gfp_mask;
- sc.may_writepage = 0;
+ sc.may_writepage = !laptop_mode;
+ sc.may_swap = 1;
inc_page_state(allocstall);
for (i = 0; zones[i] != NULL; i++) {
struct zone *zone = zones[i];
+ if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+ continue;
+
zone->temp_priority = DEF_PRIORITY;
lru_pages += zone->nr_active + zone->nr_inactive;
}
sc.nr_scanned = 0;
sc.nr_reclaimed = 0;
sc.priority = priority;
+ sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+ if (!priority)
+ disable_swap_token();
shrink_caches(zones, &sc);
shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
if (reclaim_state) {
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
}
- if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) {
+ total_scanned += sc.nr_scanned;
+ total_reclaimed += sc.nr_reclaimed;
+ if (total_reclaimed >= sc.swap_cluster_max) {
ret = 1;
goto out;
}
- total_scanned += sc.nr_scanned;
- total_reclaimed += sc.nr_reclaimed;
/*
* Try to write back as many pages as we just scanned. This
* that's undesirable in laptop mode, where we *want* lumpy
* writeout. So in laptop mode, write out the whole world.
*/
- if (total_scanned > SWAP_CLUSTER_MAX + SWAP_CLUSTER_MAX/2) {
- wakeup_bdflush(laptop_mode ? 0 : total_scanned);
+ if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) {
+ wakeup_pdflush(laptop_mode ? 0 : total_scanned);
sc.may_writepage = 1;
}
if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
blk_congestion_wait(WRITE, HZ/10);
}
- if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
- out_of_memory(gfp_mask);
out:
- for (i = 0; zones[i] != 0; i++)
- zones[i]->prev_priority = zones[i]->temp_priority;
+ for (i = 0; zones[i] != 0; i++) {
+ struct zone *zone = zones[i];
+
+ if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+ continue;
+
+ zone->prev_priority = zone->temp_priority;
+ }
return ret;
}
* the page allocator fallback scheme to ensure that aging of pages is balanced
* across the zones.
*/
-static int balance_pgdat(pg_data_t *pgdat, int nr_pages)
+static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order)
{
int to_free = nr_pages;
int all_zones_ok;
total_scanned = 0;
total_reclaimed = 0;
sc.gfp_mask = GFP_KERNEL;
- sc.may_writepage = 0;
+ sc.may_writepage = !laptop_mode;
+ sc.may_swap = 1;
sc.nr_mapped = read_page_state(nr_mapped);
inc_page_state(pageoutrun);
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long lru_pages = 0;
+ /* The swap token gets in the way of swapout... */
+ if (!priority)
+ disable_swap_token();
+
all_zones_ok = 1;
if (nr_pages == 0) {
for (i = pgdat->nr_zones - 1; i >= 0; i--) {
struct zone *zone = pgdat->node_zones + i;
- if (zone->present_pages == 0)
+ if (!populated_zone(zone))
continue;
if (zone->all_unreclaimable &&
priority != DEF_PRIORITY)
continue;
- if (zone->free_pages <= zone->pages_high) {
+ if (!zone_watermark_ok(zone, order,
+ zone->pages_high, 0, 0)) {
end_zone = i;
goto scan;
}
*/
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
+ int nr_slab;
- if (zone->present_pages == 0)
+ if (!populated_zone(zone))
continue;
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue;
if (nr_pages == 0) { /* Not software suspend */
- if (zone->free_pages <= zone->pages_high)
+ if (!zone_watermark_ok(zone, order,
+ zone->pages_high, end_zone, 0))
all_zones_ok = 0;
}
zone->temp_priority = priority;
sc.nr_scanned = 0;
sc.nr_reclaimed = 0;
sc.priority = priority;
+ sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
shrink_zone(zone, &sc);
reclaim_state->reclaimed_slab = 0;
- shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages);
+ nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
+ lru_pages);
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
total_reclaimed += sc.nr_reclaimed;
- total_scanned += sc.nr_scanned;
+ total_scanned += sc.nr_scanned;
if (zone->all_unreclaimable)
continue;
- if (zone->pages_scanned >= (zone->nr_active +
- zone->nr_inactive) * 4)
+ if (nr_slab == 0 && zone->pages_scanned >=
+ (zone->nr_active + zone->nr_inactive) * 4)
zone->all_unreclaimable = 1;
/*
* If we've done a decent amount of scanning and
* matches the direct reclaim path behaviour in terms of impact
* on zone->*_priority.
*/
- if (total_reclaimed >= SWAP_CLUSTER_MAX)
+ if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages))
break;
}
out:
*/
static int kswapd(void *p)
{
+ unsigned long order;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
DEFINE_WAIT(wait);
* us from recursively trying to free more memory as we're
* trying to free the first piece of memory in the first place).
*/
- tsk->flags |= PF_MEMALLOC|PF_KSWAPD;
+ tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+ order = 0;
for ( ; ; ) {
- if (current->flags & PF_FREEZE)
- refrigerator(PF_FREEZE);
+ unsigned long new_order;
+
+ try_to_freeze();
+
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
- schedule();
+ new_order = pgdat->kswapd_max_order;
+ pgdat->kswapd_max_order = 0;
+ if (order < new_order) {
+ /*
+ * Don't sleep if someone wants a larger 'order'
+ * allocation
+ */
+ order = new_order;
+ } else {
+ schedule();
+ order = pgdat->kswapd_max_order;
+ }
finish_wait(&pgdat->kswapd_wait, &wait);
- if (!ckrm_shrink_list_empty())
- ckrm_shrink_classes();
- else
- balance_pgdat(pgdat, 0);
+ balance_pgdat(pgdat, 0, order);
}
return 0;
}
/*
* A zone is low on free memory, so wake its kswapd task to service it.
*/
-void wakeup_kswapd(struct zone *zone)
+void wakeup_kswapd(struct zone *zone, int order)
{
- if (zone->present_pages == 0)
+ pg_data_t *pgdat;
+
+ if (!populated_zone(zone))
return;
- if ((zone->free_pages > zone->pages_low) && ckrm_shrink_list_empty())
+
+ pgdat = zone->zone_pgdat;
+ if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
+ return;
+ if (pgdat->kswapd_max_order < order)
+ pgdat->kswapd_max_order = order;
+ if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
return;
- if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
+ if (!waitqueue_active(&pgdat->kswapd_wait))
return;
- wake_up_interruptible(&zone->zone_pgdat->kswapd_wait);
+ wake_up_interruptible(&pgdat->kswapd_wait);
}
#ifdef CONFIG_PM
current->reclaim_state = &reclaim_state;
for_each_pgdat(pgdat) {
int freed;
- freed = balance_pgdat(pgdat, nr_to_free);
+ freed = balance_pgdat(pgdat, nr_to_free, 0);
ret += freed;
nr_to_free -= freed;
if (nr_to_free <= 0)
swap_setup();
for_each_pgdat(pgdat)
pgdat->kswapd
- = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL));
+ = find_task_by_real_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL));
total_memory = nr_free_pagecache_pages();
hotcpu_notifier(cpu_callback, 0);
return 0;
}
module_init(kswapd_init)
+
+#ifdef CONFIG_NUMA
+/*
+ * Zone reclaim mode
+ *
+ * If non-zero call zone_reclaim when the number of free pages falls below
+ * the watermarks.
+ *
+ * In the future we may add flags to the mode. However, the page allocator
+ * should only have to check that zone_reclaim_mode != 0 before calling
+ * zone_reclaim().
+ */
+int zone_reclaim_mode __read_mostly;
+
+#define RECLAIM_OFF 0
+#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */
+#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
+#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
+#define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */
+
+/*
+ * Mininum time between zone reclaim scans
+ */
+int zone_reclaim_interval __read_mostly = 30*HZ;
+
+/*
+ * Priority for ZONE_RECLAIM. This determines the fraction of pages
+ * of a node considered for each zone_reclaim. 4 scans 1/16th of
+ * a zone.
+ */
+#define ZONE_RECLAIM_PRIORITY 4
+
+/*
+ * Try to free up some pages from this zone through reclaim.
+ */
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+ int nr_pages;
+ struct task_struct *p = current;
+ struct reclaim_state reclaim_state;
+ struct scan_control sc;
+ cpumask_t mask;
+ int node_id;
+
+ if (time_before(jiffies,
+ zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
+ return 0;
+
+ if (!(gfp_mask & __GFP_WAIT) ||
+ zone->all_unreclaimable ||
+ atomic_read(&zone->reclaim_in_progress) > 0 ||
+ (p->flags & PF_MEMALLOC))
+ return 0;
+
+ node_id = zone->zone_pgdat->node_id;
+ mask = node_to_cpumask(node_id);
+ if (!cpus_empty(mask) && node_id != numa_node_id())
+ return 0;
+
+ sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE);
+ sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);
+ sc.nr_scanned = 0;
+ sc.nr_reclaimed = 0;
+ sc.priority = ZONE_RECLAIM_PRIORITY + 1;
+ sc.nr_mapped = read_page_state(nr_mapped);
+ sc.gfp_mask = gfp_mask;
+
+ disable_swap_token();
+
+ nr_pages = 1 << order;
+ if (nr_pages > SWAP_CLUSTER_MAX)
+ sc.swap_cluster_max = nr_pages;
+ else
+ sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+
+ cond_resched();
+ /*
+ * We need to be able to allocate from the reserves for RECLAIM_SWAP
+ * and we also need to be able to write out pages for RECLAIM_WRITE
+ * and RECLAIM_SWAP.
+ */
+ p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
+ reclaim_state.reclaimed_slab = 0;
+ p->reclaim_state = &reclaim_state;
+
+ /*
+ * Free memory by calling shrink zone with increasing priorities
+ * until we have enough memory freed.
+ */
+ do {
+ sc.priority--;
+ shrink_zone(zone, &sc);
+
+ } while (sc.nr_reclaimed < nr_pages && sc.priority > 0);
+
+ if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
+ /*
+ * shrink_slab does not currently allow us to determine
+ * how many pages were freed in the zone. So we just
+ * shake the slab and then go offnode for a single allocation.
+ *
+ * shrink_slab will free memory on all zones and may take
+ * a long time.
+ */
+ shrink_slab(sc.nr_scanned, gfp_mask, order);
+ }
+
+ p->reclaim_state = NULL;
+ current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
+
+ if (sc.nr_reclaimed == 0)
+ zone->last_unsuccessful_zone_reclaim = jiffies;
+
+ return sc.nr_reclaimed >= nr_pages;
+}
+#endif
+