X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=mm%2Fvmscan.c;h=2dac95f387b5e425cde6db25a35b8ee00d37745d;hb=987b0145d94eecf292d8b301228356f44611ab7c;hp=840092d46a90244a5a0ad23aedde2bdf030ab928;hpb=f7ed79d23a47594e7834d66a8f14449796d4f3e6;p=linux-2.6.git

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 840092d46..2dac95f38 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -33,21 +33,39 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
-#include "internal.h"
 
+/* possible outcome of pageout() */
+typedef enum {
+	/* failed to write page out, page is locked */
+	PAGE_KEEP,
+	/* move page to the active list, page is locked */
+	PAGE_ACTIVATE,
+	/* page has been sent to the disk successfully, page is unlocked */
+	PAGE_SUCCESS,
+	/* page is clean and locked */
+	PAGE_CLEAN,
+} pageout_t;
 
 struct scan_control {
+	/* Ask refill_inactive_zone, or shrink_cache to scan this many pages */
+	unsigned long nr_to_scan;
+
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
 
+	/* Incremented by the number of pages reclaimed */
+	unsigned long nr_reclaimed;
+
+	unsigned long nr_mapped;	/* From page_state */
+
+	/* Ask shrink_caches, or shrink_zone to scan at this priority */
+	unsigned int priority;
+
 	/* This context's GFP mask */
 	gfp_t gfp_mask;
 
@@ -165,11 +183,10 @@ EXPORT_SYMBOL(remove_shrinker);
  *
  * Returns the number of slab objects which we shrunk.
  */
-unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
-			unsigned long lru_pages)
+int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages)
 {
 	struct shrinker *shrinker;
-	unsigned long ret = 0;
+	int ret = 0;
 
 	if (scanned == 0)
 		scanned = SWAP_CLUSTER_MAX;
@@ -289,10 +306,9 @@ static void handle_write_error(struct address_space *mapping,
 }
 
 /*
- * pageout is called by shrink_page_list() for each dirty page.
- * Calls ->writepage().
+ * pageout is called by shrink_list() for each dirty page. Calls ->writepage().
  */
-pageout_t pageout(struct page *page, struct address_space *mapping)
+static pageout_t pageout(struct page *page, struct address_space *mapping)
 {
 	/*
 	 * If the page is dirty, only perform writeback if that write
@@ -360,7 +376,7 @@ pageout_t pageout(struct page *page, struct address_space *mapping)
 	return PAGE_CLEAN;
 }
 
-int remove_mapping(struct address_space *mapping, struct page *page)
+static int remove_mapping(struct address_space *mapping, struct page *page)
 {
 	if (!mapping)
 		return 0;		/* truncate got there first */
@@ -398,15 +414,14 @@ cannot_free:
 }
 
 /*
- * shrink_page_list() returns the number of reclaimed pages
+ * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
 */
-static unsigned long shrink_page_list(struct list_head *page_list,
-					struct scan_control *sc)
+static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 {
 	LIST_HEAD(ret_pages);
 	struct pagevec freed_pvec;
 	int pgactivate = 0;
-	unsigned long nr_reclaimed = 0;
+	int reclaimed = 0;
 
 	cond_resched();
 
@@ -449,9 +464,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * Anonymous process memory has backing store?
 		 * Try to allocate it some swap space here.
 		 */
-		if (PageAnon(page) && !PageSwapCache(page))
+		if (PageAnon(page) && !PageSwapCache(page)) {
+			if (!sc->may_swap)
+				goto keep_locked;
 			if (!add_to_swap(page, GFP_ATOMIC))
 				goto activate_locked;
+		}
 #endif /* CONFIG_SWAP */
 
 		mapping = page_mapping(page);
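/*
 * Illustrative sketch, not part of the patch: how a caller is expected to
 * dispatch on the pageout_t values introduced above. The real dispatch
 * sites are shrink_list() here and swap_page()/migrate_pages() below; the
 * wrapper name handle_dirty_page() is hypothetical.
 */
static int handle_dirty_page(struct page *page, struct address_space *mapping)
{
	switch (pageout(page, mapping)) {
	case PAGE_KEEP:		/* write failed; page stays put, still locked */
	case PAGE_ACTIVATE:	/* move to the active list, still locked */
		return -EBUSY;
	case PAGE_SUCCESS:	/* I/O started; pageout() unlocked the page */
		return -EINPROGRESS;
	case PAGE_CLEAN:	/* clean and locked; safe to try to free */
		return 0;
	}
	return -EINVAL;
}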
@@ -463,6 +481,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * processes. Try to unmap it here.
 		 */
 		if (page_mapped(page) && mapping) {
+			/*
+			 * No unmapping if we do not swap
+			 */
+			if (!sc->may_swap)
+				goto keep_locked;
+
 			switch (try_to_unmap(page, 0)) {
 			case SWAP_FAIL:
 				goto activate_locked;
@@ -537,7 +561,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 free_it:
 		unlock_page(page);
-		nr_reclaimed++;
+		reclaimed++;
 		if (!pagevec_add(&freed_pvec, page))
 			__pagevec_release_nonlru(&freed_pvec);
 		continue;
@@ -555,9 +579,495 @@ keep:
 	if (pagevec_count(&freed_pvec))
 		__pagevec_release_nonlru(&freed_pvec);
 	mod_page_state(pgactivate, pgactivate);
-	return nr_reclaimed;
+	sc->nr_reclaimed += reclaimed;
+	return reclaimed;
+}
+
+#ifdef CONFIG_MIGRATION
+static inline void move_to_lru(struct page *page)
+{
+	list_del(&page->lru);
+	if (PageActive(page)) {
+		/*
+		 * lru_cache_add_active checks that
+		 * the PG_active bit is off.
+		 */
+		ClearPageActive(page);
+		lru_cache_add_active(page);
+	} else {
+		lru_cache_add(page);
+	}
+	put_page(page);
+}
+
+/*
+ * Add isolated pages on the list back to the LRU.
+ *
+ * returns the number of pages put back.
+ */
+int putback_lru_pages(struct list_head *l)
+{
+	struct page *page;
+	struct page *page2;
+	int count = 0;
+
+	list_for_each_entry_safe(page, page2, l, lru) {
+		move_to_lru(page);
+		count++;
+	}
+	return count;
+}
+
+/*
+ * Non migratable page
+ */
+int fail_migrate_page(struct page *newpage, struct page *page)
+{
+	return -EIO;
+}
+EXPORT_SYMBOL(fail_migrate_page);
+
+/*
+ * swapout a single page
+ * page is locked upon entry, unlocked on exit
+ */
+static int swap_page(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+
+	if (page_mapped(page) && mapping)
+		if (try_to_unmap(page, 1) != SWAP_SUCCESS)
+			goto unlock_retry;
+
+	if (PageDirty(page)) {
+		/* Page is dirty, try to write it out here */
+		switch(pageout(page, mapping)) {
+		case PAGE_KEEP:
+		case PAGE_ACTIVATE:
+			goto unlock_retry;
+
+		case PAGE_SUCCESS:
+			goto retry;
+
+		case PAGE_CLEAN:
+			; /* try to free the page below */
+		}
+	}
+
+	if (PagePrivate(page)) {
+		if (!try_to_release_page(page, GFP_KERNEL) ||
+		    (!mapping && page_count(page) == 1))
+			goto unlock_retry;
+	}
+
+	if (remove_mapping(mapping, page)) {
+		/* Success */
+		unlock_page(page);
+		return 0;
+	}
+
+unlock_retry:
+	unlock_page(page);
+
+retry:
+	return -EAGAIN;
+}
+EXPORT_SYMBOL(swap_page);
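/*
 * Illustrative sketch, not part of the patch: swap_page()'s calling
 * convention. The page must be locked on entry; on every failure path it
 * is unlocked and -EAGAIN is returned, so a caller can simply requeue and
 * try again later. The helper name try_swap_out() is hypothetical.
 */
static int try_swap_out(struct page *page)
{
	if (TestSetPageLocked(page))
		return -EAGAIN;		/* lock not acquired, retry later */
	return swap_page(page);		/* unlocks the page on all paths */
}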
+
+/*
+ * Page migration was first developed in the context of the memory hotplug
+ * project. The main authors of the migration code are:
+ *
+ * IWAMOTO Toshihiro
+ * Hirokazu Takahashi
+ * Dave Hansen
+ * Christoph Lameter
+ */
+
+/*
+ * Remove references for a page and establish the new page with the correct
+ * basic settings to be able to stop accesses to the page.
+ */
+int migrate_page_remove_references(struct page *newpage,
+				struct page *page, int nr_refs)
+{
+	struct address_space *mapping = page_mapping(page);
+	struct page **radix_pointer;
+
+	/*
+	 * Avoid doing any of the following work if the page count
+	 * indicates that the page is in use or truncate has removed
+	 * the page.
+	 */
+	if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
+		return -EAGAIN;
+
+	/*
+	 * Establish swap ptes for anonymous pages or destroy pte
+	 * maps for files.
+	 *
+	 * In order to reestablish file backed mappings the fault handlers
+	 * will take the radix tree_lock which may then be used to stop
+	 * processes from accessing this page until the new page is ready.
+	 *
+	 * A process accessing via a swap pte (an anonymous page) will take a
+	 * page_lock on the old page which will block the process until the
+	 * migration attempt is complete. At that time the PageSwapCache bit
+	 * will be examined. If the page was migrated then the PageSwapCache
+	 * bit will be clear and the operation to retrieve the page will be
+	 * retried which will find the new page in the radix tree. Then a new
+	 * direct mapping may be generated based on the radix tree contents.
+	 *
+	 * If the page was not migrated then the PageSwapCache bit
+	 * is still set and the operation may continue.
+	 */
+	if (try_to_unmap(page, 1) == SWAP_FAIL)
+		/* A vma has VM_LOCKED set -> Permanent failure */
+		return -EPERM;
+
+	/*
+	 * Give up if we were unable to remove all mappings.
+	 */
+	if (page_mapcount(page))
+		return -EAGAIN;
+
+	write_lock_irq(&mapping->tree_lock);
+
+	radix_pointer = (struct page **)radix_tree_lookup_slot(
+						&mapping->page_tree,
+						page_index(page));
+
+	if (!page_mapping(page) || page_count(page) != nr_refs ||
+			*radix_pointer != page) {
+		write_unlock_irq(&mapping->tree_lock);
+		return -EAGAIN;
+	}
+
+	/*
+	 * Now we know that no one else is looking at the page.
+	 *
+	 * Certain minimal information about a page must be available
+	 * in order for other subsystems to properly handle the page if they
+	 * find it through the radix tree update before we are finished
+	 * copying the page.
+	 */
+	get_page(newpage);
+	newpage->index = page->index;
+	newpage->mapping = page->mapping;
+	if (PageSwapCache(page)) {
+		SetPageSwapCache(newpage);
+		set_page_private(newpage, page_private(page));
+	}
+
+	*radix_pointer = newpage;
+	__put_page(page);
+	write_unlock_irq(&mapping->tree_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(migrate_page_remove_references);
+
+/*
+ * Copy the page to its new location
+ */
+void migrate_page_copy(struct page *newpage, struct page *page)
+{
+	copy_highpage(newpage, page);
+
+	if (PageError(page))
+		SetPageError(newpage);
+	if (PageReferenced(page))
+		SetPageReferenced(newpage);
+	if (PageUptodate(page))
+		SetPageUptodate(newpage);
+	if (PageActive(page))
+		SetPageActive(newpage);
+	if (PageChecked(page))
+		SetPageChecked(newpage);
+	if (PageMappedToDisk(page))
+		SetPageMappedToDisk(newpage);
+
+	if (PageDirty(page)) {
+		clear_page_dirty_for_io(page);
+		set_page_dirty(newpage);
+	}
+
+	ClearPageSwapCache(page);
+	ClearPageActive(page);
+	ClearPagePrivate(page);
+	set_page_private(page, 0);
+	page->mapping = NULL;
+
+	/*
+	 * If any waiters have accumulated on the new page then
+	 * wake them up.
+	 */
+	if (PageWriteback(newpage))
+		end_page_writeback(newpage);
+}
+EXPORT_SYMBOL(migrate_page_copy);
+
+/*
+ * Common logic to directly migrate a single page suitable for
+ * pages that do not use PagePrivate.
+ *
+ * Pages are locked upon entry and exit.
+ */
+int migrate_page(struct page *newpage, struct page *page)
+{
+	int rc;
+
+	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
+
+	rc = migrate_page_remove_references(newpage, page, 2);
+
+	if (rc)
+		return rc;
+
+	migrate_page_copy(newpage, page);
+
+	/*
+	 * Remove auxiliary swap entries and replace
+	 * them with real ptes.
+	 *
+	 * Note that a real pte entry will allow processes that are not
+	 * waiting on the page lock to use the new page via the page tables
+	 * before the new page is unlocked.
+	 */
+	remove_from_swap(newpage);
+	return 0;
+}
+EXPORT_SYMBOL(migrate_page);
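/*
 * Illustrative sketch, not part of the patch: a filesystem whose pages
 * carry no private (buffer) state can point its migratepage handler
 * straight at migrate_page(); migrate_pages() below prefers this a_ops
 * hook over the fallback path. "example_aops" is hypothetical.
 */
static struct address_space_operations example_aops = {
	.migratepage	= migrate_page,
};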
+
+/*
+ * migrate_pages
+ *
+ * Two lists are passed to this function. The first list
+ * contains the pages isolated from the LRU to be migrated.
+ * The second list contains new pages that the pages isolated
+ * can be moved to. If the second list is NULL then all
+ * pages are swapped out.
+ *
+ * The function returns after 10 attempts or if no pages
+ * are movable anymore because "to" has become empty
+ * or no retryable pages exist anymore.
+ *
+ * Return: Number of pages not migrated when "to" ran empty.
+ */
+int migrate_pages(struct list_head *from, struct list_head *to,
+		  struct list_head *moved, struct list_head *failed)
+{
+	int retry;
+	int nr_failed = 0;
+	int pass = 0;
+	struct page *page;
+	struct page *page2;
+	int swapwrite = current->flags & PF_SWAPWRITE;
+	int rc;
+
+	if (!swapwrite)
+		current->flags |= PF_SWAPWRITE;
+
+redo:
+	retry = 0;
+
+	list_for_each_entry_safe(page, page2, from, lru) {
+		struct page *newpage = NULL;
+		struct address_space *mapping;
+
+		cond_resched();
+
+		rc = 0;
+		if (page_count(page) == 1)
+			/* page was freed from under us. So we are done. */
+			goto next;
+
+		if (to && list_empty(to))
+			break;
+
+		/*
+		 * Skip locked pages during the first two passes to give the
+		 * functions holding the lock time to release the page. Later
+		 * we use lock_page() to have a higher chance of acquiring
+		 * the lock.
+		 */
+		rc = -EAGAIN;
+		if (pass > 2)
+			lock_page(page);
+		else
+			if (TestSetPageLocked(page))
+				goto next;
+
+		/*
+		 * Only wait on writeback if we have already done a pass where
+		 * we may have triggered writeouts for lots of pages.
+		 */
+		if (pass > 0) {
+			wait_on_page_writeback(page);
+		} else {
+			if (PageWriteback(page))
+				goto unlock_page;
+		}
+
+		/*
+		 * Anonymous pages must have swap cache references otherwise
+		 * the information contained in the page maps cannot be
+		 * preserved.
+		 */
+		if (PageAnon(page) && !PageSwapCache(page)) {
+			if (!add_to_swap(page, GFP_KERNEL)) {
+				rc = -ENOMEM;
+				goto unlock_page;
+			}
+		}
+
+		if (!to) {
+			rc = swap_page(page);
+			goto next;
+		}
+
+		newpage = lru_to_page(to);
+		lock_page(newpage);
+
+		/*
+		 * Pages are properly locked and writeback is complete.
+		 * Try to migrate the page.
+		 */
+		mapping = page_mapping(page);
+		if (!mapping)
+			goto unlock_both;
+
+		if (mapping->a_ops->migratepage) {
+			/*
+			 * Most pages have a mapping and most filesystems
+			 * should provide a migration function. Anonymous
+			 * pages are part of swap space which also has its
+			 * own migration function. This is the most common
+			 * path for page migration.
+			 */
+			rc = mapping->a_ops->migratepage(newpage, page);
+			goto unlock_both;
+		}
+
+		/* Make sure the dirty bit is up to date */
+		if (try_to_unmap(page, 1) == SWAP_FAIL) {
+			rc = -EPERM;
+			goto unlock_both;
+		}
+
+		if (page_mapcount(page)) {
+			rc = -EAGAIN;
+			goto unlock_both;
+		}
+
+		/*
+		 * Default handling if a filesystem does not provide
+		 * a migration function. We can only migrate clean
+		 * pages so try to write out any dirty pages first.
+		 */
+		if (PageDirty(page)) {
+			switch (pageout(page, mapping)) {
+			case PAGE_KEEP:
+			case PAGE_ACTIVATE:
+				goto unlock_both;
+
+			case PAGE_SUCCESS:
+				unlock_page(newpage);
+				goto next;
+
+			case PAGE_CLEAN:
+				; /* try to migrate the page below */
+			}
+		}
+
+		/*
+		 * Buffers are managed in a filesystem specific way.
+		 * We must have no buffers or drop them.
+		 */
+		if (!page_has_buffers(page) ||
+		    try_to_release_page(page, GFP_KERNEL)) {
+			rc = migrate_page(newpage, page);
+			goto unlock_both;
+		}
+
+		/*
+		 * On early passes with mapped pages simply
+		 * retry. There may be a lock held for some
+		 * buffers that may go away. Later
+		 * swap them out.
+		 */
+		if (pass > 4) {
+			/*
+			 * Persistently unable to drop buffers. As a
+			 * measure of last resort we fall back to
+			 * swap_page().
+			 */
+			unlock_page(newpage);
+			newpage = NULL;
+			rc = swap_page(page);
+			goto next;
+		}
+
+unlock_both:
+		unlock_page(newpage);
+
+unlock_page:
+		unlock_page(page);
+
+next:
+		if (rc == -EAGAIN) {
+			retry++;
+		} else if (rc) {
+			/* Permanent failure */
+			list_move(&page->lru, failed);
+			nr_failed++;
+		} else {
+			if (newpage) {
+				/* Successful migration. Return page to LRU */
+				move_to_lru(newpage);
+			}
+			list_move(&page->lru, moved);
+		}
+	}
+
+	if (retry && pass++ < 10)
+		goto redo;
+
+	if (!swapwrite)
+		current->flags &= ~PF_SWAPWRITE;
+
+	return nr_failed + retry;
+}
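/*
 * Illustrative sketch, not part of the patch: driving migrate_pages() the
 * way its in-tree callers do. Pages are isolated onto "pagelist" first,
 * migrated towards "newlist", and both the moved and failed lists are
 * drained back to the LRU afterwards. The helper name is hypothetical.
 */
static int migrate_list_to(struct list_head *pagelist, struct list_head *newlist)
{
	LIST_HEAD(moved);
	LIST_HEAD(failed);
	int nr_failed;

	nr_failed = migrate_pages(pagelist, newlist, &moved, &failed);
	putback_lru_pages(&moved);	/* old pages whose data was moved */
	putback_lru_pages(&failed);	/* pages we could not migrate */
	return nr_failed;
}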
+
+/*
+ * Isolate one page from the LRU lists and put it on the
+ * indicated list with elevated refcount.
+ *
+ * Result:
+ *  0 = page not on LRU list
+ *  1 = page removed from LRU list and added to the specified list.
+ */
+int isolate_lru_page(struct page *page)
+{
+	int ret = 0;
+
+	if (PageLRU(page)) {
+		struct zone *zone = page_zone(page);
+		spin_lock_irq(&zone->lru_lock);
+		if (TestClearPageLRU(page)) {
+			ret = 1;
+			get_page(page);
+			if (PageActive(page))
+				del_page_from_active_list(zone, page);
+			else
+				del_page_from_inactive_list(zone, page);
+		}
+		spin_unlock_irq(&zone->lru_lock);
+	}
+
+	return ret;
+}
+#endif
+
 /*
  * zone->lru_lock is heavily contended.  Some of the functions that
  * shrink the lists perform better by taking out a batch of pages
@@ -575,35 +1085,32 @@ keep:
  *
  * returns how many pages were moved onto *@dst.
  */
-static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
-		struct list_head *src, struct list_head *dst,
-		unsigned long *scanned)
+static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
+			struct list_head *dst, int *scanned)
 {
-	unsigned long nr_taken = 0;
+	int nr_taken = 0;
 	struct page *page;
-	unsigned long scan;
+	int scan = 0;
 
-	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
-		struct list_head *target;
+	while (scan++ < nr_to_scan && !list_empty(src)) {
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
-		BUG_ON(!PageLRU(page));
-
+		if (!TestClearPageLRU(page))
+			BUG();
 		list_del(&page->lru);
-		target = src;
-		if (likely(get_page_unless_zero(page))) {
+		if (get_page_testone(page)) {
 			/*
-			 * Be careful not to clear PageLRU until after we're
-			 * sure the page is not being freed elsewhere -- the
-			 * page release code relies on it.
+			 * It is being freed elsewhere
 			 */
-			ClearPageLRU(page);
-			target = dst;
+			__put_page(page);
+			SetPageLRU(page);
+			list_add(&page->lru, src);
+			continue;
+		} else {
+			list_add(&page->lru, dst);
 			nr_taken++;
-		} /* else it is being freed elsewhere */
-
-		list_add(&page->lru, target);
+		}
 	}
 
 	*scanned = scan;
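/*
 * Illustrative sketch, not part of the patch: batched isolation under
 * zone->lru_lock, the pattern shrink_cache() below builds on top of
 * isolate_lru_pages(). The wrapper name pull_inactive_batch() is
 * hypothetical.
 */
static int pull_inactive_batch(struct zone *zone, struct list_head *batch, int nr)
{
	int scanned, taken;

	spin_lock_irq(&zone->lru_lock);
	taken = isolate_lru_pages(nr, &zone->inactive_list, batch, &scanned);
	zone->pages_scanned += scanned;
	spin_unlock_irq(&zone->lru_lock);
	return taken;	/* pages now sit on "batch" with an extra ref each */
}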
@@ -611,26 +1118,23 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 }
 
 /*
- * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
- * of reclaimed pages
+ * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
 */
-static unsigned long shrink_inactive_list(unsigned long max_scan,
-				struct zone *zone, struct scan_control *sc)
+static void shrink_cache(struct zone *zone, struct scan_control *sc)
 {
 	LIST_HEAD(page_list);
 	struct pagevec pvec;
-	unsigned long nr_scanned = 0;
-	unsigned long nr_reclaimed = 0;
+	int max_scan = sc->nr_to_scan;
 
 	pagevec_init(&pvec, 1);
 
 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
-	do {
+	while (max_scan > 0) {
 		struct page *page;
-		unsigned long nr_taken;
-		unsigned long nr_scan;
-		unsigned long nr_freed;
+		int nr_taken;
+		int nr_scan;
+		int nr_freed;
 
 		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
 					     &zone->inactive_list,
@@ -639,9 +1143,12 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		zone->pages_scanned += nr_scan;
 		spin_unlock_irq(&zone->lru_lock);
 
-		nr_scanned += nr_scan;
-		nr_freed = shrink_page_list(&page_list, sc);
-		nr_reclaimed += nr_freed;
+		if (nr_taken == 0)
+			goto done;
+
+		max_scan -= nr_scan;
+		nr_freed = shrink_list(&page_list, sc);
+
 		local_irq_disable();
 		if (current_is_kswapd()) {
 			__mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
@@ -650,17 +1157,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 			__mod_page_state_zone(zone, pgscan_direct, nr_scan);
 		__mod_page_state_zone(zone, pgsteal, nr_freed);
 
-		if (nr_taken == 0)
-			goto done;
-
 		spin_lock(&zone->lru_lock);
 		/*
 		 * Put back any unfreeable pages.
 		 */
 		while (!list_empty(&page_list)) {
 			page = lru_to_page(&page_list);
-			BUG_ON(PageLRU(page));
-			SetPageLRU(page);
+			if (TestSetPageLRU(page))
+				BUG();
 			list_del(&page->lru);
 			if (PageActive(page))
 				add_page_to_active_list(zone, page);
@@ -672,12 +1176,10 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 				spin_lock_irq(&zone->lru_lock);
 			}
 		}
-	} while (nr_scanned < max_scan);
-	spin_unlock(&zone->lru_lock);
+	}
+	spin_unlock_irq(&zone->lru_lock);
 done:
-	local_irq_enable();
 	pagevec_release(&pvec);
-	return nr_reclaimed;
 }
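/*
 * Illustrative sketch, not part of the patch: the pagevec pattern that
 * shrink_cache() uses when putting unfreeable pages back. Once the vector
 * fills up, the accumulated page references are dropped in one batch with
 * the LRU lock released. The helper name drain_to_lru() is hypothetical.
 */
static void drain_to_lru(struct zone *zone, struct list_head *pages)
{
	struct pagevec pvec;
	struct page *page;

	pagevec_init(&pvec, 1);			/* treat pages as cold */
	spin_lock_irq(&zone->lru_lock);
	while (!list_empty(pages)) {
		page = lru_to_page(pages);
		SetPageLRU(page);
		list_del(&page->lru);
		add_page_to_inactive_list(zone, page);
		if (!pagevec_add(&pvec, page)) {	/* vector is full */
			spin_unlock_irq(&zone->lru_lock);
			__pagevec_release(&pvec);	/* drop refs in bulk */
			spin_lock_irq(&zone->lru_lock);
		}
	}
	spin_unlock_irq(&zone->lru_lock);
	pagevec_release(&pvec);
}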
 
 /*
@@ -697,12 +1199,13 @@ done:
  * The downside is that we have to touch page->_count against each page.
  * But we had to alter page->flags anyway.
  */
-static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
-				struct scan_control *sc)
+static void
+refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 {
-	unsigned long pgmoved;
+	int pgmoved;
 	int pgdeactivate = 0;
-	unsigned long pgscanned;
+	int pgscanned;
+	int nr_pages = sc->nr_to_scan;
 	LIST_HEAD(l_hold);	/* The pages which were snipped off */
 	LIST_HEAD(l_inactive);	/* Pages to go onto the inactive_list */
 	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
@@ -710,7 +1213,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	struct pagevec pvec;
 	int reclaim_mapped = 0;
 
-	if (sc->may_swap) {
+	if (unlikely(sc->may_swap)) {
 		long mapped_ratio;
 		long distress;
 		long swap_tendency;
@@ -780,11 +1283,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	while (!list_empty(&l_inactive)) {
 		page = lru_to_page(&l_inactive);
 		prefetchw_prev_lru_page(page, &l_inactive, flags);
-		BUG_ON(PageLRU(page));
-		SetPageLRU(page);
-		BUG_ON(!PageActive(page));
-		ClearPageActive(page);
-
+		if (TestSetPageLRU(page))
+			BUG();
+		if (!TestClearPageActive(page))
+			BUG();
 		list_move(&page->lru, &zone->inactive_list);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
@@ -810,8 +1312,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	while (!list_empty(&l_active)) {
 		page = lru_to_page(&l_active);
 		prefetchw_prev_lru_page(page, &l_active, flags);
-		BUG_ON(PageLRU(page));
-		SetPageLRU(page);
+		if (TestSetPageLRU(page))
+			BUG();
 		BUG_ON(!PageActive(page));
 		list_move(&page->lru, &zone->active_list);
 		pgmoved++;
@@ -836,13 +1338,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
-static unsigned long shrink_zone(int priority, struct zone *zone,
-			struct scan_control *sc)
+static void
+shrink_zone(struct zone *zone, struct scan_control *sc)
 {
 	unsigned long nr_active;
 	unsigned long nr_inactive;
-	unsigned long nr_to_scan;
-	unsigned long nr_reclaimed = 0;
 
 	atomic_inc(&zone->reclaim_in_progress);
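/*
 * Context note, not part of the patch: the mapped_ratio/distress/
 * swap_tendency declarations above feed the heuristic that decides whether
 * refill_inactive_zone() may deactivate mapped pages. The calculation
 * itself is unchanged by this diff; the shape in kernels of this vintage
 * is roughly the following (details vary by release):
 *
 *	distress = 100 >> zone->prev_priority;
 *	mapped_ratio = (sc->nr_mapped * 100) / total_memory;
 *	swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
 *	if (swap_tendency >= 100)
 *		reclaim_mapped = 1;
 */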
@@ -850,14 +1350,14 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
 	 * Add one to `nr_to_scan' just to make sure that the kernel will
 	 * slowly sift through the active list.
 	 */
-	zone->nr_scan_active += (zone->nr_active >> priority) + 1;
+	zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1;
 	nr_active = zone->nr_scan_active;
 	if (nr_active >= sc->swap_cluster_max)
 		zone->nr_scan_active = 0;
 	else
 		nr_active = 0;
 
-	zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
+	zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1;
 	nr_inactive = zone->nr_scan_inactive;
 	if (nr_inactive >= sc->swap_cluster_max)
 		zone->nr_scan_inactive = 0;
@@ -866,25 +1366,23 @@
 	while (nr_active || nr_inactive) {
 		if (nr_active) {
-			nr_to_scan = min(nr_active,
+			sc->nr_to_scan = min(nr_active,
 				(unsigned long)sc->swap_cluster_max);
-			nr_active -= nr_to_scan;
-			shrink_active_list(nr_to_scan, zone, sc);
+			nr_active -= sc->nr_to_scan;
+			refill_inactive_zone(zone, sc);
 		}
 
 		if (nr_inactive) {
-			nr_to_scan = min(nr_inactive,
+			sc->nr_to_scan = min(nr_inactive,
 				(unsigned long)sc->swap_cluster_max);
-			nr_inactive -= nr_to_scan;
-			nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
-								sc);
+			nr_inactive -= sc->nr_to_scan;
+			shrink_cache(zone, sc);
 		}
 	}
 
 	throttle_vm_writeout();
 
 	atomic_dec(&zone->reclaim_in_progress);
-	return nr_reclaimed;
 }
 
 /*
@@ -903,10 +1401,9 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
  */
-static unsigned long shrink_zones(int priority, struct zone **zones,
-		struct scan_control *sc)
+static void
+shrink_caches(struct zone **zones, struct scan_control *sc)
 {
-	unsigned long nr_reclaimed = 0;
 	int i;
 
 	for (i = 0; zones[i] != NULL; i++) {
@@ -918,16 +1415,15 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
 		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
-		zone->temp_priority = priority;
-		if (zone->prev_priority > priority)
-			zone->prev_priority = priority;
+		zone->temp_priority = sc->priority;
+		if (zone->prev_priority > sc->priority)
+			zone->prev_priority = sc->priority;
 
-		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+		if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
 			continue;	/* Let kswapd poll it */
 
-		nr_reclaimed += shrink_zone(priority, zone, sc);
+		shrink_zone(zone, sc);
 	}
-	return nr_reclaimed;
 }
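/*
 * Illustrative sketch, not part of the patch: how the "(count >> priority) + 1"
 * arithmetic used by shrink_zone() grows the scan batch as priority drops.
 * Standalone userspace demo; all numbers are made up.
 */
#include <stdio.h>

int main(void)
{
	unsigned long nr_inactive = 100000;	/* hypothetical zone size */
	unsigned long nr_scan = 0;
	int priority;

	for (priority = 12; priority >= 0; priority--) {
		/* each pass queues a larger fraction of the list */
		nr_scan += (nr_inactive >> priority) + 1;
		printf("priority %2d: %lu pages queued for scanning\n",
		       priority, nr_scan);
	}
	return 0;
}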
 
 /*
@@ -943,21 +1439,19 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
  * holds filesystem locks which prevent writeout this might not work, and the
  * allocation attempt will fail.
  */
-unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
+int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 {
 	int priority;
 	int ret = 0;
-	unsigned long total_scanned = 0;
-	unsigned long nr_reclaimed = 0;
+	int total_scanned = 0, total_reclaimed = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
+	struct scan_control sc;
 	unsigned long lru_pages = 0;
 	int i;
-	struct scan_control sc = {
-		.gfp_mask = gfp_mask,
-		.may_writepage = !laptop_mode,
-		.swap_cluster_max = SWAP_CLUSTER_MAX,
-		.may_swap = 1,
-	};
+
+	sc.gfp_mask = gfp_mask;
+	sc.may_writepage = !laptop_mode;
+	sc.may_swap = 1;
 
 	inc_page_state(allocstall);
 
@@ -974,16 +1468,20 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
 		sc.nr_mapped = read_page_state(nr_mapped);
 		sc.nr_scanned = 0;
+		sc.nr_reclaimed = 0;
+		sc.priority = priority;
+		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
 		if (!priority)
 			disable_swap_token();
-		nr_reclaimed += shrink_zones(priority, zones, &sc);
+		shrink_caches(zones, &sc);
 		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
 		if (reclaim_state) {
-			nr_reclaimed += reclaim_state->reclaimed_slab;
+			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
 			reclaim_state->reclaimed_slab = 0;
 		}
 		total_scanned += sc.nr_scanned;
-		if (nr_reclaimed >= sc.swap_cluster_max) {
+		total_reclaimed += sc.nr_reclaimed;
+		if (total_reclaimed >= sc.swap_cluster_max) {
 			ret = 1;
 			goto out;
 		}
@@ -995,8 +1493,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		 * that's undesirable in laptop mode, where we *want* lumpy
 		 * writeout.  So in laptop mode, write out the whole world.
 		 */
-		if (total_scanned > sc.swap_cluster_max +
-					sc.swap_cluster_max / 2) {
+		if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) {
 			wakeup_pdflush(laptop_mode ? 0 : total_scanned);
 			sc.may_writepage = 1;
 		}
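/*
 * Illustrative sketch, not part of the patch: the shape of a direct-reclaim
 * call site (simplified from the __alloc_pages() slow path of this era).
 * The helper name reclaim_some_pages() is hypothetical; retry and error
 * handling are omitted.
 */
static int reclaim_some_pages(struct zone **zones, gfp_t gfp_mask)
{
	struct reclaim_state reclaim_state = { .reclaimed_slab = 0 };
	int progress;

	current->flags |= PF_MEMALLOC;		/* allow dipping into reserves */
	current->reclaim_state = &reclaim_state;
	progress = try_to_free_pages(zones, gfp_mask);
	current->reclaim_state = NULL;
	current->flags &= ~PF_MEMALLOC;
	return progress;			/* 1 if enough pages were freed */
}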
@@ -1042,26 +1539,22 @@ out:
  * the page allocator fallback scheme to ensure that aging of pages is balanced
  * across the zones.
  */
-static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
-				int order)
+static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order)
 {
-	unsigned long to_free = nr_pages;
+	int to_free = nr_pages;
 	int all_zones_ok;
 	int priority;
 	int i;
-	unsigned long total_scanned;
-	unsigned long nr_reclaimed;
+	int total_scanned, total_reclaimed;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
-	struct scan_control sc = {
-		.gfp_mask = GFP_KERNEL,
-		.may_swap = 1,
-		.swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX,
-	};
+	struct scan_control sc;
 
 loop_again:
 	total_scanned = 0;
-	nr_reclaimed = 0;
+	total_reclaimed = 0;
+	sc.gfp_mask = GFP_KERNEL;
 	sc.may_writepage = !laptop_mode;
+	sc.may_swap = 1;
 	sc.nr_mapped = read_page_state(nr_mapped);
 
 	inc_page_state(pageoutrun);
@@ -1142,11 +1635,15 @@ scan:
 			if (zone->prev_priority > priority)
 				zone->prev_priority = priority;
 			sc.nr_scanned = 0;
-			nr_reclaimed += shrink_zone(priority, zone, &sc);
+			sc.nr_reclaimed = 0;
+			sc.priority = priority;
+			sc.swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX;
+			shrink_zone(zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
 						lru_pages);
-			nr_reclaimed += reclaim_state->reclaimed_slab;
+			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+			total_reclaimed += sc.nr_reclaimed;
 			total_scanned += sc.nr_scanned;
 			if (zone->all_unreclaimable)
 				continue;
@@ -1159,10 +1656,10 @@ scan:
 			 * even in laptop mode
 			 */
 			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
-			    total_scanned > nr_reclaimed + nr_reclaimed / 2)
+			    total_scanned > total_reclaimed+total_reclaimed/2)
 				sc.may_writepage = 1;
 		}
-		if (nr_pages && to_free > nr_reclaimed)
+		if (nr_pages && to_free > total_reclaimed)
 			continue;	/* swsusp: need to do more work */
 		if (all_zones_ok)
 			break;		/* kswapd: all done */
@@ -1179,7 +1676,7 @@ scan:
 		 * matches the direct reclaim path behaviour in terms of impact
 		 * on zone->*_priority.
 		 */
-		if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages)
+		if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages))
 			break;
 	}
 out:
@@ -1193,7 +1690,7 @@ out:
 		goto loop_again;
 	}
 
-	return nr_reclaimed;
+	return total_reclaimed;
 }
 
 /*
@@ -1293,31 +1790,24 @@ void wakeup_kswapd(struct zone *zone, int order)
  * Try to free `nr_pages' of memory, system-wide.  Returns the number of freed
  * pages.
  */
-unsigned long shrink_all_memory(unsigned long nr_pages)
+int shrink_all_memory(int nr_pages)
 {
 	pg_data_t *pgdat;
-	unsigned long nr_to_free = nr_pages;
-	unsigned long ret = 0;
-	unsigned retry = 2;
+	int nr_to_free = nr_pages;
+	int ret = 0;
 	struct reclaim_state reclaim_state = {
 		.reclaimed_slab = 0,
 	};
 
 	current->reclaim_state = &reclaim_state;
-repeat:
-	for_each_online_pgdat(pgdat) {
-		unsigned long freed;
-
+	for_each_pgdat(pgdat) {
+		int freed;
 		freed = balance_pgdat(pgdat, nr_to_free, 0);
 		ret += freed;
 		nr_to_free -= freed;
-		if ((long)nr_to_free <= 0)
+		if (nr_to_free <= 0)
 			break;
 	}
-	if (retry-- && ret < nr_pages) {
-		blk_congestion_wait(WRITE, HZ/5);
-		goto repeat;
-	}
 	current->reclaim_state = NULL;
 	return ret;
 }
@@ -1328,14 +1818,15 @@
    not required for correctness.  So if the last cpu in a node goes
    away, we get changed to run anywhere: as the first one comes back,
    restore their cpu bindings. */
-static int cpu_callback(struct notifier_block *nfb,
-			unsigned long action, void *hcpu)
+static int __devinit cpu_callback(struct notifier_block *nfb,
+				  unsigned long action,
+				  void *hcpu)
 {
 	pg_data_t *pgdat;
 	cpumask_t mask;
 
 	if (action == CPU_ONLINE) {
-		for_each_online_pgdat(pgdat) {
+		for_each_pgdat(pgdat) {
 			mask = node_to_cpumask(pgdat->node_id);
 			if (any_online_cpu(mask) != NR_CPUS)
 				/* One of our CPUs online: restore mask */
@@ -1349,17 +1840,10 @@ static int __init kswapd_init(void)
 {
 	pg_data_t *pgdat;
-
 	swap_setup();
-	for_each_online_pgdat(pgdat) {
-		pid_t pid;
-
-		pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
-		BUG_ON(pid < 0);
-		read_lock(&tasklist_lock);
-		pgdat->kswapd = find_task_by_real_pid(pid);
-		read_unlock(&tasklist_lock);
-	}
+	for_each_pgdat(pgdat)
+		pgdat->kswapd
+		= find_task_by_real_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL));
 	total_memory = nr_free_pagecache_pages();
 	hotcpu_notifier(cpu_callback, 0);
 	return 0;
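/*
 * Illustrative sketch, not part of the patch: how a suspend-time caller can
 * loop on shrink_all_memory() until a target is met, since one call may
 * free fewer pages than requested. The helper name shrink_for_suspend()
 * is hypothetical.
 */
static int shrink_for_suspend(int target)
{
	int freed, total = 0;

	do {
		freed = shrink_all_memory(target - total);
		total += freed;
	} while (freed > 0 && total < target);

	return total;	/* pages actually freed */
}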
@@ -1401,24 +1885,46 @@ int zone_reclaim_interval __read_mostly = 30*HZ;
 /*
  * Try to free up some pages from this zone through reclaim.
  */
-static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 {
-	/* Minimum pages needed in order to stay on node */
-	const unsigned long nr_pages = 1 << order;
+	int nr_pages;
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
-	int priority;
-	unsigned long nr_reclaimed = 0;
-	struct scan_control sc = {
-		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
-		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
-		.nr_mapped = read_page_state(nr_mapped),
-		.swap_cluster_max = max_t(unsigned long, nr_pages,
-					SWAP_CLUSTER_MAX),
-		.gfp_mask = gfp_mask,
-	};
+	struct scan_control sc;
+	cpumask_t mask;
+	int node_id;
+
+	if (time_before(jiffies,
+		zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
+			return 0;
+
+	if (!(gfp_mask & __GFP_WAIT) ||
+		zone->all_unreclaimable ||
+		atomic_read(&zone->reclaim_in_progress) > 0 ||
+		(p->flags & PF_MEMALLOC))
+			return 0;
+
+	node_id = zone->zone_pgdat->node_id;
+	mask = node_to_cpumask(node_id);
+	if (!cpus_empty(mask) && node_id != numa_node_id())
+		return 0;
+
+	sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE);
+	sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);
+	sc.nr_scanned = 0;
+	sc.nr_reclaimed = 0;
+	sc.priority = ZONE_RECLAIM_PRIORITY + 1;
+	sc.nr_mapped = read_page_state(nr_mapped);
+	sc.gfp_mask = gfp_mask;
 
 	disable_swap_token();
+
+	nr_pages = 1 << order;
+	if (nr_pages > SWAP_CLUSTER_MAX)
+		sc.swap_cluster_max = nr_pages;
+	else
+		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+
 	cond_resched();
 	/*
 	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -1433,20 +1939,17 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 * Free memory by calling shrink zone with increasing priorities
 	 * until we have enough memory freed.
 	 */
-	priority = ZONE_RECLAIM_PRIORITY;
 	do {
-		nr_reclaimed += shrink_zone(priority, zone, &sc);
-		priority--;
-	} while (priority >= 0 && nr_reclaimed < nr_pages);
+		sc.priority--;
+		shrink_zone(zone, &sc);
+
+	} while (sc.nr_reclaimed < nr_pages && sc.priority > 0);
 
-	if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
+	if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
 		/*
-		 * shrink_slab() does not currently allow us to determine how
-		 * many pages were freed in this zone. So we just shake the slab
-		 * a bit and then go off node for this particular allocation
-		 * despite possibly having freed enough memory to allocate in
-		 * this zone. If we freed local memory then the next
-		 * allocations will be local again.
+		 * shrink_slab does not currently allow us to determine
+		 * how many pages were freed in the zone. So we just
+		 * shake the slab and then go offnode for a single allocation.
 		 *
 		 * shrink_slab will free memory on all zones and may take
 		 * a long time.
@@ -1457,54 +1960,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	p->reclaim_state = NULL;
 	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
 
-	if (nr_reclaimed == 0) {
-		/*
-		 * We were unable to reclaim enough pages to stay on node. We
-		 * now allow off node accesses for a certain time period before
-		 * trying again to reclaim pages from the local zone.
-		 */
+	if (sc.nr_reclaimed == 0)
 		zone->last_unsuccessful_zone_reclaim = jiffies;
-	}
 
-	return nr_reclaimed >= nr_pages;
-}
-
-int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
-{
-	cpumask_t mask;
-	int node_id;
-
-	/*
-	 * Do not reclaim if there was a recent unsuccessful attempt at zone
-	 * reclaim.  In that case we let allocations go off node for the
-	 * zone_reclaim_interval.  Otherwise we would scan for each off-node
-	 * page allocation.
-	 */
-	if (time_before(jiffies,
-		zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
-		return 0;
-
-	/*
-	 * Avoid concurrent zone reclaims, do not reclaim in a zone that does
-	 * not have reclaimable pages and if we should not delay the allocation
-	 * then do not scan.
-	 */
-	if (!(gfp_mask & __GFP_WAIT) ||
-		zone->all_unreclaimable ||
-		atomic_read(&zone->reclaim_in_progress) > 0 ||
-		(current->flags & PF_MEMALLOC))
-			return 0;
-
-	/*
-	 * Only run zone reclaim on the local zone or on zones that do not
-	 * have associated processors. This will favor the local processor
-	 * over remote processors and spread off node memory allocations
-	 * as wide as possible.
-	 */
-	node_id = zone->zone_pgdat->node_id;
-	mask = node_to_cpumask(node_id);
-	if (!cpus_empty(mask) && node_id != numa_node_id())
-		return 0;
-	return __zone_reclaim(zone, gfp_mask, order);
+	return sc.nr_reclaimed >= nr_pages;
 }
 #endif
+
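/*
 * Illustrative sketch, not part of the patch: the allocator-side caller,
 * simplified from the zone loop in __alloc_pages() of this era. A zone
 * below its watermark gets one zone_reclaim() attempt before the allocator
 * falls back to the next (possibly off-node) zone. The helper name and the
 * "below_watermark" parameter are hypothetical stand-ins for the caller's
 * watermark check.
 */
static int zone_usable(struct zone *zone, gfp_t gfp_mask, unsigned int order,
			int below_watermark)
{
	if (!below_watermark)
		return 1;
	/* Below the watermark: try to reclaim within this zone first. */
	return zone_reclaim(zone, gfp_mask, order);
}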