X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=mm%2Fvmscan.c;h=084957969c85e3aff1635528580cd1b95e3de547;hb=9c920a8402f2bb9bd931801d429b65f4eb6a262b;hp=e01d5c98d431f638673e001d4b4c48aacb45a662;hpb=a91482bdcc2e0f6035702e46f1b99043a0893346;p=linux-2.6.git diff --git a/mm/vmscan.c b/mm/vmscan.c index e01d5c98d..084957969 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -37,63 +37,12 @@ #include #include -#include - -#ifndef AT_LIMIT_SUPPORT -#warning "ckrm_at_limit disabled due to problems with memory hog tests -- seting ckrm_shrink_list_empty to true" -#undef ckrm_shrink_list_empty -#define ckrm_shrink_list_empty() (1) -#endif - -/* possible outcome of pageout() */ -typedef enum { - /* failed to write page out, page is locked */ - PAGE_KEEP, - /* move page to the active list, page is locked */ - PAGE_ACTIVATE, - /* page has been sent to the disk successfully, page is unlocked */ - PAGE_SUCCESS, - /* page is clean and locked */ - PAGE_CLEAN, -} pageout_t; - -struct scan_control { - /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */ - unsigned long nr_to_scan; - - /* Incremented by the number of inactive pages that were scanned */ - unsigned long nr_scanned; - - /* Incremented by the number of pages reclaimed */ - unsigned long nr_reclaimed; - - unsigned long nr_mapped; /* From page_state */ - - /* How many pages shrink_cache() should reclaim */ - int nr_to_reclaim; - - /* Ask shrink_caches, or shrink_zone to scan at this priority */ - unsigned int priority; - - /* This context's GFP mask */ - unsigned int gfp_mask; - - /* Flag used by CKRM */ - unsigned int ckrm_flags; - - int may_writepage; -}; /* - * The list of shrinker callbacks used by to apply pressure to - * ageable caches. + * From 0 .. 100. Higher means more swappy. */ -struct shrinker { - shrinker_t shrinker; - struct list_head list; - int seeks; /* seeks to recreate an obj */ - long nr; /* objs pending delete */ -}; +int vm_swappiness = 60; +static long total_memory; @@ -122,7 +71,7 @@ void try_to_clip_inodes(void); if ((_page)->lru.prev != _base) { \ struct page *prev; \ \ - prev = lru_to_page(&(_page->lru)); \ + prev = lru_to_page(&(_page->lru)); \ prefetchw(&prev->_field); \ } \ } while (0) @@ -131,10 +80,15 @@ void try_to_clip_inodes(void); #endif /* - * From 0 .. 100. Higher means more swappy. + * The list of shrinker callbacks used by to apply pressure to + * ageable caches. */ -int vm_swappiness = 60; -static long total_memory; +struct shrinker { + shrinker_t shrinker; + struct list_head list; + int seeks; /* seeks to recreate an obj */ + long nr; /* objs pending delete */ +}; static LIST_HEAD(shrinker_list); static DECLARE_MUTEX(shrinker_sem); @@ -157,6 +111,7 @@ struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker) } return shrinker; } + EXPORT_SYMBOL(set_shrinker); /* @@ -169,6 +124,7 @@ void remove_shrinker(struct shrinker *shrinker) up(&shrinker_sem); kfree(shrinker); } + EXPORT_SYMBOL(remove_shrinker); #define SHRINK_BATCH 128 @@ -184,25 +140,22 @@ EXPORT_SYMBOL(remove_shrinker); * slab to avoid swapping. * * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. - * - * `lru_pages' represents the number of on-LRU pages in all the zones which - * are eligible for the caller's allocation attempt. It is used for balancing - * slab reclaim versus page reclaim. 
*/ -static int shrink_slab(unsigned long scanned, unsigned int gfp_mask, - unsigned long lru_pages) +static int shrink_slab(unsigned long scanned, unsigned int gfp_mask) { struct shrinker *shrinker; + long pages; if (down_trylock(&shrinker_sem)) return 0; + pages = nr_used_zone_pages(); list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; delta = (4 * scanned) / shrinker->seeks; delta *= (*shrinker->shrinker)(0, gfp_mask); - do_div(delta, lru_pages + 1); + do_div(delta, pages + 1); shrinker->nr += delta; if (shrinker->nr < 0) shrinker->nr = LONG_MAX; /* It wrapped! */ @@ -291,6 +244,18 @@ static void handle_write_error(struct address_space *mapping, unlock_page(page); } +/* possible outcome of pageout() */ +typedef enum { + /* failed to write page out, page is locked */ + PAGE_KEEP, + /* move page to the active list, page is locked */ + PAGE_ACTIVATE, + /* page has been sent to the disk successfully, page is unlocked */ + PAGE_SUCCESS, + /* page is clean and locked */ + PAGE_CLEAN, +} pageout_t; + /* * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). */ @@ -350,6 +315,27 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) return PAGE_CLEAN; } +struct scan_control { + /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */ + unsigned long nr_to_scan; + + /* Incremented by the number of inactive pages that were scanned */ + unsigned long nr_scanned; + + /* Incremented by the number of pages reclaimed */ + unsigned long nr_reclaimed; + + unsigned long nr_mapped; /* From page_state */ + + /* Ask shrink_caches, or shrink_zone to scan at this priority */ + unsigned int priority; + + /* This context's GFP mask */ + unsigned int gfp_mask; + + int may_writepage; +}; + /* * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed */ @@ -369,8 +355,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) int may_enter_fs; int referenced; - cond_resched(); - page = lru_to_page(page_list); list_del(&page->lru); @@ -559,23 +543,19 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) { LIST_HEAD(page_list); struct pagevec pvec; - int max_scan = sc->nr_to_scan, nr_pass; - unsigned int ckrm_flags = sc->ckrm_flags, bit_flag; + int max_scan = sc->nr_to_scan; pagevec_init(&pvec, 1); lru_add_drain(); spin_lock_irq(&zone->lru_lock); -redo: - ckrm_get_reclaim_bits(&ckrm_flags, &bit_flag); - nr_pass = zone->nr_inactive; while (max_scan > 0) { struct page *page; int nr_taken = 0; int nr_scan = 0; int nr_freed; - while (nr_pass-- && nr_scan++ < SWAP_CLUSTER_MAX && + while (nr_scan++ < SWAP_CLUSTER_MAX && !list_empty(&zone->inactive_list)) { page = lru_to_page(&zone->inactive_list); @@ -593,25 +573,15 @@ redo: SetPageLRU(page); list_add(&page->lru, &zone->inactive_list); continue; - } else if (bit_flag && !ckrm_kick_page(page, bit_flag)) { - __put_page(page); - SetPageLRU(page); -#ifdef CONFIG_CKRM_MEM_LRUORDER_CHANGE - list_add_tail(&page->lru, &zone->inactive_list); -#else - list_add(&page->lru, &zone->inactive_list); -#endif - continue; } list_add(&page->lru, &page_list); - ckrm_mem_dec_inactive(page); nr_taken++; } zone->nr_inactive -= nr_taken; zone->pages_scanned += nr_taken; spin_unlock_irq(&zone->lru_lock); - if ((bit_flag == 0) && (nr_taken == 0)) + if (nr_taken == 0) goto done; max_scan -= nr_scan; @@ -623,7 +593,6 @@ redo: if (current_is_kswapd()) mod_page_state(kswapd_steal, nr_freed); mod_page_state_zone(zone, pgsteal, nr_freed); - sc->nr_to_reclaim -= 
nr_freed; spin_lock_irq(&zone->lru_lock); /* @@ -644,9 +613,6 @@ redo: spin_lock_irq(&zone->lru_lock); } } - if (ckrm_flags && (nr_pass <= 0)) { - goto redo; - } } spin_unlock_irq(&zone->lru_lock); done: @@ -686,17 +652,11 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) long mapped_ratio; long distress; long swap_tendency; - unsigned int ckrm_flags = sc->ckrm_flags, bit_flag; - int nr_pass; lru_add_drain(); pgmoved = 0; spin_lock_irq(&zone->lru_lock); -redo: - ckrm_get_reclaim_bits(&ckrm_flags, &bit_flag); - nr_pass = zone->nr_active; - while (pgscanned < nr_pages && !list_empty(&zone->active_list) && - nr_pass) { + while (pgscanned < nr_pages && !list_empty(&zone->active_list)) { page = lru_to_page(&zone->active_list); prefetchw_prev_lru_page(page, &zone->active_list, flags); if (!TestClearPageLRU(page)) @@ -712,24 +672,11 @@ redo: __put_page(page); SetPageLRU(page); list_add(&page->lru, &zone->active_list); - pgscanned++; - } else if (bit_flag && !ckrm_kick_page(page, bit_flag)) { - __put_page(page); - SetPageLRU(page); -#ifdef CONFIG_CKRM_MEM_LRUORDER_CHANGE - list_add_tail(&page->lru, &zone->active_list); -#else - list_add(&page->lru, &zone->active_list); -#endif } else { list_add(&page->lru, &l_hold); - ckrm_mem_dec_active(page); pgmoved++; - pgscanned++; - } - if (!--nr_pass && ckrm_flags) { - goto redo; } + pgscanned++; } zone->nr_active -= pgmoved; spin_unlock_irq(&zone->lru_lock); @@ -766,7 +713,6 @@ redo: reclaim_mapped = 1; while (!list_empty(&l_hold)) { - cond_resched(); page = lru_to_page(&l_hold); list_del(&page->lru); if (page_mapped(page)) { @@ -804,7 +750,6 @@ redo: if (!TestClearPageActive(page)) BUG(); list_move(&page->lru, &zone->inactive_list); - ckrm_mem_inc_inactive(page); pgmoved++; if (!pagevec_add(&pvec, page)) { zone->nr_inactive += pgmoved; @@ -833,7 +778,6 @@ redo: BUG(); BUG_ON(!PageActive(page)); list_move(&page->lru, &zone->active_list); - ckrm_mem_inc_active(page); pgmoved++; if (!pagevec_add(&pvec, page)) { zone->nr_active += pgmoved; @@ -852,163 +796,57 @@ redo: } /* + * Scan `nr_pages' from this zone. Returns the number of reclaimed pages. * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ static void shrink_zone(struct zone *zone, struct scan_control *sc) { - unsigned long nr_active; - unsigned long nr_inactive; + unsigned long scan_active, scan_inactive; + int count; + + scan_inactive = (zone->nr_active + zone->nr_inactive) >> sc->priority; /* - * Add one to `nr_to_scan' just to make sure that the kernel will - * slowly sift through the active list. + * Try to keep the active list 2/3 of the size of the cache. And + * make sure that refill_inactive is given a decent number of pages. + * + * The "scan_active + 1" here is important. With pagecache-intensive + * workloads the inactive list is huge, and `ratio' evaluates to zero + * all the time. Which pins the active list memory. So we add one to + * `scan_active' just to make sure that the kernel will slowly sift + * through the active list. 
*/ - zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1; - nr_active = zone->nr_scan_active; - if (nr_active >= SWAP_CLUSTER_MAX) - zone->nr_scan_active = 0; - else - nr_active = 0; - - zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1; - nr_inactive = zone->nr_scan_inactive; - if (nr_inactive >= SWAP_CLUSTER_MAX) - zone->nr_scan_inactive = 0; - else - nr_inactive = 0; - - sc->nr_to_reclaim = SWAP_CLUSTER_MAX; - - while (nr_active || nr_inactive) { - sc->ckrm_flags = ckrm_setup_reclamation(); - if (nr_active) { - sc->nr_to_scan = min(nr_active, - (unsigned long)SWAP_CLUSTER_MAX); - nr_active -= sc->nr_to_scan; - refill_inactive_zone(zone, sc); - } + if (zone->nr_active >= 4*(zone->nr_inactive*2 + 1)) { + /* Don't scan more than 4 times the inactive list scan size */ + scan_active = 4*scan_inactive; + } else { + unsigned long long tmp; - if (nr_inactive) { - sc->nr_to_scan = min(nr_inactive, - (unsigned long)SWAP_CLUSTER_MAX); - nr_inactive -= sc->nr_to_scan; - shrink_cache(zone, sc); - if (sc->nr_to_reclaim <= 0) - break; - } - ckrm_teardown_reclamation(); - } -} - -#if defined(CONFIG_CKRM_RES_MEM) && defined(AT_LIMIT_SUPPORT) -// This function needs to be given more thought. -// Shrink the class to be at 90% of its limit -static void -ckrm_shrink_class(ckrm_mem_res_t *cls) -{ - struct scan_control sc; - struct zone *zone; - int zindex = 0, active_credit = 0, inactive_credit = 0; + /* Cast to long long so the multiply doesn't overflow */ - if (ckrm_test_set_shrink(cls)) { // set the SHRINK bit atomically - // if it is already set somebody is working on it. so... leave - return; + tmp = (unsigned long long)scan_inactive * zone->nr_active; + do_div(tmp, zone->nr_inactive*2 + 1); + scan_active = (unsigned long)tmp; } - sc.nr_mapped = read_page_state(nr_mapped); - sc.nr_scanned = 0; - sc.ckrm_flags = ckrm_get_reclaim_flags(cls); - sc.nr_reclaimed = 0; - sc.priority = 0; // always very high priority - - for_each_zone(zone) { - int zone_total, zone_limit, active_limit, inactive_limit; - int active_over, inactive_over; - unsigned long nr_active, nr_inactive; - u64 temp; - - zone->temp_priority = zone->prev_priority; - zone->prev_priority = sc.priority; - - zone_total = zone->nr_active + zone->nr_inactive + zone->free_pages; - - temp = (u64) cls->pg_limit * zone_total; - do_div(temp, ckrm_tot_lru_pages); - zone_limit = (int) temp; - active_limit = (6 * zone_limit) / 10; // 2/3rd in active list - inactive_limit = (3 * zone_limit) / 10; // 1/3rd in inactive list - - active_over = cls->nr_active[zindex] - active_limit + active_credit; - inactive_over = active_over + - (cls->nr_inactive[zindex] - inactive_limit) + inactive_credit; - - if (active_over > 0) { - zone->nr_scan_active += active_over + 1; - nr_active = zone->nr_scan_active; - active_credit = 0; - } else { - active_credit += active_over; - nr_active = 0; - } - if (inactive_over > 0) { - zone->nr_scan_inactive += inactive_over; - nr_inactive = zone->nr_scan_inactive; - inactive_credit = 0; - } else { - inactive_credit += inactive_over; - nr_inactive = 0; - } - while (nr_active || nr_inactive) { - if (nr_active) { - sc.nr_to_scan = min(nr_active, - (unsigned long)SWAP_CLUSTER_MAX); - nr_active -= sc.nr_to_scan; - refill_inactive_zone(zone, &sc); - } - - if (nr_inactive) { - sc.nr_to_scan = min(nr_inactive, - (unsigned long)SWAP_CLUSTER_MAX); - nr_inactive -= sc.nr_to_scan; - shrink_cache(zone, &sc); - if (sc.nr_to_reclaim <= 0) - break; - } - } - zone->prev_priority = zone->temp_priority; - zindex++; + 
atomic_add(scan_active + 1, &zone->nr_scan_active); + count = atomic_read(&zone->nr_scan_active); + if (count >= SWAP_CLUSTER_MAX) { + atomic_set(&zone->nr_scan_active, 0); + sc->nr_to_scan = count; + refill_inactive_zone(zone, sc); } - ckrm_clear_shrink(cls); -} -static void -ckrm_shrink_classes(void) -{ - ckrm_mem_res_t *cls; - - spin_lock(&ckrm_mem_lock); - while (!ckrm_shrink_list_empty()) { - cls = list_entry(ckrm_shrink_list.next, ckrm_mem_res_t, - shrink_list); - spin_unlock(&ckrm_mem_lock); - ckrm_shrink_class(cls); - spin_lock(&ckrm_mem_lock); - list_del(&cls->shrink_list); - cls->flags &= ~MEM_AT_LIMIT; + atomic_add(scan_inactive, &zone->nr_scan_inactive); + count = atomic_read(&zone->nr_scan_inactive); + if (count >= SWAP_CLUSTER_MAX) { + atomic_set(&zone->nr_scan_inactive, 0); + sc->nr_to_scan = count; + shrink_cache(zone, sc); } - spin_unlock(&ckrm_mem_lock); } -#else - -#if defined(CONFIG_CKRM_RES_MEM) && !defined(AT_LIMIT_SUPPORT) -#warning "disabling ckrm_at_limit -- setting ckrm_shrink_classes to noop " -#endif - -#define ckrm_shrink_classes() do { } while(0) -#endif - /* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation @@ -1065,7 +903,6 @@ int try_to_free_pages(struct zone **zones, int total_scanned = 0, total_reclaimed = 0; struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc; - unsigned long lru_pages = 0; int i; sc.gfp_mask = gfp_mask; @@ -1073,12 +910,8 @@ int try_to_free_pages(struct zone **zones, inc_page_state(allocstall); - for (i = 0; zones[i] != NULL; i++) { - struct zone *zone = zones[i]; - - zone->temp_priority = DEF_PRIORITY; - lru_pages += zone->nr_active + zone->nr_inactive; - } + for (i = 0; zones[i] != 0; i++) + zones[i]->temp_priority = DEF_PRIORITY; for (priority = DEF_PRIORITY; priority >= 0; priority--) { sc.nr_mapped = read_page_state(nr_mapped); @@ -1086,7 +919,7 @@ int try_to_free_pages(struct zone **zones, sc.nr_reclaimed = 0; sc.priority = priority; shrink_caches(zones, &sc); - shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); + shrink_slab(sc.nr_scanned, gfp_mask); if (reclaim_state) { sc.nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; @@ -1115,7 +948,7 @@ int try_to_free_pages(struct zone **zones, blk_congestion_wait(WRITE, HZ/10); } if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) - out_of_memory(gfp_mask); + out_of_memory(); out: for (i = 0; zones[i] != 0; i++) zones[i]->prev_priority = zones[i]->temp_priority; @@ -1171,7 +1004,7 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages) for (priority = DEF_PRIORITY; priority >= 0; priority--) { int all_zones_ok = 1; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ - unsigned long lru_pages = 0; + if (nr_pages == 0) { /* @@ -1195,12 +1028,6 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages) end_zone = pgdat->nr_zones - 1; } scan: - for (i = 0; i <= end_zone; i++) { - struct zone *zone = pgdat->node_zones + i; - - lru_pages += zone->nr_active + zone->nr_inactive; - } - /* * Now scan the zone in the dma->highmem direction, stopping * at the last zone which needs scanning. 
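Illustrative sketch (not part of the patch): both reclaim paths above, try_to_free_pages() and balance_pgdat(), now call the two-argument shrink_slab(scanned, gfp_mask), which normalises slab pressure against nr_used_zone_pages() rather than a caller-supplied lru_pages total. The standalone helper below mirrors the delta arithmetic from the shrink_slab() hunk earlier in this patch; slab_scan_delta() is a hypothetical name used only for illustration.

	static unsigned long slab_scan_delta(unsigned long scanned, int seeks,
					     unsigned long nr_objects,
					     unsigned long used_pages)
	{
		unsigned long long delta;

		delta = (4ULL * scanned) / seeks;  /* LRU pages scanned, scaled by object recreation cost */
		delta *= nr_objects;               /* what (*shrinker->shrinker)(0, gfp_mask) reports */
		delta /= used_pages + 1;           /* normalise by nr_used_zone_pages(); +1 avoids divide-by-zero */
		return (unsigned long)delta;
	}
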
@@ -1228,7 +1055,7 @@ scan: sc.priority = priority; shrink_zone(zone, &sc); reclaim_state->reclaimed_slab = 0; - shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); + shrink_slab(sc.nr_scanned, GFP_KERNEL); sc.nr_reclaimed += reclaim_state->reclaimed_slab; total_reclaimed += sc.nr_reclaimed; if (zone->all_unreclaimable) @@ -1277,7 +1104,7 @@ out: * If there are applications that are active memory-allocators * (most normal use), this basically shouldn't matter. */ -static int kswapd(void *p) +int kswapd(void *p) { pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; @@ -1315,12 +1142,8 @@ static int kswapd(void *p) finish_wait(&pgdat->kswapd_wait, &wait); try_to_clip_inodes(); - if (!ckrm_shrink_list_empty()) - ckrm_shrink_classes(); - else balance_pgdat(pgdat, 0); } - return 0; } /* @@ -1328,7 +1151,7 @@ static int kswapd(void *p) */ void wakeup_kswapd(struct zone *zone) { - if ((zone->free_pages > zone->pages_low) && ckrm_shrink_list_empty()) + if (zone->free_pages > zone->pages_low) return; if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait)) return;
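Illustrative sketch (not part of the patch): the hunks near the top of this diff export set_shrinker() and remove_shrinker(), the registration interface for the struct shrinker list that shrink_slab() walks. The module-style client below shows how a private object cache of this era would hook into that API; the callback shape follows the way shrink_slab() invokes it ((*shrinker->shrinker)(0, gfp_mask) to query the object count), while my_cache_count(), my_cache_prune() and the use of DEFAULT_SEEKS are assumptions for illustration only.

	/* Hypothetical object-cache shrinker; not taken from this patch. */
	static int my_cache_shrink(int nr_to_scan, unsigned int gfp_mask)
	{
		if (nr_to_scan)
			my_cache_prune(nr_to_scan);	/* hypothetical: try to free nr_to_scan objects */
		return my_cache_count();		/* hypothetical: freeable objects remaining */
	}

	static struct shrinker *my_shrinker;

	static int __init my_cache_init(void)
	{
		my_shrinker = set_shrinker(DEFAULT_SEEKS, my_cache_shrink);
		return my_shrinker ? 0 : -ENOMEM;
	}

	static void __exit my_cache_exit(void)
	{
		remove_shrinker(my_shrinker);
	}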