X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=mm%2Fvmscan.c;h=e7ed79823ea7e5bdfa28b8bdac23849e3f3fc3eb;hb=6a77f38946aaee1cd85eeec6cf4229b204c15071;hp=e5f0b091936f4fe37b10098b46f3b3def081b976;hpb=9213980e6a70d8473e0ffd4b39ab5b6caaba9ff5;p=linux-2.6.git diff --git a/mm/vmscan.c b/mm/vmscan.c index e5f0b0919..e7ed79823 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include /* for try_to_release_page(), buffer_heads_over_limit */ @@ -32,18 +31,59 @@ #include #include #include +#include -#include #include #include #include +/* possible outcome of pageout() */ +typedef enum { + /* failed to write page out, page is locked */ + PAGE_KEEP, + /* move page to the active list, page is locked */ + PAGE_ACTIVATE, + /* page has been sent to the disk successfully, page is unlocked */ + PAGE_SUCCESS, + /* page is clean and locked */ + PAGE_CLEAN, +} pageout_t; + +struct scan_control { + /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */ + unsigned long nr_to_scan; + + /* Incremented by the number of inactive pages that were scanned */ + unsigned long nr_scanned; + + /* Incremented by the number of pages reclaimed */ + unsigned long nr_reclaimed; + + unsigned long nr_mapped; /* From page_state */ + + /* How many pages shrink_cache() should reclaim */ + int nr_to_reclaim; + + /* Ask shrink_caches, or shrink_zone to scan at this priority */ + unsigned int priority; + + /* This context's GFP mask */ + unsigned int gfp_mask; + + int may_writepage; +}; + /* - * From 0 .. 100. Higher means more swappy. + * The list of shrinker callbacks used by to apply pressure to + * ageable caches. */ -int vm_swappiness = 60; -static long total_memory; +struct shrinker { + shrinker_t shrinker; + struct list_head list; + int seeks; /* seeks to recreate an obj */ + long nr; /* objs pending delete */ +}; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -67,7 +107,7 @@ static long total_memory; if ((_page)->lru.prev != _base) { \ struct page *prev; \ \ - prev = lru_to_page(&(_page->lru)); \ + prev = lru_to_page(&(_page->lru)); \ prefetchw(&prev->_field); \ } \ } while (0) @@ -76,18 +116,13 @@ static long total_memory; #endif /* - * The list of shrinker callbacks used by to apply pressure to - * ageable caches. + * From 0 .. 100. Higher means more swappy. 
*/ -struct shrinker { - shrinker_t shrinker; - struct list_head list; - int seeks; /* seeks to recreate an obj */ - long nr; /* objs pending delete */ -}; +int vm_swappiness = 60; +static long total_memory; static LIST_HEAD(shrinker_list); -static DECLARE_MUTEX(shrinker_sem); +static DECLARE_RWSEM(shrinker_rwsem); /* * Add a shrinker callback to be called from the vm @@ -101,13 +136,12 @@ struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker) shrinker->shrinker = theshrinker; shrinker->seeks = seeks; shrinker->nr = 0; - down(&shrinker_sem); + down_write(&shrinker_rwsem); list_add(&shrinker->list, &shrinker_list); - up(&shrinker_sem); + up_write(&shrinker_rwsem); } return shrinker; } - EXPORT_SYMBOL(set_shrinker); /* @@ -115,14 +149,13 @@ EXPORT_SYMBOL(set_shrinker); */ void remove_shrinker(struct shrinker *shrinker) { - down(&shrinker_sem); + down_write(&shrinker_rwsem); list_del(&shrinker->list); - up(&shrinker_sem); + up_write(&shrinker_rwsem); kfree(shrinker); } - EXPORT_SYMBOL(remove_shrinker); - + #define SHRINK_BATCH 128 /* * Call the shrink functions to age shrinkable caches @@ -136,47 +169,56 @@ EXPORT_SYMBOL(remove_shrinker); * slab to avoid swapping. * * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. + * + * `lru_pages' represents the number of on-LRU pages in all the zones which + * are eligible for the caller's allocation attempt. It is used for balancing + * slab reclaim versus page reclaim. */ -static int shrink_slab(unsigned long scanned, unsigned int gfp_mask) +static int shrink_slab(unsigned long scanned, unsigned int gfp_mask, + unsigned long lru_pages) { struct shrinker *shrinker; - long pages; - if (down_trylock(&shrinker_sem)) + if (scanned == 0) + scanned = SWAP_CLUSTER_MAX; + + if (!down_read_trylock(&shrinker_rwsem)) return 0; - pages = nr_used_zone_pages(); list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; + unsigned long total_scan; delta = (4 * scanned) / shrinker->seeks; delta *= (*shrinker->shrinker)(0, gfp_mask); - do_div(delta, pages + 1); + do_div(delta, lru_pages + 1); shrinker->nr += delta; if (shrinker->nr < 0) shrinker->nr = LONG_MAX; /* It wrapped! */ - if (shrinker->nr <= SHRINK_BATCH) - continue; - while (shrinker->nr) { - long this_scan = shrinker->nr; + total_scan = shrinker->nr; + shrinker->nr = 0; + + while (total_scan >= SHRINK_BATCH) { + long this_scan = SHRINK_BATCH; int shrink_ret; - if (this_scan > 128) - this_scan = 128; shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask); - mod_page_state(slabs_scanned, this_scan); - shrinker->nr -= this_scan; if (shrink_ret == -1) break; + mod_page_state(slabs_scanned, this_scan); + total_scan -= this_scan; + cond_resched(); } + + shrinker->nr += total_scan; } - up(&shrinker_sem); + up_read(&shrinker_rwsem); return 0; } -/* Must be called with page's rmap lock held. */ +/* Called without lock on whether page is mapped, so answer is unstable */ static inline int page_mapping_inuse(struct page *page) { struct address_space *mapping; @@ -240,18 +282,6 @@ static void handle_write_error(struct address_space *mapping, unlock_page(page); } -/* possible outcome of pageout() */ -typedef enum { - /* failed to write page out, page is locked */ - PAGE_KEEP, - /* move page to the active list, page is locked */ - PAGE_ACTIVATE, - /* page has been sent to the disk successfully, page is unlocked */ - PAGE_SUCCESS, - /* page is clean and locked */ - PAGE_CLEAN, -} pageout_t; - /* * pageout is called by shrink_list() for each dirty page. 
Calls ->writepage(). */ @@ -311,27 +341,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) return PAGE_CLEAN; } -struct scan_control { - /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */ - unsigned long nr_to_scan; - - /* Incremented by the number of inactive pages that were scanned */ - unsigned long nr_scanned; - - /* Incremented by the number of pages reclaimed */ - unsigned long nr_reclaimed; - - unsigned long nr_mapped; /* From page_state */ - - /* Ask shrink_caches, or shrink_zone to scan at this priority */ - unsigned int priority; - - /* This context's GFP mask */ - unsigned int gfp_mask; - - int may_writepage; -}; - /* * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed */ @@ -351,6 +360,8 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) int may_enter_fs; int referenced; + cond_resched(); + page = lru_to_page(page_list); list_del(&page->lru); @@ -359,34 +370,27 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) BUG_ON(PageActive(page)); - if (PageWriteback(page)) - goto keep_locked; - sc->nr_scanned++; /* Double the slab pressure for mapped and swapcache pages */ if (page_mapped(page) || PageSwapCache(page)) sc->nr_scanned++; - page_map_lock(page); - referenced = page_referenced(page); - if (referenced && page_mapping_inuse(page)) { - /* In active use or really unfreeable. Activate it. */ - page_map_unlock(page); + if (PageWriteback(page)) + goto keep_locked; + + referenced = page_referenced(page, 1, sc->priority <= 0); + /* In active use or really unfreeable? Activate it. */ + if (referenced && page_mapping_inuse(page)) goto activate_locked; - } #ifdef CONFIG_SWAP /* * Anonymous process memory has backing store? * Try to allocate it some swap space here. - * - * XXX: implement swap clustering ? 
*/ if (PageAnon(page) && !PageSwapCache(page)) { - page_map_unlock(page); if (!add_to_swap(page)) goto activate_locked; - page_map_lock(page); } #endif /* CONFIG_SWAP */ @@ -401,16 +405,13 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) if (page_mapped(page) && mapping) { switch (try_to_unmap(page)) { case SWAP_FAIL: - page_map_unlock(page); goto activate_locked; case SWAP_AGAIN: - page_map_unlock(page); goto keep_locked; case SWAP_SUCCESS: ; /* try to free the page below */ } } - page_map_unlock(page); if (PageDirty(page)) { if (referenced) @@ -574,7 +575,7 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) nr_taken++; } zone->nr_inactive -= nr_taken; - zone->pages_scanned += nr_taken; + zone->pages_scanned += nr_scan; spin_unlock_irq(&zone->lru_lock); if (nr_taken == 0) @@ -589,6 +590,7 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) if (current_is_kswapd()) mod_page_state(kswapd_steal, nr_freed); mod_page_state_zone(zone, pgsteal, nr_freed); + sc->nr_to_reclaim -= nr_freed; spin_lock_irq(&zone->lru_lock); /* @@ -674,6 +676,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) } pgscanned++; } + zone->pages_scanned += pgscanned; zone->nr_active -= pgmoved; spin_unlock_irq(&zone->lru_lock); @@ -709,28 +712,16 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) reclaim_mapped = 1; while (!list_empty(&l_hold)) { + cond_resched(); page = lru_to_page(&l_hold); list_del(&page->lru); if (page_mapped(page)) { - if (!reclaim_mapped) { - list_add(&page->lru, &l_active); - continue; - } - page_map_lock(page); - if (page_referenced(page)) { - page_map_unlock(page); + if (!reclaim_mapped || + (total_swap_pages == 0 && PageAnon(page)) || + page_referenced(page, 0, sc->priority <= 0)) { list_add(&page->lru, &l_active); continue; } - page_map_unlock(page); - } - /* - * FIXME: need to consider page_count(page) here if/when we - * reap orphaned pages via the LRU (Daniel's locking stuff) - */ - if (total_swap_pages == 0 && PageAnon(page)) { - list_add(&page->lru, &l_active); - continue; } list_add(&page->lru, &l_inactive); } @@ -792,54 +783,50 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) } /* - * Scan `nr_pages' from this zone. Returns the number of reclaimed pages. * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ static void shrink_zone(struct zone *zone, struct scan_control *sc) { - unsigned long scan_active, scan_inactive; - int count; - - scan_inactive = (zone->nr_active + zone->nr_inactive) >> sc->priority; + unsigned long nr_active; + unsigned long nr_inactive; /* - * Try to keep the active list 2/3 of the size of the cache. And - * make sure that refill_inactive is given a decent number of pages. - * - * The "scan_active + 1" here is important. With pagecache-intensive - * workloads the inactive list is huge, and `ratio' evaluates to zero - * all the time. Which pins the active list memory. So we add one to - * `scan_active' just to make sure that the kernel will slowly sift - * through the active list. + * Add one to `nr_to_scan' just to make sure that the kernel will + * slowly sift through the active list. 
*/ - if (zone->nr_active >= 4*(zone->nr_inactive*2 + 1)) { - /* Don't scan more than 4 times the inactive list scan size */ - scan_active = 4*scan_inactive; - } else { - unsigned long long tmp; - - /* Cast to long long so the multiply doesn't overflow */ - - tmp = (unsigned long long)scan_inactive * zone->nr_active; - do_div(tmp, zone->nr_inactive*2 + 1); - scan_active = (unsigned long)tmp; - } - - atomic_add(scan_active + 1, &zone->nr_scan_active); - count = atomic_read(&zone->nr_scan_active); - if (count >= SWAP_CLUSTER_MAX) { - atomic_set(&zone->nr_scan_active, 0); - sc->nr_to_scan = count; - refill_inactive_zone(zone, sc); - } + zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1; + nr_active = zone->nr_scan_active; + if (nr_active >= SWAP_CLUSTER_MAX) + zone->nr_scan_active = 0; + else + nr_active = 0; + + zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1; + nr_inactive = zone->nr_scan_inactive; + if (nr_inactive >= SWAP_CLUSTER_MAX) + zone->nr_scan_inactive = 0; + else + nr_inactive = 0; + + sc->nr_to_reclaim = SWAP_CLUSTER_MAX; + + while (nr_active || nr_inactive) { + if (nr_active) { + sc->nr_to_scan = min(nr_active, + (unsigned long)SWAP_CLUSTER_MAX); + nr_active -= sc->nr_to_scan; + refill_inactive_zone(zone, sc); + } - atomic_add(scan_inactive, &zone->nr_scan_inactive); - count = atomic_read(&zone->nr_scan_inactive); - if (count >= SWAP_CLUSTER_MAX) { - atomic_set(&zone->nr_scan_inactive, 0); - sc->nr_to_scan = count; - shrink_cache(zone, sc); + if (nr_inactive) { + sc->nr_to_scan = min(nr_inactive, + (unsigned long)SWAP_CLUSTER_MAX); + nr_inactive -= sc->nr_to_scan; + shrink_cache(zone, sc); + if (sc->nr_to_reclaim <= 0) + break; + } } } @@ -867,6 +854,9 @@ shrink_caches(struct zone **zones, struct scan_control *sc) for (i = 0; zones[i] != NULL; i++) { struct zone *zone = zones[i]; + if (zone->present_pages == 0) + continue; + zone->temp_priority = sc->priority; if (zone->prev_priority > sc->priority) zone->prev_priority = sc->priority; @@ -899,6 +889,7 @@ int try_to_free_pages(struct zone **zones, int total_scanned = 0, total_reclaimed = 0; struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc; + unsigned long lru_pages = 0; int i; sc.gfp_mask = gfp_mask; @@ -906,8 +897,12 @@ int try_to_free_pages(struct zone **zones, inc_page_state(allocstall); - for (i = 0; zones[i] != 0; i++) - zones[i]->temp_priority = DEF_PRIORITY; + for (i = 0; zones[i] != NULL; i++) { + struct zone *zone = zones[i]; + + zone->temp_priority = DEF_PRIORITY; + lru_pages += zone->nr_active + zone->nr_inactive; + } for (priority = DEF_PRIORITY; priority >= 0; priority--) { sc.nr_mapped = read_page_state(nr_mapped); @@ -915,7 +910,7 @@ int try_to_free_pages(struct zone **zones, sc.nr_reclaimed = 0; sc.priority = priority; shrink_caches(zones, &sc); - shrink_slab(sc.nr_scanned, gfp_mask); + shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); if (reclaim_state) { sc.nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; @@ -943,8 +938,6 @@ int try_to_free_pages(struct zone **zones, if (sc.nr_scanned && priority < DEF_PRIORITY - 2) blk_congestion_wait(WRITE, HZ/10); } - if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) - out_of_memory(); out: for (i = 0; zones[i] != 0; i++) zones[i]->prev_priority = zones[i]->temp_priority; @@ -976,15 +969,19 @@ out: * the page allocator fallback scheme to ensure that aging of pages is balanced * across the zones. 
*/ -static int balance_pgdat(pg_data_t *pgdat, int nr_pages) +static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order) { int to_free = nr_pages; + int all_zones_ok; int priority; int i; - int total_scanned = 0, total_reclaimed = 0; + int total_scanned, total_reclaimed; struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc; +loop_again: + total_scanned = 0; + total_reclaimed = 0; sc.gfp_mask = GFP_KERNEL; sc.may_writepage = 0; sc.nr_mapped = read_page_state(nr_mapped); @@ -998,9 +995,10 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages) } for (priority = DEF_PRIORITY; priority >= 0; priority--) { - int all_zones_ok = 1; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ + unsigned long lru_pages = 0; + all_zones_ok = 1; if (nr_pages == 0) { /* @@ -1010,11 +1008,15 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages) for (i = pgdat->nr_zones - 1; i >= 0; i--) { struct zone *zone = pgdat->node_zones + i; + if (zone->present_pages == 0) + continue; + if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; - if (zone->free_pages <= zone->pages_high) { + if (!zone_watermark_ok(zone, order, + zone->pages_high, 0, 0, 0)) { end_zone = i; goto scan; } @@ -1024,6 +1026,12 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages) end_zone = pgdat->nr_zones - 1; } scan: + for (i = 0; i <= end_zone; i++) { + struct zone *zone = pgdat->node_zones + i; + + lru_pages += zone->nr_active + zone->nr_inactive; + } + /* * Now scan the zone in the dma->highmem direction, stopping * at the last zone which needs scanning. @@ -1036,11 +1044,15 @@ scan: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; + if (zone->present_pages == 0) + continue; + if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; if (nr_pages == 0) { /* Not software suspend */ - if (zone->free_pages <= zone->pages_high) + if (!zone_watermark_ok(zone, order, + zone->pages_high, end_zone, 0, 0)) all_zones_ok = 0; } zone->temp_priority = priority; @@ -1051,12 +1063,14 @@ scan: sc.priority = priority; shrink_zone(zone, &sc); reclaim_state->reclaimed_slab = 0; - shrink_slab(sc.nr_scanned, GFP_KERNEL); + shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); sc.nr_reclaimed += reclaim_state->reclaimed_slab; total_reclaimed += sc.nr_reclaimed; + total_scanned += sc.nr_scanned; if (zone->all_unreclaimable) continue; - if (zone->pages_scanned > zone->present_pages * 2) + if (zone->pages_scanned >= (zone->nr_active + + zone->nr_inactive) * 4) zone->all_unreclaimable = 1; /* * If we've done a decent amount of scanning and @@ -1077,6 +1091,15 @@ scan: */ if (total_scanned && priority < DEF_PRIORITY - 2) blk_congestion_wait(WRITE, HZ/10); + + /* + * We do this so kswapd doesn't build up large priorities for + * example when it is freeing in parallel with allocators. It + * matches the direct reclaim path behaviour in terms of impact + * on zone->*_priority. + */ + if (total_reclaimed >= SWAP_CLUSTER_MAX) + break; } out: for (i = 0; i < pgdat->nr_zones; i++) { @@ -1084,6 +1107,11 @@ out: zone->prev_priority = zone->temp_priority; } + if (!all_zones_ok) { + cond_resched(); + goto loop_again; + } + return total_reclaimed; } @@ -1100,8 +1128,9 @@ out: * If there are applications that are active memory-allocators * (most normal use), this basically shouldn't matter. 
*/ -int kswapd(void *p) +static int kswapd(void *p) { + unsigned long order; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; DEFINE_WAIT(wait); @@ -1130,24 +1159,47 @@ int kswapd(void *p) */ tsk->flags |= PF_MEMALLOC|PF_KSWAPD; + order = 0; for ( ; ; ) { + unsigned long new_order; if (current->flags & PF_FREEZE) refrigerator(PF_FREEZE); + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); - schedule(); + new_order = pgdat->kswapd_max_order; + pgdat->kswapd_max_order = 0; + if (order < new_order) { + /* + * Don't sleep if someone wants a larger 'order' + * allocation + */ + order = new_order; + } else { + schedule(); + order = pgdat->kswapd_max_order; + } finish_wait(&pgdat->kswapd_wait, &wait); - balance_pgdat(pgdat, 0); + balance_pgdat(pgdat, 0, order); } + return 0; } /* * A zone is low on free memory, so wake its kswapd task to service it. */ -void wakeup_kswapd(struct zone *zone) +void wakeup_kswapd(struct zone *zone, int order) { - if (zone->free_pages > zone->pages_low) + pg_data_t *pgdat; + + if (zone->present_pages == 0) + return; + + pgdat = zone->zone_pgdat; + if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0, 0)) return; + if (pgdat->kswapd_max_order < order) + pgdat->kswapd_max_order = order; if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait)) return; wake_up_interruptible(&zone->zone_pgdat->kswapd_wait); @@ -1170,7 +1222,7 @@ int shrink_all_memory(int nr_pages) current->reclaim_state = &reclaim_state; for_each_pgdat(pgdat) { int freed; - freed = balance_pgdat(pgdat, nr_to_free); + freed = balance_pgdat(pgdat, nr_to_free, 0); ret += freed; nr_to_free -= freed; if (nr_to_free <= 0) @@ -1211,7 +1263,7 @@ static int __init kswapd_init(void) swap_setup(); for_each_pgdat(pgdat) pgdat->kswapd - = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); + = find_task_by_real_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); total_memory = nr_free_pagecache_pages(); hotcpu_notifier(cpu_callback, 0); return 0;
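
A few sketches appended after the diff illustrate the behaviour these hunks
introduce. None of the code below is from the patch; any helper, number, or
identifier not visible in the hunks above is an illustrative assumption.

The shrinker registration pair (set_shrinker()/remove_shrinker()) keeps its
external contract while the internal lock becomes shrinker_rwsem, letting
shrink_slab() walk the list read-locked and run concurrently with other
readers. A minimal sketch of a client, assuming hypothetical foo_count()
and foo_evict() helpers for some reclaimable cache:

    static int foo_shrink(int nr_to_scan, unsigned int gfp_mask)
    {
            if (nr_to_scan == 0)
                    return foo_count();     /* only report cache size */
            if (!(gfp_mask & __GFP_FS))
                    return -1;              /* cannot scan in this context */
            foo_evict(nr_to_scan);          /* drop up to nr_to_scan objects */
            return foo_count();             /* objects remaining */
    }

    static struct shrinker *foo_shrinker;

    void foo_init(void)
    {
            foo_shrinker = set_shrinker(DEFAULT_SEEKS, foo_shrink);
    }

    void foo_exit(void)
    {
            remove_shrinker(foo_shrinker);
    }

The callback contract this relies on: called with nr_to_scan == 0 it reports
the object count, called with work to do it scans and returns how many
objects remain, and -1 aborts the walk for this gfp_mask (matching the
shrink_ret == -1 break in shrink_slab() above).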
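
The pressure formula in shrink_slab() now normalises against `lru_pages',
the caller's count of eligible LRU pages, instead of a global figure. A
runnable userspace model, with do_div() replaced by plain 64-bit division
and all inputs invented:

    #include <stdio.h>

    static unsigned long long slab_quota(unsigned long scanned, int seeks,
                                         unsigned long long cache_objs,
                                         unsigned long lru_pages)
    {
            unsigned long long delta = (4ULL * scanned) / seeks;

            delta *= cache_objs;    /* scale by reported cache size */
            delta /= lru_pages + 1; /* normalise against LRU size */
            return delta;
    }

    int main(void)
    {
            /* 1024 LRU pages scanned, seeks=2, a 10000-object cache,
             * 100000 LRU pages: quota = 2048 * 10000 / 100001 = 204. */
            printf("%llu objects\n", slab_quota(1024, 2, 10000, 100000));
            return 0;
    }

Of those 204 objects, one SHRINK_BATCH chunk of 128 is scanned now and the
remaining 76 carry over in shrinker->nr for the next call, which is exactly
what the rewritten total_scan loop above arranges.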
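
Throughout the file, `sc->priority' is the knob that scales scanning: each
failing pass of try_to_free_pages() or balance_pgdat() decrements it from
DEF_PRIORITY toward 0, and shrink_zone() adds (list_size >> priority) + 1
to the zone's deferred scan count each pass (the +1 being, per the comment
above, what keeps a huge active list from pinning itself). A runnable
illustration with an invented LRU size:

    #include <stdio.h>

    #define DEF_PRIORITY 12 /* value used by this kernel generation */

    int main(void)
    {
            unsigned long lru = 1000000;    /* pages on one list; made up */
            int priority;

            for (priority = DEF_PRIORITY; priority >= 0; priority--)
                    printf("priority %2d: queue %7lu pages\n",
                           priority, (lru >> priority) + 1);
            return 0;
    }

The series runs 245, 489, 977, ... up to the full list at priority 0, so
scanning effort roughly doubles each time reclaim fails to make progress.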
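
The rewritten shrink_zone() no longer computes an active/inactive ratio up
front; it drains both per-zone scan counts in interleaved SWAP_CLUSTER_MAX
chunks and leaves the loop once sc->nr_to_reclaim pages have been freed. A
model of just the loop shape, which pretends every scanned inactive page is
freed (real reclaim guarantees no such thing):

    #include <stdio.h>

    #define SWAP_CLUSTER_MAX 32     /* batch size in this kernel generation */

    static unsigned long min_ul(unsigned long a, unsigned long b)
    {
            return a < b ? a : b;
    }

    static void shrink_zone_model(unsigned long nr_active,
                                  unsigned long nr_inactive)
    {
            long nr_to_reclaim = SWAP_CLUSTER_MAX;

            while (nr_active || nr_inactive) {
                    if (nr_active) {
                            unsigned long n = min_ul(nr_active,
                                                     SWAP_CLUSTER_MAX);
                            nr_active -= n;
                            printf("refill_inactive_zone: %lu\n", n);
                    }
                    if (nr_inactive) {
                            unsigned long n = min_ul(nr_inactive,
                                                     SWAP_CLUSTER_MAX);
                            nr_inactive -= n;
                            printf("shrink_cache: %lu\n", n);
                            nr_to_reclaim -= n;     /* pretend all freed */
                            if (nr_to_reclaim <= 0)
                                    break;
                    }
            }
    }

    int main(void)
    {
            shrink_zone_model(96, 160);     /* invented queue lengths */
            return 0;
    }

With those inputs one interleaved round runs and the loop exits early even
though both lists still hold pages; that early exit is the point, since it
bounds each direct-reclaim call at roughly one cluster of reclaimed pages.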
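
balance_pgdat() now asks zone_watermark_ok() whether an order-`order'
allocation could succeed at the pages_high mark, rather than comparing raw
free page counts, which is why kswapd grows an `order' argument at all. A
simplified reconstruction of that test from the same kernel generation's
page allocator (the lowmem_reserve and gfp_high/can_try_harder discounts
are omitted; treat this as a sketch, not the exact function):

    /* nr_free[o] counts free blocks of size 2^o in the zone */
    static int watermark_ok_sketch(long free_pages,
                                   unsigned long nr_free[],
                                   int order, long min)
    {
            int o;

            free_pages -= (1 << order) - 1; /* the block itself must fit */
            if (free_pages <= min)
                    return 0;

            for (o = 0; o < order; o++) {
                    /* blocks of this order cannot satisfy the request */
                    free_pages -= nr_free[o] << o;
                    /* demand progressively less slack at higher orders */
                    min >>= 1;
                    if (free_pages <= min)
                            return 0;
            }
            return 1;
    }

The effect is that kswapd keeps reclaiming until suitably large contiguous
blocks exist, not merely until the total free count looks healthy.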
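
The all_unreclaimable trigger changes base: pages_scanned can only grow by
scanning LRU pages, so measuring it against present_pages let a zone whose
memory is mostly pinned (kernel text, slab) be rescanned almost forever
before being written off. Invented numbers make the difference plain:

    #include <stdio.h>

    int main(void)
    {
            /* a zone that is mostly pinned memory, little of it on LRU */
            unsigned long present = 100000;
            unsigned long nr_active = 3000, nr_inactive = 2000;

            /* old: give up after scanning twice the whole zone */
            printf("old threshold: %lu\n", present * 2);
            /* new: give up after sweeping the LRU four times over */
            printf("new threshold: %lu\n",
                   (nr_active + nr_inactive) * 4);
            return 0;
    }

Here the zone is declared unreclaimable after 20000 scanned pages instead
of 200000, scaling the give-up point with what is actually scannable.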
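
Finally, the kswapd wakeup protocol: wakeup_kswapd() posts the largest
order anyone is currently waiting on, and kswapd consumes it before
deciding whether to sleep. Collected in one place from the two functions
above, with the waitqueue ceremony elided:

    /* allocator side, in wakeup_kswapd(zone, order): */
    if (pgdat->kswapd_max_order < order)
            pgdat->kswapd_max_order = order;        /* post the request */
    wake_up_interruptible(&pgdat->kswapd_wait);

    /* kswapd side, inside its for (;;) loop: */
    new_order = pgdat->kswapd_max_order;
    pgdat->kswapd_max_order = 0;                    /* consume it */
    if (order < new_order)
            order = new_order;      /* larger request pending: skip the sleep */
    else {
            schedule();             /* sleep until the next wakeup */
            order = pgdat->kswapd_max_order;
    }
    balance_pgdat(pgdat, 0, order);

This closes the window where a high-order request arrives while kswapd is
already awake: the pending order survives in kswapd_max_order and is picked
up on the next trip around the loop instead of being lost.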