Revert to Fedora kernel-2.6.17-1.2187_FC5 patched with vs2.0.2.1; there are too many...
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 37951ba..840092d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -34,7 +34,6 @@
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
 #include <linux/delay.h>
-#include <linux/kthread.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -47,6 +46,8 @@ struct scan_control {
        /* Incremented by the number of inactive pages that were scanned */
        unsigned long nr_scanned;
 
+       unsigned long nr_mapped;        /* From page_state */
+
        /* This context's GFP mask */
        gfp_t gfp_mask;
 
@@ -60,10 +61,6 @@ struct scan_control {
         * In this context, it doesn't matter that we scan the
         * whole list at once. */
        int swap_cluster_max;
-
-       int swappiness;
-
-       int all_unreclaimable;
 };
 
 /*
@@ -111,7 +108,7 @@ struct shrinker {
  * From 0 .. 100.  Higher means more swappy.
  */
 int vm_swappiness = 60;
-long vm_total_pages;   /* The total number of pages which the VM controls */
+static long total_memory;
 
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
@@ -217,7 +214,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
                                break;
                        if (shrink_ret < nr_before)
                                ret += nr_before - shrink_ret;
-                       count_vm_events(SLABS_SCANNED, this_scan);
+                       mod_page_state(slabs_scanned, this_scan);
                        total_scan -= this_scan;
 
                        cond_resched();
@@ -291,23 +288,11 @@ static void handle_write_error(struct address_space *mapping,
        unlock_page(page);
 }
 
-/* possible outcome of pageout() */
-typedef enum {
-       /* failed to write page out, page is locked */
-       PAGE_KEEP,
-       /* move page to the active list, page is locked */
-       PAGE_ACTIVATE,
-       /* page has been sent to the disk successfully, page is unlocked */
-       PAGE_SUCCESS,
-       /* page is clean and locked */
-       PAGE_CLEAN,
-} pageout_t;
-
 /*
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
  */
-static pageout_t pageout(struct page *page, struct address_space *mapping)
+pageout_t pageout(struct page *page, struct address_space *mapping)
 {
        /*
         * If the page is dirty, only perform writeback if that write
@@ -352,8 +337,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
                struct writeback_control wbc = {
                        .sync_mode = WB_SYNC_NONE,
                        .nr_to_write = SWAP_CLUSTER_MAX,
-                       .range_start = 0,
-                       .range_end = LLONG_MAX,
                        .nonblocking = 1,
                        .for_reclaim = 1,
                };
@@ -571,7 +554,7 @@ keep:
        list_splice(&ret_pages, page_list);
        if (pagevec_count(&freed_pvec))
                __pagevec_release_nonlru(&freed_pvec);
-       count_vm_events(PGACTIVATE, pgactivate);
+       mod_page_state(pgactivate, pgactivate);
        return nr_reclaimed;
 }
 
@@ -661,11 +644,11 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
                nr_reclaimed += nr_freed;
                local_irq_disable();
                if (current_is_kswapd()) {
-                       __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
-                       __count_vm_events(KSWAPD_STEAL, nr_freed);
+                       __mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
+                       __mod_page_state(kswapd_steal, nr_freed);
                } else
-                       __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
-               __count_vm_events(PGACTIVATE, nr_freed);
+                       __mod_page_state_zone(zone, pgscan_direct, nr_scan);
+               __mod_page_state_zone(zone, pgsteal, nr_freed);
 
                if (nr_taken == 0)
                        goto done;
@@ -697,25 +680,6 @@ done:
        return nr_reclaimed;
 }
 
-/*
- * We are about to scan this zone at a certain priority level.  If that priority
- * level is smaller (ie: more urgent) than the previous priority, then note
- * that priority level within the zone.  This is done so that when the next
- * process comes in to scan this zone, it will immediately start out at this
- * priority level rather than having to build up its own scanning priority.
- * Here, this priority affects only the reclaim-mapped threshold.
- */
-static inline void note_zone_scanning_priority(struct zone *zone, int priority)
-{
-       if (priority < zone->prev_priority)
-               zone->prev_priority = priority;
-}
-
-static inline int zone_is_near_oom(struct zone *zone)
-{
-       return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3;
-}
-
 /*
  * This moves pages from the active list to the inactive list.
  *
@@ -734,7 +698,7 @@ static inline int zone_is_near_oom(struct zone *zone)
  * But we had to alter page->flags anyway.
  */
 static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
-                               struct scan_control *sc, int priority)
+                               struct scan_control *sc)
 {
        unsigned long pgmoved;
        int pgdeactivate = 0;
@@ -751,14 +715,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                long distress;
                long swap_tendency;
 
-               if (zone_is_near_oom(zone))
-                       goto force_reclaim_mapped;
-
                /*
                 * `distress' is a measure of how much trouble we're having
                 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
                 */
-               distress = 100 >> min(zone->prev_priority, priority);
+               distress = 100 >> zone->prev_priority;
 
                /*
                 * The point of this algorithm is to decide when to start
@@ -766,9 +727,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                 * how much memory
                 * is mapped.
                 */
-               mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
-                               global_page_state(NR_ANON_PAGES)) * 100) /
-                                       vm_total_pages;
+               mapped_ratio = (sc->nr_mapped * 100) / total_memory;
 
                /*
                 * Now decide how much we really want to unmap some pages.  The
@@ -782,14 +741,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                 * A 100% value of vm_swappiness overrides this algorithm
                 * altogether.
                 */
-               swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
+               swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
 
                /*
                 * Now use this metric to decide whether to start moving mapped
                 * memory onto the inactive list.
                 */
                if (swap_tendency >= 100)
-force_reclaim_mapped:
                        reclaim_mapped = 1;
        }
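
Aside, not part of the patch: the hunk above restores the pre-2.6.18 reclaim-mapped heuristic, which sums reclaim `distress', the fraction of memory that is mapped, and vm_swappiness, and only lets shrink_active_list() deactivate mapped pages once the sum reaches 100.  A runnable userspace sketch of the restored formula, using made-up numbers (prev_priority 6, 30% of pages mapped, the default swappiness of 60); the values are illustrative, not taken from the patch:

/* Illustrative only -- mirrors the restored formula, not kernel code. */
#include <stdio.h>

int main(void)
{
	int prev_priority = 6;                   /* assumed zone->prev_priority */
	long nr_mapped = 30, total_memory = 100; /* assume 30% of pages are mapped */
	int vm_swappiness = 60;                  /* the default set in this file */

	long distress = 100 >> prev_priority;                 /* 100 >> 6 == 1 */
	long mapped_ratio = (nr_mapped * 100) / total_memory; /* 30 */
	long swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;

	/* 15 + 1 + 60 == 76, below 100: leave mapped pages on the active list */
	printf("swap_tendency=%ld reclaim_mapped=%d\n",
	       swap_tendency, swap_tendency >= 100 ? 1 : 0);
	return 0;
}

As prev_priority falls toward 0 the distress term grows to 100 (100 >> 0), so sustained reclaim pressure pushes swap_tendency past 100 even when little memory is mapped, and mapped pages start being moved to the inactive list as well.  total_memory, restored further down in kswapd_init(), is initialised from nr_free_pagecache_pages().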
 
@@ -866,10 +824,11 @@ force_reclaim_mapped:
                }
        }
        zone->nr_active += pgmoved;
+       spin_unlock(&zone->lru_lock);
 
-       __count_zone_vm_events(PGREFILL, zone, pgscanned);
-       __count_vm_events(PGDEACTIVATE, pgdeactivate);
-       spin_unlock_irq(&zone->lru_lock);
+       __mod_page_state_zone(zone, pgrefill, pgscanned);
+       __mod_page_state(pgdeactivate, pgdeactivate);
+       local_irq_enable();
 
        pagevec_release(&pvec);
 }
@@ -910,7 +869,7 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
                        nr_to_scan = min(nr_active,
                                        (unsigned long)sc->swap_cluster_max);
                        nr_active -= nr_to_scan;
-                       shrink_active_list(nr_to_scan, zone, sc, priority);
+                       shrink_active_list(nr_to_scan, zone, sc);
                }
 
                if (nr_inactive) {
@@ -950,7 +909,6 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
        unsigned long nr_reclaimed = 0;
        int i;
 
-       sc->all_unreclaimable = 1;
        for (i = 0; zones[i] != NULL; i++) {
                struct zone *zone = zones[i];
 
@@ -960,13 +918,13 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
                if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
                        continue;
 
-               note_zone_scanning_priority(zone, priority);
+               zone->temp_priority = priority;
+               if (zone->prev_priority > priority)
+                       zone->prev_priority = priority;
 
                if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                        continue;       /* Let kswapd poll it */
 
-               sc->all_unreclaimable = 0;
-
                nr_reclaimed += shrink_zone(priority, zone, sc);
        }
        return nr_reclaimed;
@@ -999,10 +957,9 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
                .may_writepage = !laptop_mode,
                .swap_cluster_max = SWAP_CLUSTER_MAX,
                .may_swap = 1,
-               .swappiness = vm_swappiness,
        };
 
-       count_vm_event(ALLOCSTALL);
+       inc_page_state(allocstall);
 
        for (i = 0; zones[i] != NULL; i++) {
                struct zone *zone = zones[i];
@@ -1010,10 +967,12 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
                if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
                        continue;
 
+               zone->temp_priority = DEF_PRIORITY;
                lru_pages += zone->nr_active + zone->nr_inactive;
        }
 
        for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+               sc.nr_mapped = read_page_state(nr_mapped);
                sc.nr_scanned = 0;
                if (!priority)
                        disable_swap_token();
@@ -1046,27 +1005,14 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
                if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
                        blk_congestion_wait(WRITE, HZ/10);
        }
-       /* top priority shrink_caches still had more to do? don't OOM, then */
-       if (!sc.all_unreclaimable || nr_reclaimed)
-               ret = 1;
-
 out:
-       /*
-        * Now that we've scanned all the zones at this priority level, note
-        * that level within the zone so that the next thread which performs
-        * scanning of this zone will immediately start out at this priority
-        * level.  This affects only the decision whether or not to bring
-        * mapped pages onto the inactive list.
-        */
-       if (priority < 0)
-               priority = 0;
        for (i = 0; zones[i] != 0; i++) {
                struct zone *zone = zones[i];
 
                if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
                        continue;
 
-               zone->prev_priority = priority;
+               zone->prev_priority = zone->temp_priority;
        }
        return ret;
 }
@@ -1075,6 +1021,10 @@ out:
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at pages_high.
  *
+ * If `nr_pages' is non-zero then it is the number of pages which are to be
+ * reclaimed, regardless of the zone occupancies.  This is a software suspend
+ * special.
+ *
  * Returns the number of pages which were actually freed.
  *
  * There is special handling here for zones which are full of pinned pages.
@@ -1092,8 +1042,10 @@ out:
  * the page allocator fallback scheme to ensure that aging of pages is balanced
  * across the zones.
  */
-static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
+                               int order)
 {
+       unsigned long to_free = nr_pages;
        int all_zones_ok;
        int priority;
        int i;
@@ -1103,23 +1055,22 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
                .may_swap = 1,
-               .swap_cluster_max = SWAP_CLUSTER_MAX,
-               .swappiness = vm_swappiness,
+               .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX,
        };
-       /*
-        * temp_priority is used to remember the scanning priority at which
-        * this zone was successfully refilled to free_pages == pages_high.
-        */
-       int temp_priority[MAX_NR_ZONES];
 
 loop_again:
        total_scanned = 0;
        nr_reclaimed = 0;
        sc.may_writepage = !laptop_mode;
-       count_vm_event(PAGEOUTRUN);
+       sc.nr_mapped = read_page_state(nr_mapped);
 
-       for (i = 0; i < pgdat->nr_zones; i++)
-               temp_priority[i] = DEF_PRIORITY;
+       inc_page_state(pageoutrun);
+
+       for (i = 0; i < pgdat->nr_zones; i++) {
+               struct zone *zone = pgdat->node_zones + i;
+
+               zone->temp_priority = DEF_PRIORITY;
+       }
 
        for (priority = DEF_PRIORITY; priority >= 0; priority--) {
                int end_zone = 0;       /* Inclusive.  0 = ZONE_DMA */
@@ -1131,26 +1082,31 @@ loop_again:
 
                all_zones_ok = 1;
 
-               /*
-                * Scan in the highmem->dma direction for the highest
-                * zone which needs scanning
-                */
-               for (i = pgdat->nr_zones - 1; i >= 0; i--) {
-                       struct zone *zone = pgdat->node_zones + i;
+               if (nr_pages == 0) {
+                       /*
+                        * Scan in the highmem->dma direction for the highest
+                        * zone which needs scanning
+                        */
+                       for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+                               struct zone *zone = pgdat->node_zones + i;
 
-                       if (!populated_zone(zone))
-                               continue;
+                               if (!populated_zone(zone))
+                                       continue;
 
-                       if (zone->all_unreclaimable && priority != DEF_PRIORITY)
-                               continue;
+                               if (zone->all_unreclaimable &&
+                                               priority != DEF_PRIORITY)
+                                       continue;
 
-                       if (!zone_watermark_ok(zone, order, zone->pages_high,
-                                              0, 0)) {
-                               end_zone = i;
-                               goto scan;
+                               if (!zone_watermark_ok(zone, order,
+                                               zone->pages_high, 0, 0)) {
+                                       end_zone = i;
+                                       goto scan;
+                               }
                        }
+                       goto out;
+               } else {
+                       end_zone = pgdat->nr_zones - 1;
                }
-               goto out;
 scan:
                for (i = 0; i <= end_zone; i++) {
                        struct zone *zone = pgdat->node_zones + i;
@@ -1177,12 +1133,15 @@ scan:
                        if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                                continue;
 
-                       if (!zone_watermark_ok(zone, order, zone->pages_high,
-                                              end_zone, 0))
-                               all_zones_ok = 0;
-                       temp_priority[i] = priority;
+                       if (nr_pages == 0) {    /* Not software suspend */
+                               if (!zone_watermark_ok(zone, order,
+                                               zone->pages_high, end_zone, 0))
+                                       all_zones_ok = 0;
+                       }
+                       zone->temp_priority = priority;
+                       if (zone->prev_priority > priority)
+                               zone->prev_priority = priority;
                        sc.nr_scanned = 0;
-                       note_zone_scanning_priority(zone, priority);
                        nr_reclaimed += shrink_zone(priority, zone, &sc);
                        reclaim_state->reclaimed_slab = 0;
                        nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
@@ -1192,7 +1151,7 @@ scan:
                        if (zone->all_unreclaimable)
                                continue;
                        if (nr_slab == 0 && zone->pages_scanned >=
-                                   (zone->nr_active + zone->nr_inactive) * 6)
+                                   (zone->nr_active + zone->nr_inactive) * 4)
                                zone->all_unreclaimable = 1;
                        /*
                         * If we've done a decent amount of scanning and
@@ -1203,6 +1162,8 @@ scan:
                            total_scanned > nr_reclaimed + nr_reclaimed / 2)
                                sc.may_writepage = 1;
                }
+               if (nr_pages && to_free > nr_reclaimed)
+                       continue;       /* swsusp: need to do more work */
                if (all_zones_ok)
                        break;          /* kswapd: all done */
                /*
@@ -1218,19 +1179,14 @@ scan:
                 * matches the direct reclaim path behaviour in terms of impact
                 * on zone->*_priority.
                 */
-               if (nr_reclaimed >= SWAP_CLUSTER_MAX)
+               if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages)
                        break;
        }
 out:
-       /*
-        * Note within each zone the priority level at which this zone was
-        * brought into a happy state.  So that the next thread which scans this
-        * zone will start out at that priority level.
-        */
        for (i = 0; i < pgdat->nr_zones; i++) {
                struct zone *zone = pgdat->node_zones + i;
 
-               zone->prev_priority = temp_priority[i];
+               zone->prev_priority = zone->temp_priority;
        }
        if (!all_zones_ok) {
                cond_resched();
@@ -1264,6 +1220,7 @@ static int kswapd(void *p)
        };
        cpumask_t cpumask;
 
+       daemonize("kswapd%d", pgdat->node_id);
        cpumask = node_to_cpumask(pgdat->node_id);
        if (!cpus_empty(cpumask))
                set_cpus_allowed(tsk, cpumask);
@@ -1304,7 +1261,7 @@ static int kswapd(void *p)
                }
                finish_wait(&pgdat->kswapd_wait, &wait);
 
-               balance_pgdat(pgdat, order);
+               balance_pgdat(pgdat, 0, order);
        }
        return 0;
 }
@@ -1333,152 +1290,35 @@ void wakeup_kswapd(struct zone *zone, int order)
 
 #ifdef CONFIG_PM
 /*
- * Helper function for shrink_all_memory().  Tries to reclaim 'nr_pages' pages
- * from LRU lists system-wide, for given pass and priority, and returns the
- * number of reclaimed pages
- *
- * For pass > 3 we also try to shrink the LRU lists that contain a few pages
- */
-static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
-                                     int prio, struct scan_control *sc)
-{
-       struct zone *zone;
-       unsigned long nr_to_scan, ret = 0;
-
-       for_each_zone(zone) {
-
-               if (!populated_zone(zone))
-                       continue;
-
-               if (zone->all_unreclaimable && prio != DEF_PRIORITY)
-                       continue;
-
-               /* For pass = 0 we don't shrink the active list */
-               if (pass > 0) {
-                       zone->nr_scan_active += (zone->nr_active >> prio) + 1;
-                       if (zone->nr_scan_active >= nr_pages || pass > 3) {
-                               zone->nr_scan_active = 0;
-                               nr_to_scan = min(nr_pages, zone->nr_active);
-                               shrink_active_list(nr_to_scan, zone, sc, prio);
-                       }
-               }
-
-               zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
-               if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
-                       zone->nr_scan_inactive = 0;
-                       nr_to_scan = min(nr_pages, zone->nr_inactive);
-                       ret += shrink_inactive_list(nr_to_scan, zone, sc);
-                       if (ret >= nr_pages)
-                               return ret;
-               }
-       }
-
-       return ret;
-}
-
-/*
- * Try to free `nr_pages' of memory, system-wide, and return the number of
- * freed pages.
- *
- * Rather than trying to age LRUs the aim is to preserve the overall
- * LRU order by reclaiming preferentially
- * inactive > active > active referenced > active mapped
+ * Try to free `nr_pages' of memory, system-wide.  Returns the number of freed
+ * pages.
  */
 unsigned long shrink_all_memory(unsigned long nr_pages)
 {
-       unsigned long lru_pages, nr_slab;
+       pg_data_t *pgdat;
+       unsigned long nr_to_free = nr_pages;
        unsigned long ret = 0;
-       int pass;
-       struct reclaim_state reclaim_state;
-       struct zone *zone;
-       struct scan_control sc = {
-               .gfp_mask = GFP_KERNEL,
-               .may_swap = 0,
-               .swap_cluster_max = nr_pages,
-               .may_writepage = 1,
-               .swappiness = vm_swappiness,
+       unsigned retry = 2;
+       struct reclaim_state reclaim_state = {
+               .reclaimed_slab = 0,
        };
 
        current->reclaim_state = &reclaim_state;
-
-       lru_pages = 0;
-       for_each_zone(zone)
-               lru_pages += zone->nr_active + zone->nr_inactive;
-
-       nr_slab = global_page_state(NR_SLAB);
-       /* If slab caches are huge, it's better to hit them first */
-       while (nr_slab >= lru_pages) {
-               reclaim_state.reclaimed_slab = 0;
-               shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
-               if (!reclaim_state.reclaimed_slab)
+repeat:
+       for_each_online_pgdat(pgdat) {
+               unsigned long freed;
+
+               freed = balance_pgdat(pgdat, nr_to_free, 0);
+               ret += freed;
+               nr_to_free -= freed;
+               if ((long)nr_to_free <= 0)
                        break;
-
-               ret += reclaim_state.reclaimed_slab;
-               if (ret >= nr_pages)
-                       goto out;
-
-               nr_slab -= reclaim_state.reclaimed_slab;
        }
-
-       /*
-        * We try to shrink LRUs in 5 passes:
-        * 0 = Reclaim from inactive_list only
-        * 1 = Reclaim from active list but don't reclaim mapped
-        * 2 = 2nd pass of type 1
-        * 3 = Reclaim mapped (normal reclaim)
-        * 4 = 2nd pass of type 3
-        */
-       for (pass = 0; pass < 5; pass++) {
-               int prio;
-
-               /* Needed for shrinking slab caches later on */
-               if (!lru_pages)
-                       for_each_zone(zone) {
-                               lru_pages += zone->nr_active;
-                               lru_pages += zone->nr_inactive;
-                       }
-
-               /* Force reclaiming mapped pages in the passes #3 and #4 */
-               if (pass > 2) {
-                       sc.may_swap = 1;
-                       sc.swappiness = 100;
-               }
-
-               for (prio = DEF_PRIORITY; prio >= 0; prio--) {
-                       unsigned long nr_to_scan = nr_pages - ret;
-
-                       sc.nr_scanned = 0;
-                       ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
-                       if (ret >= nr_pages)
-                               goto out;
-
-                       reclaim_state.reclaimed_slab = 0;
-                       shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
-                       ret += reclaim_state.reclaimed_slab;
-                       if (ret >= nr_pages)
-                               goto out;
-
-                       if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
-                               blk_congestion_wait(WRITE, HZ / 10);
-               }
-
-               lru_pages = 0;
+       if (retry-- && ret < nr_pages) {
+               blk_congestion_wait(WRITE, HZ/5);
+               goto repeat;
        }
-
-       /*
-        * If ret = 0, we could not shrink LRUs, but there may be something
-        * in slab caches
-        */
-       if (!ret)
-               do {
-                       reclaim_state.reclaimed_slab = 0;
-                       shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
-                       ret += reclaim_state.reclaimed_slab;
-               } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
-
-out:
        current->reclaim_state = NULL;
-
        return ret;
 }
 #endif
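
Aside, not part of the patch: after this revert balance_pgdat() again has two callers with different intents, both visible above.  kswapd calls balance_pgdat(pgdat, 0, order) and keeps going until every zone is back above pages_high, while the software-suspend path calls shrink_all_memory(nr_pages), which walks every node with balance_pgdat(pgdat, nr_to_free, 0), reclaiming a fixed page count regardless of zone watermarks and retrying twice with blk_congestion_wait() in between.  A hypothetical caller of the restored interface; free_enough_memory() is a made-up name and this is a sketch, not code from any kernel tree:

/* Hypothetical, illustration only -- not part of this patch. */
static int free_enough_memory(unsigned long pages_needed)
{
	unsigned long freed;

	while (pages_needed) {
		freed = shrink_all_memory(pages_needed);
		if (!freed)
			return -ENOMEM;	/* reclaim made no progress */
		pages_needed -= min(freed, pages_needed);
	}
	return 0;
}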
@@ -1488,7 +1328,7 @@ out:
    not required for correctness.  So if the last cpu in a node goes
    away, we get changed to run anywhere: as the first one comes back,
    restore their cpu bindings. */
-static int __devinit cpu_callback(struct notifier_block *nfb,
+static int cpu_callback(struct notifier_block *nfb,
                                  unsigned long action, void *hcpu)
 {
        pg_data_t *pgdat;
@@ -1506,35 +1346,21 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-/*
- * This kswapd start function will be called by init and node-hot-add.
- * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
- */
-int kswapd_run(int nid)
-{
-       pg_data_t *pgdat = NODE_DATA(nid);
-       int ret = 0;
-
-       if (pgdat->kswapd)
-               return 0;
-
-       pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
-       if (IS_ERR(pgdat->kswapd)) {
-               /* failure at boot is fatal */
-               BUG_ON(system_state == SYSTEM_BOOTING);
-               printk("Failed to start kswapd on node %d\n",nid);
-               ret = -1;
-       }
-       return ret;
-}
-
 static int __init kswapd_init(void)
 {
-       int nid;
+       pg_data_t *pgdat;
 
        swap_setup();
-       for_each_online_node(nid)
-               kswapd_run(nid);
+       for_each_online_pgdat(pgdat) {
+               pid_t pid;
+
+               pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
+               BUG_ON(pid < 0);
+               read_lock(&tasklist_lock);
+               pgdat->kswapd = find_task_by_real_pid(pid);
+               read_unlock(&tasklist_lock);
+       }
+       total_memory = nr_free_pagecache_pages();
        hotcpu_notifier(cpu_callback, 0);
        return 0;
 }
@@ -1547,6 +1373,10 @@ module_init(kswapd_init)
  *
  * If non-zero call zone_reclaim when the number of free pages falls below
  * the watermarks.
+ *
+ * In the future we may add flags to the mode. However, the page allocator
+ * should only have to check that zone_reclaim_mode != 0 before calling
+ * zone_reclaim().
  */
 int zone_reclaim_mode __read_mostly;
 
@@ -1554,6 +1384,12 @@ int zone_reclaim_mode __read_mostly;
 #define RECLAIM_ZONE (1<<0)    /* Run shrink_cache on the zone */
 #define RECLAIM_WRITE (1<<1)   /* Writeout pages during reclaim */
 #define RECLAIM_SWAP (1<<2)    /* Swap pages out during reclaim */
+#define RECLAIM_SLAB (1<<3)    /* Do a global slab shrink if the zone is out of memory */
+
+/*
+ * Minimum time between zone reclaim scans
+ */
+int zone_reclaim_interval __read_mostly = 30*HZ;
 
 /*
  * Priority for ZONE_RECLAIM. This determines the fraction of pages
@@ -1562,18 +1398,6 @@ int zone_reclaim_mode __read_mostly;
  */
 #define ZONE_RECLAIM_PRIORITY 4
 
-/*
- * Percentage of pages in a zone that must be unmapped for zone_reclaim to
- * occur.
- */
-int sysctl_min_unmapped_ratio = 1;
-
-/*
- * If the number of slab pages in a zone grows beyond this percentage then
- * slab reclaim needs to occur.
- */
-int sysctl_min_slab_ratio = 5;
-
 /*
  * Try to free up some pages from this zone through reclaim.
  */
@@ -1588,10 +1412,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
        struct scan_control sc = {
                .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
                .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+               .nr_mapped = read_page_state(nr_mapped),
                .swap_cluster_max = max_t(unsigned long, nr_pages,
                                        SWAP_CLUSTER_MAX),
                .gfp_mask = gfp_mask,
-               .swappiness = vm_swappiness,
        };
 
        disable_swap_token();
@@ -1605,42 +1429,43 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;
 
-       if (zone_page_state(zone, NR_FILE_PAGES) -
-               zone_page_state(zone, NR_FILE_MAPPED) >
-               zone->min_unmapped_ratio) {
-               /*
-                * Free memory by calling shrink zone with increasing
-                * priorities until we have enough memory freed.
-                */
-               priority = ZONE_RECLAIM_PRIORITY;
-               do {
-                       note_zone_scanning_priority(zone, priority);
-                       nr_reclaimed += shrink_zone(priority, zone, &sc);
-                       priority--;
-               } while (priority >= 0 && nr_reclaimed < nr_pages);
-       }
+       /*
+        * Free memory by calling shrink zone with increasing priorities
+        * until we have enough memory freed.
+        */
+       priority = ZONE_RECLAIM_PRIORITY;
+       do {
+               nr_reclaimed += shrink_zone(priority, zone, &sc);
+               priority--;
+       } while (priority >= 0 && nr_reclaimed < nr_pages);
 
-       if (zone_page_state(zone, NR_SLAB) > zone->min_slab_pages) {
+       if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
                /*
                 * shrink_slab() does not currently allow us to determine how
-                * many pages were freed in this zone. So we take the current
-                * number of slab pages and shake the slab until it is reduced
-                * by the same nr_pages that we used for reclaiming unmapped
-                * pages.
+                * many pages were freed in this zone. So we just shake the slab
+                * a bit and then go off node for this particular allocation
+                * despite possibly having freed enough memory to allocate in
+                * this zone.  If we freed local memory then the next
+                * allocations will be local again.
                 *
-                * Note that shrink_slab will free memory on all zones and may
-                * take a long time.
+                * shrink_slab will free memory on all zones and may take
+                * a long time.
                 */
-               unsigned long limit = zone_page_state(zone,
-                               NR_SLAB) - nr_pages;
-
-               while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
-                       zone_page_state(zone, NR_SLAB) > limit)
-                       ;
+               shrink_slab(sc.nr_scanned, gfp_mask, order);
        }
 
        p->reclaim_state = NULL;
        current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
+
+       if (nr_reclaimed == 0) {
+               /*
+                * We were unable to reclaim enough pages to stay on node.  We
+                * now allow off node accesses for a certain time period before
+                * trying again to reclaim pages from the local zone.
+                */
+               zone->last_unsuccessful_zone_reclaim = jiffies;
+       }
+
        return nr_reclaimed >= nr_pages;
 }
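
Aside, not part of the patch: the hunks above restore the bit-flag form of zone_reclaim_mode (RECLAIM_ZONE, RECLAIM_WRITE, RECLAIM_SWAP, RECLAIM_SLAB), the 30*HZ zone_reclaim_interval, and the per-zone last_unsuccessful_zone_reclaim stamp that __zone_reclaim() sets when it frees nothing.  A runnable userspace model of the resulting gating logic; may_zone_reclaim() is a made-up helper, the real check sits at the top of zone_reclaim() in the final hunk, and the model uses a plain compare where the kernel uses time_before() to cope with jiffies wrap:

/* Illustrative userspace model; names mirror the patch but nothing here is
 * kernel code. */
#include <stdio.h>

#define RECLAIM_ZONE  (1<<0)
#define RECLAIM_WRITE (1<<1)
#define RECLAIM_SWAP  (1<<2)
#define RECLAIM_SLAB  (1<<3)

#define HZ 1000

static unsigned long jiffies;				/* fake clock */
static unsigned long last_unsuccessful_zone_reclaim;	/* per-"zone" stamp */
static int zone_reclaim_mode = RECLAIM_ZONE | RECLAIM_WRITE;
static int zone_reclaim_interval = 30 * HZ;

static int may_zone_reclaim(void)
{
	if (!zone_reclaim_mode)
		return 0;	/* page allocator never calls zone_reclaim() */
	/* Back off for zone_reclaim_interval after a failed attempt. */
	if (jiffies < last_unsuccessful_zone_reclaim +
			(unsigned long)zone_reclaim_interval)
		return 0;
	return 1;
}

int main(void)
{
	last_unsuccessful_zone_reclaim = 0;	/* a reclaim attempt just failed */
	jiffies = 10UL * HZ;	/* 10s later: still inside the interval */
	printf("t=10s  local reclaim allowed: %d\n", may_zone_reclaim());
	jiffies = 31UL * HZ;	/* 31s later: try the local zone again */
	printf("t=31s  local reclaim allowed: %d\n", may_zone_reclaim());
	return 0;
}

With zone_reclaim_mode set to RECLAIM_ZONE | RECLAIM_WRITE (value 3), local reclaim may also write out dirty pages; leaving it at 0 means the page allocator never calls zone_reclaim() at all, as the comment above the definition notes.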
 
@@ -1650,20 +1475,14 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
        int node_id;
 
        /*
-        * Zone reclaim reclaims unmapped file backed pages and
-        * slab pages if we are over the defined limits.
-        *
-        * A small portion of unmapped file backed pages is needed for
-        * file I/O otherwise pages read by file I/O will be immediately
-        * thrown out if the zone is overallocated. So we do not reclaim
-        * if less than a specified percentage of the zone is used by
-        * unmapped file backed pages.
+        * Do not reclaim if there was a recent unsuccessful attempt at zone
+        * reclaim.  In that case we let allocations go off node for the
+        * zone_reclaim_interval.  Otherwise we would scan for each off-node
+        * page allocation.
         */
-       if (zone_page_state(zone, NR_FILE_PAGES) -
-           zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio
-           && zone_page_state(zone, NR_SLAB)
-                       <= zone->min_slab_pages)
-               return 0;
+       if (time_before(jiffies,
+               zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
+                       return 0;
 
        /*
         * Avoid concurrent zone reclaims, do not reclaim in a zone that does