Patched to 2.6.10-1.14_FC2.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 152299c..58b13c1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
 #include <linux/topology.h>
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
+#include <linux/ckrm_mem_inline.h>
 #include <linux/vs_base.h>
 #include <linux/vs_limit.h>
+#include <linux/nodemask.h>
 
 #include <asm/tlbflush.h>
 
-DECLARE_BITMAP(node_online_map, MAX_NUMNODES);
+nodemask_t node_online_map = NODE_MASK_NONE;
+nodemask_t node_possible_map = NODE_MASK_ALL;
 struct pglist_data *pgdat_list;
 unsigned long totalram_pages;
 unsigned long totalhigh_pages;
@@ -62,8 +65,8 @@ EXPORT_SYMBOL(zone_table);
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
 
-static unsigned long __initdata nr_kernel_pages;
-static unsigned long __initdata nr_all_pages;
+unsigned long __initdata nr_kernel_pages;
+unsigned long __initdata nr_all_pages;
 
 /*
  * Temporary debugging check for pages not lying within a given zone.
@@ -83,9 +86,9 @@ static void bad_page(const char *function, struct page *page)
 {
        printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
                function, current->comm, page);
-       printk(KERN_EMERG "flags:0x%08lx mapping:%p mapcount:%d count:%d\n",
-               (unsigned long)page->flags, page->mapping,
-               (int)page->mapcount, page_count(page));
+       printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d (%s)\n",
+               (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags,
+               page->mapping, page_mapcount(page), page_count(page), print_tainted());
        printk(KERN_EMERG "Backtrace:\n");
        dump_stack();
        printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
@@ -94,13 +97,12 @@ static void bad_page(const char *function, struct page *page)
                        1 << PG_lru     |
                        1 << PG_active  |
                        1 << PG_dirty   |
-                       1 << PG_maplock |
-                       1 << PG_anon    |
                        1 << PG_swapcache |
                        1 << PG_writeback);
        set_page_count(page, 0);
+       reset_page_mapcount(page);
        page->mapping = NULL;
-       page->mapcount = 0;
+       tainted |= TAINT_BAD_PAGE;
 }
 
 #if !defined(CONFIG_HUGETLB_PAGE) && !defined(CONFIG_CRASH_DUMP) \
@@ -237,8 +239,6 @@ static inline void free_pages_check(const char *function, struct page *page)
                        1 << PG_active  |
                        1 << PG_reclaim |
                        1 << PG_slab    |
-                       1 << PG_maplock |
-                       1 << PG_anon    |
                        1 << PG_swapcache |
                        1 << PG_writeback )))
                bad_page(function, page);
@@ -276,6 +276,7 @@ free_pages_bulk(struct zone *zone, int count,
                /* have to delete it as __free_pages_bulk list manipulates */
                list_del(&page->lru);
                __free_pages_bulk(page, base, zone, area, order);
+               ckrm_clear_page_class(page);
                ret++;
        }
        spin_unlock_irqrestore(&zone->lock, flags);
@@ -360,16 +361,20 @@ static void prep_new_page(struct page *page, int order)
                        1 << PG_active  |
                        1 << PG_dirty   |
                        1 << PG_reclaim |
-                       1 << PG_maplock |
-                       1 << PG_anon    |
                        1 << PG_swapcache |
                        1 << PG_writeback )))
                bad_page(__FUNCTION__, page);
 
        page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
                        1 << PG_referenced | 1 << PG_arch_1 |
+#ifdef CONFIG_CKRM_RES_MEM
+                       1 << PG_ckrm_account |
+#endif
                        1 << PG_checked | 1 << PG_mappedtodisk);
        page->private = 0;
+#ifdef CONFIG_CKRM_RES_MEM
+       page->ckrm_zone = NULL;
+#endif
        set_page_refs(page, order);
 }
 
@@ -523,6 +528,8 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
 
        kernel_map_pages(page, 1, 0);
        inc_page_state(pgfree);
+       if (PageAnon(page))
+               page->mapping = NULL;
        free_pages_check(__FUNCTION__, page);
        pcp = &zone->pageset[get_cpu()].pcp[cold];
        local_irq_save(flags);
@@ -612,83 +619,79 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 {
        const int wait = gfp_mask & __GFP_WAIT;
        unsigned long min;
-       struct zone **zones;
+       struct zone **zones, *z;
        struct page *page;
        struct reclaim_state reclaim_state;
        struct task_struct *p = current;
        int i;
        int alloc_type;
        int do_retry;
+       int can_try_harder;
 
        might_sleep_if(wait);
 
+       /*
+        * The caller may dip into page reserves a bit more if the caller
+        * cannot run direct reclaim, or if the caller has realtime scheduling
+        * policy
+        */
+       can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
+
+       if (!ckrm_class_limit_ok((ckrm_get_mem_class(current)))) {
+               return NULL;
+       }
+
        zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
-       if (zones[0] == NULL)     /* no zones in the zonelist */
+
+       if (unlikely(zones[0] == NULL)) {
+               /* Should this ever happen?? */
                return NULL;
+       }
 
        alloc_type = zone_idx(zones[0]);
 
        /* Go through the zonelist once, looking for a zone with enough free */
-       for (i = 0; zones[i] != NULL; i++) {
-               struct zone *z = zones[i];
+       for (i = 0; (z = zones[i]) != NULL; i++) {
+               min = z->pages_low + (1<<order) + z->protection[alloc_type];
 
-               min = (1<<order) + z->protection[alloc_type];
-
-               /*
-                * We let real-time tasks dip their real-time paws a little
-                * deeper into reserves.
-                */
-               if (rt_task(p))
-                       min -= z->pages_low >> 1;
+               if (z->free_pages < min)
+                       continue;
 
-               if (z->free_pages >= min ||
-                               (!wait && z->free_pages >= z->pages_high)) {
-                       page = buffered_rmqueue(z, order, gfp_mask);
-                       if (page) {
-                               zone_statistics(zonelist, z);
-                               goto got_pg;
-                       }
-               }
+               page = buffered_rmqueue(z, order, gfp_mask);
+               if (page)
+                       goto got_pg;
        }
 
-       /* we're somewhat low on memory, failed to find what we needed */
-       for (i = 0; zones[i] != NULL; i++)
-               wakeup_kswapd(zones[i]);
-
-       /* Go through the zonelist again, taking __GFP_HIGH into account */
-       for (i = 0; zones[i] != NULL; i++) {
-               struct zone *z = zones[i];
-
-               min = (1<<order) + z->protection[alloc_type];
+       for (i = 0; (z = zones[i]) != NULL; i++)
+               wakeup_kswapd(z);
 
+       /*
+        * Go through the zonelist again. Let __GFP_HIGH and allocations
+        * coming from realtime tasks go deeper into reserves
+        */
+       for (i = 0; (z = zones[i]) != NULL; i++) {
+               min = z->pages_min;
                if (gfp_mask & __GFP_HIGH)
-                       min -= z->pages_low >> 2;
-               if (rt_task(p))
-                       min -= z->pages_low >> 1;
+                       min /= 2;
+               if (can_try_harder)
+                       min -= min / 4;
+               min += (1<<order) + z->protection[alloc_type];
 
-               if (z->free_pages >= min ||
-                               (!wait && z->free_pages >= z->pages_high)) {
-                       page = buffered_rmqueue(z, order, gfp_mask);
-                       if (page) {
-                               zone_statistics(zonelist, z);
-                               goto got_pg;
-                       }
-               }
-       }
+               if (z->free_pages < min)
+                       continue;
 
-       /* here we're in the low on memory slow path */
+               page = buffered_rmqueue(z, order, gfp_mask);
+               if (page)
+                       goto got_pg;
+       }
 
-rebalance:
+       /* This allocation should allow future memory freeing. */
        if ((p->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) {
                /* go through the zonelist yet again, ignoring mins */
-               for (i = 0; zones[i] != NULL; i++) {
-                       struct zone *z = zones[i];
-
+               for (i = 0; (z = zones[i]) != NULL; i++) {
                        page = buffered_rmqueue(z, order, gfp_mask);
-                       if (page) {
-                               zone_statistics(zonelist, z);
+                       if (page)
                                goto got_pg;
-                       }
                }
                goto nopage;
        }
@@ -697,6 +700,8 @@ rebalance:
        if (!wait)
                goto nopage;
 
+rebalance:
+       /* We now go into synchronous reclaim */
        p->flags |= PF_MEMALLOC;
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;
@@ -707,27 +712,28 @@ rebalance:
        p->flags &= ~PF_MEMALLOC;
 
        /* go through the zonelist yet one more time */
-       for (i = 0; zones[i] != NULL; i++) {
-               struct zone *z = zones[i];
+       for (i = 0; (z = zones[i]) != NULL; i++) {
+               min = z->pages_min;
+               if (gfp_mask & __GFP_HIGH)
+                       min /= 2;
+               if (can_try_harder)
+                       min -= min / 4;
+               min += (1<<order) + z->protection[alloc_type];
 
-               min = (1UL << order) + z->protection[alloc_type];
+               if (z->free_pages < min)
+                       continue;
 
-               if (z->free_pages >= min ||
-                               (!wait && z->free_pages >= z->pages_high)) {
-                       page = buffered_rmqueue(z, order, gfp_mask);
-                       if (page) {
-                               zone_statistics(zonelist, z);
-                               goto got_pg;
-                       }
-               }
+               page = buffered_rmqueue(z, order, gfp_mask);
+               if (page)
+                       goto got_pg;
        }
 
        /*
         * Don't let big-order allocations loop unless the caller explicitly
         * requests that.  Wait for some write requests to complete then retry.
         *
-        * In this implementation, __GFP_REPEAT means __GFP_NOFAIL, but that
-        * may not be true in other implementations.
+        * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
+        * <= 3, but that may not be true in other implementations.
         */
        do_retry = 0;
        if (!(gfp_mask & __GFP_NORETRY)) {
@@ -750,6 +756,7 @@ nopage:
        }
        return NULL;
 got_pg:
+       zone_statistics(zonelist, z);
        kernel_map_pages(page, 1 << order, 1);
        return page;
 }
@@ -814,8 +821,8 @@ EXPORT_SYMBOL(__free_pages);
 fastcall void free_pages(unsigned long addr, unsigned int order)
 {
        if (addr != 0) {
-               BUG_ON(!virt_addr_valid(addr));
-               __free_pages(virt_to_page(addr), order);
+               BUG_ON(!virt_addr_valid((void *)addr));
+               __free_pages(virt_to_page((void *)addr), order);
        }
 }
 
@@ -977,18 +984,36 @@ unsigned long __read_page_state(unsigned offset)
        return ret;
 }
 
+void __get_zone_counts(unsigned long *active, unsigned long *inactive,
+                       unsigned long *free, struct pglist_data *pgdat)
+{
+       struct zone *zones = pgdat->node_zones;
+       int i;
+
+       *active = 0;
+       *inactive = 0;
+       *free = 0;
+       for (i = 0; i < MAX_NR_ZONES; i++) {
+               *active += zones[i].nr_active;
+               *inactive += zones[i].nr_inactive;
+               *free += zones[i].free_pages;
+       }
+}
+
 void get_zone_counts(unsigned long *active,
                unsigned long *inactive, unsigned long *free)
 {
-       struct zone *zone;
+       struct pglist_data *pgdat;
 
        *active = 0;
        *inactive = 0;
        *free = 0;
-       for_each_zone(zone) {
-               *active += zone->nr_active;
-               *inactive += zone->nr_inactive;
-               *free += zone->free_pages;
+       for_each_pgdat(pgdat) {
+               unsigned long l, m, n;
+               __get_zone_counts(&l, &m, &n, pgdat);
+               *active += l;
+               *inactive += m;
+               *free += n;
        }
 }
 
@@ -1100,6 +1125,8 @@ void show_free_areas(void)
                        " active:%lukB"
                        " inactive:%lukB"
                        " present:%lukB"
+                       " pages_scanned:%lu"
+                       " all_unreclaimable? %s"
                        "\n",
                        zone->name,
                        K(zone->free_pages),
@@ -1108,7 +1135,9 @@ void show_free_areas(void)
                        K(zone->pages_high),
                        K(zone->nr_active),
                        K(zone->nr_inactive),
-                       K(zone->present_pages)
+                       K(zone->present_pages),
+                       zone->pages_scanned,
+                       (zone->all_unreclaimable ? "yes" : "no")
                        );
                printk("protections[]:");
                for (i = 0; i < MAX_NR_ZONES; i++)
@@ -1206,6 +1235,12 @@ static int __init find_next_best_node(int node, void *used_node_mask)
                if (test_bit(n, used_node_mask))
                        continue;
 
+               /* Use the local node if we haven't already */
+               if (!test_bit(node, used_node_mask)) {
+                       best_node = node;
+                       break;
+               }
+
                /* Use the distance array to find the distance */
                val = node_distance(node, n);
 
@@ -1393,14 +1428,16 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
  * up by free_all_bootmem() once the early boot process is
  * done. Non-atomic initialization, single-pass.
  */
-void __init memmap_init_zone(struct page *start, unsigned long size, int nid,
-               unsigned long zone, unsigned long start_pfn)
+void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+               unsigned long start_pfn)
 {
+       struct page *start = pfn_to_page(start_pfn);
        struct page *page;
 
        for (page = start; page < (start + size); page++) {
                set_page_zone(page, NODEZONE(nid, zone));
                set_page_count(page, 0);
+               reset_page_mapcount(page);
                SetPageReserved(page);
                INIT_LIST_HEAD(&page->lru);
 #ifdef WANT_PAGE_VIRTUAL
@@ -1412,9 +1449,55 @@ void __init memmap_init_zone(struct page *start, unsigned long size, int nid,
        }
 }
 
+/*
+ * Page buddy system uses "index >> (i+1)", where "index" is
+ * at most "size-1".
+ *
+ * The extra "+3" is to round down to byte size (8 bits per byte
+ * assumption). Thus we get "(size-1) >> (i+4)" as the last byte
+ * we can access.
+ *
+ * The "+1" is because we want to round the byte allocation up
+ * rather than down. So we should have had a "+7" before we shifted
+ * down by three. Also, we have to add one as we actually _use_ the
+ * last bit (it's [0,n] inclusive, not [0,n[).
+ *
+ * So we actually had +7+1 before we shift down by 3. But
+ * (n+8) >> 3 == (n >> 3) + 1 (modulo overflows, which we do not have).
+ *
+ * Finally, we LONG_ALIGN because all bitmap operations are on longs.
+ */
+unsigned long pages_to_bitmap_size(unsigned long order, unsigned long nr_pages)
+{
+       unsigned long bitmap_size;
+
+       bitmap_size = (nr_pages-1) >> (order+4);
+       bitmap_size = LONG_ALIGN(bitmap_size+1);
+
+       return bitmap_size;
+}
+
+void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, unsigned long size)
+{
+       int order;
+       for (order = 0; ; order++) {
+               unsigned long bitmap_size;
+
+               INIT_LIST_HEAD(&zone->free_area[order].free_list);
+               if (order == MAX_ORDER-1) {
+                       zone->free_area[order].map = NULL;
+                       break;
+               }
+
+               bitmap_size = pages_to_bitmap_size(order, size);
+               zone->free_area[order].map =
+                 (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
+       }
+}
+
 #ifndef __HAVE_ARCH_MEMMAP_INIT
-#define memmap_init(start, size, nid, zone, start_pfn) \
-       memmap_init_zone((start), (size), (nid), (zone), (start_pfn))
+#define memmap_init(size, nid, zone, start_pfn) \
+       memmap_init_zone((size), (nid), (zone), (start_pfn))
 #endif
 
 /*
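
The derivation in the comment above compresses several rounding steps; a quick userspace cross-check (illustrative only -- LONG_ALIGN and BITS_PER_LONG are redefined locally, nothing here is kernel API) shows it matches the more direct "one bit per 2^(order+1) pages, rounded up to whole longs" formula:

#include <stdio.h>

#define BITS_PER_LONG   (8 * sizeof(long))
#define LONG_ALIGN(x)   (((x) + sizeof(long) - 1) & ~(sizeof(long) - 1))

/* Copy of the helper introduced above. */
static unsigned long pages_to_bitmap_size(unsigned long order,
                                          unsigned long nr_pages)
{
        unsigned long bitmap_size;

        bitmap_size = (nr_pages - 1) >> (order + 4);
        bitmap_size = LONG_ALIGN(bitmap_size + 1);
        return bitmap_size;
}

/* The "obvious" version: one bit per pair of order-"order" buddies,
 * i.e. ceil(nr_pages / 2^(order+1)) bits, rounded up to whole longs. */
static unsigned long naive_bitmap_size(unsigned long order,
                                       unsigned long nr_pages)
{
        unsigned long bits = ((nr_pages - 1) >> (order + 1)) + 1;
        unsigned long longs = (bits + BITS_PER_LONG - 1) / BITS_PER_LONG;

        return longs * sizeof(long);
}

int main(void)
{
        unsigned long sizes[] = { 4096, 65536, 1UL << 20 };
        unsigned long order, i;

        for (i = 0; i < 3; i++)
                for (order = 0; order < 10; order++)
                        printf("pages=%-8lu order=%lu  kernel=%5lu  naive=%5lu\n",
                               sizes[i], order,
                               pages_to_bitmap_size(order, sizes[i]),
                               naive_bitmap_size(order, sizes[i]));
        return 0;
}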
@@ -1429,7 +1512,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
        unsigned long i, j;
        const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
        int cpu, nid = pgdat->node_id;
-       struct page *lmem_map = pgdat->node_mem_map;
        unsigned long zone_start_pfn = pgdat->node_start_pfn;
 
        pgdat->nr_zones = 0;
@@ -1492,8 +1574,10 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
                }
                printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
                                zone_names[j], realsize, batch);
+#ifndef CONFIG_CKRM_RES_MEM
                INIT_LIST_HEAD(&zone->active_list);
                INIT_LIST_HEAD(&zone->inactive_list);
+#endif
                zone->nr_scan_active = 0;
                zone->nr_scan_inactive = 0;
                zone->nr_active = 0;
@@ -1517,71 +1601,41 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 
                pgdat->nr_zones = j+1;
 
-               zone->zone_mem_map = lmem_map;
+               zone->zone_mem_map = pfn_to_page(zone_start_pfn);
                zone->zone_start_pfn = zone_start_pfn;
 
                if ((zone_start_pfn) & (zone_required_alignment-1))
                        printk("BUG: wrong zone alignment, it will crash\n");
 
-               memmap_init(lmem_map, size, nid, j, zone_start_pfn);
+               memmap_init(size, nid, j, zone_start_pfn);
 
                zone_start_pfn += size;
-               lmem_map += size;
-
-               for (i = 0; ; i++) {
-                       unsigned long bitmap_size;
-
-                       INIT_LIST_HEAD(&zone->free_area[i].free_list);
-                       if (i == MAX_ORDER-1) {
-                               zone->free_area[i].map = NULL;
-                               break;
-                       }
 
-                       /*
-                        * Page buddy system uses "index >> (i+1)",
-                        * where "index" is at most "size-1".
-                        *
-                        * The extra "+3" is to round down to byte
-                        * size (8 bits per byte assumption). Thus
-                        * we get "(size-1) >> (i+4)" as the last byte
-                        * we can access.
-                        *
-                        * The "+1" is because we want to round the
-                        * byte allocation up rather than down. So
-                        * we should have had a "+7" before we shifted
-                        * down by three. Also, we have to add one as
-                        * we actually _use_ the last bit (it's [0,n]
-                        * inclusive, not [0,n[).
-                        *
-                        * So we actually had +7+1 before we shift
-                        * down by 3. But (n+8) >> 3 == (n >> 3) + 1
-                        * (modulo overflows, which we do not have).
-                        *
-                        * Finally, we LONG_ALIGN because all bitmap
-                        * operations are on longs.
-                        */
-                       bitmap_size = (size-1) >> (i+4);
-                       bitmap_size = LONG_ALIGN(bitmap_size+1);
-                       zone->free_area[i].map = 
-                         (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
-               }
+               zone_init_free_lists(pgdat, zone, zone->spanned_pages);
        }
 }
 
-void __init free_area_init_node(int nid, struct pglist_data *pgdat,
-               struct page *node_mem_map, unsigned long *zones_size,
-               unsigned long node_start_pfn, unsigned long *zholes_size)
+void __init node_alloc_mem_map(struct pglist_data *pgdat)
 {
        unsigned long size;
 
+       size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
+       pgdat->node_mem_map = alloc_bootmem_node(pgdat, size);
+#ifndef CONFIG_DISCONTIGMEM
+       mem_map = contig_page_data.node_mem_map;
+#endif
+}
+
+void __init free_area_init_node(int nid, struct pglist_data *pgdat,
+               unsigned long *zones_size, unsigned long node_start_pfn,
+               unsigned long *zholes_size)
+{
        pgdat->node_id = nid;
        pgdat->node_start_pfn = node_start_pfn;
        calculate_zone_totalpages(pgdat, zones_size, zholes_size);
-       if (!node_mem_map) {
-               size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
-               node_mem_map = alloc_bootmem_node(pgdat, size);
-       }
-       pgdat->node_mem_map = node_mem_map;
+
+       if (!pfn_to_page(node_start_pfn))
+               node_alloc_mem_map(pgdat);
 
        free_area_init_core(pgdat, zones_size, zholes_size);
 }
@@ -1594,9 +1648,8 @@ EXPORT_SYMBOL(contig_page_data);
 
 void __init free_area_init(unsigned long *zones_size)
 {
-       free_area_init_node(0, &contig_page_data, NULL, zones_size,
+       free_area_init_node(0, &contig_page_data, zones_size,
                        __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
-       mem_map = contig_page_data.node_mem_map;
 }
 #endif
 
@@ -1855,11 +1908,11 @@ static void setup_per_zone_protection(void)
                                 * We never protect zones that don't have memory
                                 * in them (j>max_zone) or zones that aren't in
                                 * the zonelists for a certain type of
-                                * allocation (j>i).  We have to assign these to
-                                * zero because the lower zones take
+                                * allocation (j>=i).  We have to assign these
+                                * to zero because the lower zones take
                                 * contributions from the higher zones.
                                 */
-                               if (j > max_zone || j > i) {
+                               if (j > max_zone || j >= i) {
                                        zone->protection[i] = 0;
                                        continue;
                                }
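
With the condition tightened from j > i to j >= i, a zone no longer carries a protection value against its own allocation class -- only zones strictly below the class being allocated from keep a non-zero protection[i] (and the next hunk drops the pages_low term from protection[], since __alloc_pages() now adds a per-zone watermark alongside protection[alloc_type] itself). A small illustration of which entries get forced to zero, assuming the usual three-zone layout with every zone populated (max_zone = 2); this is a reading aid, not patch code:

#include <stdio.h>

int main(void)
{
        const char *name[] = { "DMA", "Normal", "HighMem" };
        int max_zone = 2;       /* assume every zone has memory */
        int i, j;               /* i = allocation class, j = zone index */

        for (j = 0; j < 3; j++)
                for (i = 0; i < 3; i++)
                        printf("zone %-7s protection[%-7s]: old=%s new=%s\n",
                               name[j], name[i],
                               (j > max_zone || j >  i) ? "0" : "set",
                               (j > max_zone || j >= i) ? "0" : "set");
        return 0;
}

Only the diagonal entries (j == i) change: they used to get a protection value and are now zero.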
@@ -1868,7 +1921,6 @@ static void setup_per_zone_protection(void)
                                 */
                                zone->protection[i] = higherzone_val(zone,
                                                                max_zone, i);
-                               zone->protection[i] += zone->pages_low;
                        }
                }
        }
@@ -1917,8 +1969,12 @@ static void setup_per_zone_pages_min(void)
                                           lowmem_pages;
                }
 
-               zone->pages_low = zone->pages_min * 2;
-               zone->pages_high = zone->pages_min * 3;
+               /*
+                * When interpreting these watermarks, just keep in mind that:
+                * zone->pages_min == (zone->pages_min * 4) / 4;
+                */
+               zone->pages_low   = (zone->pages_min * 5) / 4;
+               zone->pages_high  = (zone->pages_min * 6) / 4;
                spin_unlock_irqrestore(&zone->lru_lock, flags);
        }
 }
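
In other words, the watermarks move from a 1:2:3 spread to a much tighter 4:5:6 spread around pages_min. A one-file comparison with an arbitrary pages_min:

#include <stdio.h>

int main(void)
{
        unsigned long pages_min = 1024;         /* arbitrary example value */

        printf("old: min=%lu low=%lu high=%lu\n",
               pages_min, pages_min * 2, pages_min * 3);
        printf("new: min=%lu low=%lu high=%lu\n",
               pages_min, (pages_min * 5) / 4, (pages_min * 6) / 4);
        return 0;
}

The gap between the watermarks shrinks; the absolute size of the reserve grows instead through the min_free_kbytes change in the next hunk.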
@@ -1927,24 +1983,25 @@ static void setup_per_zone_pages_min(void)
  * Initialise min_free_kbytes.
  *
  * For small machines we want it small (128k min).  For large machines
- * we want it large (16MB max).  But it is not linear, because network
+ * we want it large (64MB max).  But it is not linear, because network
  * bandwidth does not increase linearly with machine size.  We use
  *
- *     min_free_kbytes = sqrt(lowmem_kbytes)
+ *     min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
+ *     min_free_kbytes = sqrt(lowmem_kbytes * 16)
  *
  * which yields
  *
- * 16MB:       128k
- * 32MB:       181k
- * 64MB:       256k
- * 128MB:      362k
- * 256MB:      512k
- * 512MB:      724k
- * 1024MB:     1024k
- * 2048MB:     1448k
- * 4096MB:     2048k
- * 8192MB:     2896k
- * 16384MB:    4096k
+ * 16MB:       512k
+ * 32MB:       724k
+ * 64MB:       1024k
+ * 128MB:      1448k
+ * 256MB:      2048k
+ * 512MB:      2896k
+ * 1024MB:     4096k
+ * 2048MB:     5792k
+ * 4096MB:     8192k
+ * 8192MB:     11584k
+ * 16384MB:    16384k
  */
 static int __init init_per_zone_pages_min(void)
 {
@@ -1952,11 +2009,11 @@ static int __init init_per_zone_pages_min(void)
 
        lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
 
-       min_free_kbytes = int_sqrt(lowmem_kbytes);
+       min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
        if (min_free_kbytes < 128)
                min_free_kbytes = 128;
-       if (min_free_kbytes > 16384)
-               min_free_kbytes = 16384;
+       if (min_free_kbytes > 65536)
+               min_free_kbytes = 65536;
        setup_per_zone_pages_min();
        setup_per_zone_protection();
        return 0;
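
The table in the comment above follows directly from min_free_kbytes = int_sqrt(lowmem_kbytes * 16), clamped to [128, 65536]. A userspace sketch that reproduces it (int_sqrt here is a plain integer square root standing in for the kernel helper, and lowmem_kbytes is taken as the whole machine size, as the table assumes):

#include <stdio.h>

/* Integer square root by linear search -- fine for these magnitudes. */
static unsigned long int_sqrt(unsigned long x)
{
        unsigned long r = 0;

        while ((r + 1) * (r + 1) <= x)
                r++;
        return r;
}

int main(void)
{
        unsigned long mb;

        for (mb = 16; mb <= 16384; mb *= 2) {
                unsigned long lowmem_kbytes = mb * 1024;
                unsigned long min_free_kbytes = int_sqrt(lowmem_kbytes * 16);

                if (min_free_kbytes < 128)
                        min_free_kbytes = 128;
                if (min_free_kbytes > 65536)
                        min_free_kbytes = 65536;
                printf("%6luMB: %6luk\n", mb, min_free_kbytes);
        }
        return 0;
}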
@@ -2003,41 +2060,40 @@ void *__init alloc_large_system_hash(const char *tablename,
                                     unsigned int *_hash_shift,
                                     unsigned int *_hash_mask)
 {
-       unsigned long mem, max, log2qty, size;
+       unsigned long long max;
+       unsigned long log2qty, size;
        void *table;
 
-       /* round applicable memory size up to nearest megabyte */
-       mem = consider_highmem ? nr_all_pages : nr_kernel_pages;
-       mem += (1UL << (20 - PAGE_SHIFT)) - 1;
-       mem >>= 20 - PAGE_SHIFT;
-       mem <<= 20 - PAGE_SHIFT;
-
-       /* limit to 1 bucket per 2^scale bytes of low memory (rounded up to
-        * nearest power of 2 in size) */
-       if (scale > PAGE_SHIFT)
-               mem >>= (scale - PAGE_SHIFT);
-       else
-               mem <<= (PAGE_SHIFT - scale);
-
-       mem = 1UL << (long_log2(mem) + 1);
+       /* allow the kernel cmdline to have a say */
+       if (!numentries) {
+               /* round applicable memory size up to nearest megabyte */
+               numentries = consider_highmem ? nr_all_pages : nr_kernel_pages;
+               numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
+               numentries >>= 20 - PAGE_SHIFT;
+               numentries <<= 20 - PAGE_SHIFT;
+
+               /* limit to 1 bucket per 2^scale bytes of low memory */
+               if (scale > PAGE_SHIFT)
+                       numentries >>= (scale - PAGE_SHIFT);
+               else
+                       numentries <<= (PAGE_SHIFT - scale);
+       }
+       /* rounded up to nearest power of 2 in size */
+       numentries = 1UL << (long_log2(numentries) + 1);
 
-       /* limit allocation size */
-       max = (1UL << (PAGE_SHIFT + MAX_SYS_HASH_TABLE_ORDER)) / bucketsize;
-       if (max > mem)
-               max = mem;
+       /* limit allocation size to 1/16 total memory */
+       max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
+       do_div(max, bucketsize);
 
-       /* allow the kernel cmdline to have a say */
-       if (!numentries || numentries > max)
+       if (numentries > max)
                numentries = max;
 
        log2qty = long_log2(numentries);
 
        do {
                size = bucketsize << log2qty;
-
-               table = (void *) alloc_bootmem(size);
-
-       } while (!table && size > PAGE_SIZE);
+               table = alloc_bootmem(size);
+       } while (!table && size > PAGE_SIZE && --log2qty);
 
        if (!table)
                panic("Failed to allocate %s hash table\n", tablename);
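
Summarizing the reworked sizing: a default bucket count is derived from low memory only when the command line gave no numentries, rounded up to a power of two, capped at 1/16 of total memory, and the retry loop now actually shrinks the table (--log2qty) between failed bootmem attempts instead of retrying at the same size. A userspace sketch of just the sizing arithmetic (consider_highmem/nr_kernel_pages are folded into a single low_pages parameter, do_div becomes plain division, long_log2 is reimplemented, and all input values are examples):

#include <stdio.h>

#define PAGE_SHIFT      12

/* floor(log2(x)), standing in for the kernel's long_log2() */
static unsigned long long_log2(unsigned long x)
{
        unsigned long r = 0;

        while (x >>= 1)
                r++;
        return r;
}

/* Sizing arithmetic only -- no allocation. */
static unsigned long hash_entries(unsigned long numentries,
                                  unsigned long low_pages,
                                  unsigned long all_pages,
                                  int scale, unsigned long bucketsize)
{
        unsigned long long max;

        if (!numentries) {
                /* round applicable memory size up to nearest megabyte */
                numentries = low_pages;
                numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
                numentries >>= 20 - PAGE_SHIFT;
                numentries <<= 20 - PAGE_SHIFT;

                /* 1 bucket per 2^scale bytes of low memory */
                if (scale > PAGE_SHIFT)
                        numentries >>= (scale - PAGE_SHIFT);
                else
                        numentries <<= (PAGE_SHIFT - scale);
        }
        /* rounded up to nearest power of 2 in size */
        numentries = 1UL << (long_log2(numentries) + 1);

        /* limit allocation size to 1/16 total memory */
        max = ((unsigned long long)all_pages << PAGE_SHIFT) >> 4;
        max /= bucketsize;
        if (numentries > max)
                numentries = (unsigned long)max;

        return numentries;
}

int main(void)
{
        /* e.g. 1GB of low memory out of 1GB total, 16-byte buckets, scale 15 */
        printf("entries = %lu\n",
               hash_entries(0, 1UL << 18, 1UL << 18, 15, 16));
        return 0;
}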