#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
+#include <linux/cpuset.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
#include <linux/vs_limit.h>
#include <asm/tlbflush.h>
#include "internal.h"
-/* MCD - HACK: Find somewhere to initialize this EARLY, or make this initializer cleaner */
+/*
+ * MCD - HACK: Find somewhere to initialize this EARLY, or make this
+ * initializer cleaner
+ */
nodemask_t node_online_map = { { [0] = 1UL } };
+EXPORT_SYMBOL(node_online_map);
nodemask_t node_possible_map = NODE_MASK_ALL;
+EXPORT_SYMBOL(node_possible_map);
struct pglist_data *pgdat_list;
unsigned long totalram_pages;
unsigned long totalhigh_pages;
long nr_swap_pages;
+
/*
* results with 256, 32 in the lowmem_reserve sysctl:
* 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
page->private = 0;
}
+/*
+ * Locate the struct page for both the matching buddy in our
+ * pair (buddy) and the combined order O+1 page they form (page).
+ *
+ * 1) Any buddy B1 will have an order O twin B2 which satisfies
+ * the following equation:
+ * B2 = B1 ^ (1 << O)
+ * For example, if the starting buddy (B1) is #8, its order
+ * 1 buddy (B2) is #10:
+ * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
+ *
+ * 2) Any buddy B will have an order O+1 parent P which
+ * satisfies the following equation:
+ * P = B & ~(1 << O)
+ *
+ * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
+ */
+/*
+ * Return the struct page of @page's buddy at the given @order.
+ *
+ * @page:     page whose buddy is wanted
+ * @page_idx: index of @page; only its relation to the buddy index matters
+ * @order:    order of the pair being examined
+ *
+ * The buddy index is @page_idx with bit @order flipped.  The result is
+ * @page moved by the difference between the two indices, so both pages
+ * are assumed to sit in one contiguous run of struct page.
+ */
+static inline struct page *
+__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
+{
+	/* Shift in unsigned long width so the bit is never formed in a signed int. */
+	unsigned long buddy_idx = page_idx ^ (1UL << order);
+
+	return page + (buddy_idx - page_idx);
+}
+
+/*
+ * Return the index of the order + 1 block that contains @page_idx,
+ * i.e. @page_idx with bit @order cleared.
+ *
+ * @page_idx: index of the page (or of its buddy) at @order
+ * @order:    order of the pair being combined
+ */
+static inline unsigned long
+__find_combined_index(unsigned long page_idx, unsigned int order)
+{
+	/*
+	 * Build the mask in unsigned long width: an int-sized ~(1 << order)
+	 * only preserves the upper bits of page_idx through sign extension.
+	 */
+	return page_idx & ~(1UL << order);
+}
+
/*
* This function checks whether a page is free && is the buddy
* we can do coalesce a page and its buddy if
* -- wli
*/
-static inline void __free_pages_bulk (struct page *page, struct page *base,
+static inline void __free_pages_bulk (struct page *page, /* first page of the 2^order block being freed */
struct zone *zone, unsigned int order)
{
unsigned long page_idx;
- struct page *coalesced;
int order_size = 1 << order;
if (unlikely(order))
destroy_compound_page(page, order);
- page_idx = page - base;
+ page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); /* index taken from the pfn's low MAX_ORDER bits, so no zone base pointer is needed */
BUG_ON(page_idx & (order_size - 1));
BUG_ON(bad_range(zone, page));
zone->free_pages += order_size;
while (order < MAX_ORDER-1) {
+ unsigned long combined_idx; /* index of the merged (order + 1) block */
struct free_area *area;
struct page *buddy;
- int buddy_idx;
- buddy_idx = (page_idx ^ (1 << order));
- buddy = base + buddy_idx;
+ combined_idx = __find_combined_index(page_idx, order); /* page_idx with bit 'order' cleared */
+ buddy = __page_find_buddy(page, page_idx, order); /* candidate buddy: its index differs from page_idx only in bit 'order' */
+
if (bad_range(zone, buddy))
break;
if (!page_is_buddy(buddy, order))
- break;
- /* Move the buddy up one level. */
+ break; /* No free buddy at this order: stop merging. Otherwise fall through and move the buddy up one level. */
list_del(&buddy->lru);
area = zone->free_area + order;
area->nr_free--;
rmv_page_order(buddy);
- page_idx &= buddy_idx;
+ page = page + (combined_idx - page_idx); /* step to the first page of the merged block */
+ page_idx = combined_idx; /* continue from the merged block's index */
order++;
}
- coalesced = base + page_idx;
- set_page_order(coalesced, order);
- list_add(&coalesced->lru, &zone->free_area[order].free_list);
+ set_page_order(page, order); /* page now heads the fully merged block */
+ list_add(&page->lru, &zone->free_area[order].free_list); /* queue it on the free list of its final order */
zone->free_area[order].nr_free++;
}
static inline void free_pages_check(const char *function, struct page *page)
{
- if ( page_mapped(page) ||
+ if ( page_mapcount(page) ||
page->mapping != NULL ||
page_count(page) != 0 ||
(page->flags & (
struct list_head *list, unsigned int order)
{
unsigned long flags;
- struct page *base, *page = NULL;
+ struct page *page = NULL;
int ret = 0;
- base = zone->zone_mem_map;
spin_lock_irqsave(&zone->lock, flags);
zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
page = list_entry(list->prev, struct page, lru);
/* have to delete it as __free_pages_bulk list manipulates */
list_del(&page->lru);
- __free_pages_bulk(page, base, zone, order);
+ __free_pages_bulk(page, zone, order);
ret++;
}
spin_unlock_irqrestore(&zone->lock, flags);
*/
static void prep_new_page(struct page *page, int order)
{
- if (page->mapping || page_mapped(page) ||
+ if (page->mapping || page_mapcount(page) ||
(page->flags & (
1 << PG_private |
1 << PG_locked |
free_hot_cold_page(page, 1);
}
-static inline void prep_zero_page(struct page *page, int order, int gfp_flags)
+static inline void prep_zero_page(struct page *page, int order, unsigned int __nocast gfp_flags)
{
int i;
* or two.
*/
static struct page *
-buffered_rmqueue(struct zone *zone, int order, int gfp_flags)
+buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
{
unsigned long flags;
struct page *page = NULL;
* This is the 'heart' of the zoned buddy allocator.
*/
struct page * fastcall
-__alloc_pages(unsigned int gfp_mask, unsigned int order,
+__alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
struct zonelist *zonelist)
{
const int wait = gfp_mask & __GFP_WAIT;
classzone_idx, 0, 0))
continue;
+ if (!cpuset_zone_allowed(z))
+ continue;
+
page = buffered_rmqueue(z, order, gfp_mask);
if (page)
goto got_pg;
/*
* Go through the zonelist again. Let __GFP_HIGH and allocations
* coming from realtime tasks to go deeper into reserves
+ *
+ * This is the last chance, in general, before the goto nopage.
+ * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
*/
for (i = 0; (z = zones[i]) != NULL; i++) {
if (!zone_watermark_ok(z, order, z->pages_min,
gfp_mask & __GFP_HIGH))
continue;
+ if (wait && !cpuset_zone_allowed(z))
+ continue;
+
page = buffered_rmqueue(z, order, gfp_mask);
if (page)
goto got_pg;
}
/* This allocation should allow future memory freeing. */
- if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) {
- /* go through the zonelist yet again, ignoring mins */
- for (i = 0; (z = zones[i]) != NULL; i++) {
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
+
+ if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
+ && !in_interrupt()) {
+ if (!(gfp_mask & __GFP_NOMEMALLOC)) {
+ /* go through the zonelist yet again, ignoring mins */
+ for (i = 0; (z = zones[i]) != NULL; i++) {
+ if (!cpuset_zone_allowed(z))
+ continue;
+ page = buffered_rmqueue(z, order, gfp_mask);
+ if (page)
+ goto got_pg;
+ }
}
goto nopage;
}
gfp_mask & __GFP_HIGH))
continue;
+ if (!cpuset_zone_allowed(z))
+ continue;
+
page = buffered_rmqueue(z, order, gfp_mask);
if (page)
goto got_pg;
classzone_idx, 0, 0))
continue;
+ if (!cpuset_zone_allowed(z))
+ continue;
+
page = buffered_rmqueue(z, order, gfp_mask);
if (page)
goto got_pg;
/*
* Common helper functions.
*/
-fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
+fastcall unsigned long __get_free_pages(unsigned int __nocast gfp_mask, unsigned int order)
{
struct page * page;
page = alloc_pages(gfp_mask, order);
EXPORT_SYMBOL(__get_free_pages);
-fastcall unsigned long get_zeroed_page(unsigned int gfp_mask)
+fastcall unsigned long get_zeroed_page(unsigned int __nocast gfp_mask)
{
struct page * page;
#define MAX_NODE_LOAD (num_online_nodes())
static int __initdata node_load[MAX_NUMNODES];
/**
- * find_next_best_node - find the next node that should appear in a given
- * node's fallback list
+ * find_next_best_node - find the next node that should appear in a given node's fallback list
* @node: node whose fallback list we're appending
* @used_node_mask: nodemask_t of already used nodes
*
/* initialize zonelists */
for (i = 0; i < GFP_ZONETYPES; i++) {
zonelist = pgdat->node_zonelists + i;
- memset(zonelist, 0, sizeof(*zonelist));
zonelist->zones[0] = NULL;
}
struct zonelist *zonelist;
zonelist = pgdat->node_zonelists + i;
- memset(zonelist, 0, sizeof(*zonelist));
j = 0;
k = ZONE_NORMAL;
for_each_online_node(i)
build_zonelists(NODE_DATA(i));
printk("Built %i zonelists\n", num_online_nodes());
+ cpuset_init_current_mems_allowed();
}
/*
if (batch < 1)
batch = 1;
+ /*
+ * Clamp the batch to a 2^n - 1 value. Having a power
+ * of 2 value was found to be more likely to have
+ * suboptimal cache aliasing properties in some cases.
+ *
+ * For example if 2 tasks are alternately allocating
+ * batches of pages, one task can end up with a lot
+ * of pages of one half of the possible page colors
+ * and the other with pages of the other colors.
+ */
+ batch = (1 << fls(batch + batch/2)) - 1;
+
for (cpu = 0; cpu < NR_CPUS; cpu++) {
struct per_cpu_pages *pcp;
}
}
-void __init node_alloc_mem_map(struct pglist_data *pgdat)
+static void __init alloc_node_mem_map(struct pglist_data *pgdat) /* set up pgdat->node_mem_map and, for node 0 without DISCONTIGMEM, the global mem_map */
{
unsigned long size;
- size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
- pgdat->node_mem_map = alloc_bootmem_node(pgdat, size);
+ /* Skip empty nodes: a node that spans no pages gets no mem_map */
+ if (!pgdat->node_spanned_pages)
+ return;
+
+ /* Allocate from bootmem only if node_mem_map is not already set (ia64 gets its own node_mem_map, before this, without bootmem) */
+ if (!pgdat->node_mem_map) {
+ size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); /* NOTE(review): one entry more than node_spanned_pages; reason not visible here, confirm */
+ pgdat->node_mem_map = alloc_bootmem_node(pgdat, size);
+ }
#ifndef CONFIG_DISCONTIGMEM
- mem_map = contig_page_data.node_mem_map;
+ /*
+ * With no DISCONTIG, the global mem_map is just set to node 0's node_mem_map
+ */
+ if (pgdat == NODE_DATA(0))
+ mem_map = NODE_DATA(0)->node_mem_map;
#endif
}
pgdat->node_start_pfn = node_start_pfn;
calculate_zone_totalpages(pgdat, zones_size, zholes_size);
- if (!pfn_to_page(node_start_pfn))
- node_alloc_mem_map(pgdat);
+ alloc_node_mem_map(pgdat);
free_area_init_core(pgdat, zones_size, zholes_size);
}
"allocstall",
"pgrotated",
+ "nr_bounce",
};
static void *vmstat_start(struct seq_file *m, loff_t *pos)
for_each_pgdat(pgdat) {
for (j = 0; j < MAX_NR_ZONES; j++) {
- struct zone * zone = pgdat->node_zones + j;
+ struct zone *zone = pgdat->node_zones + j;
unsigned long present_pages = zone->present_pages;
zone->lowmem_reserve[j] = 0;
for (idx = j-1; idx >= 0; idx--) {
- struct zone * lower_zone = pgdat->node_zones + idx;
+ struct zone *lower_zone;
+
+ if (sysctl_lowmem_reserve_ratio[idx] < 1)
+ sysctl_lowmem_reserve_ratio[idx] = 1;
- lower_zone->lowmem_reserve[j] = present_pages / sysctl_lowmem_reserve_ratio[idx];
+ lower_zone = pgdat->node_zones + idx;
+ lower_zone->lowmem_reserve[j] = present_pages /
+ sysctl_lowmem_reserve_ratio[idx];
present_pages += lower_zone->present_pages;
}
}
* changes.
*/
int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+ struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec(table, write, file, buffer, length, ppos);
setup_per_zone_pages_min();
* if in function of the boot time zone sizes.
*/
int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+ struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec_minmax(table, write, file, buffer, length, ppos);
setup_per_zone_lowmem_reserve();