X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=mm%2Fpage_alloc.c;h=8bcef6812a14cd744a34fa535366eda452c99ba2;hb=c7b5ebbddf7bcd3651947760f423e3783bbe6573;hp=3fe5eb3add7181262a9833c85924e2f5a30ec7b2;hpb=a2c21200f1c81b08cb55e417b68150bba439b646;p=linux-2.6.git

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3fe5eb3ad..8bcef6812 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -57,8 +57,8 @@ EXPORT_SYMBOL(zone_table);
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
 
-static unsigned long __initdata nr_kernel_pages;
-static unsigned long __initdata nr_all_pages;
+unsigned long __initdata nr_kernel_pages;
+unsigned long __initdata nr_all_pages;
 
 /*
  * Temporary debugging check for pages not lying within a given zone.
@@ -78,9 +78,9 @@ static void bad_page(const char *function, struct page *page)
 {
 	printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
 		function, current->comm, page);
-	printk(KERN_EMERG "flags:0x%08lx mapping:%p mapcount:%d count:%d\n",
-		(unsigned long)page->flags, page->mapping,
-		(int)page->mapcount, page_count(page));
+	printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
+		(int)(2*sizeof(page_flags_t)), (unsigned long)page->flags,
+		page->mapping, page_mapcount(page), page_count(page));
 	printk(KERN_EMERG "Backtrace:\n");
 	dump_stack();
 	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
@@ -89,13 +89,11 @@ static void bad_page(const char *function, struct page *page)
 			1 << PG_lru	|
 			1 << PG_active	|
 			1 << PG_dirty	|
-			1 << PG_maplock |
-			1 << PG_anon    |
 			1 << PG_swapcache |
 			1 << PG_writeback);
 	set_page_count(page, 0);
+	reset_page_mapcount(page);
 	page->mapping = NULL;
-	page->mapcount = 0;
 }
 
 #ifndef CONFIG_HUGETLB_PAGE
@@ -231,8 +229,6 @@ static inline void free_pages_check(const char *function, struct page *page)
 			1 << PG_active	|
 			1 << PG_reclaim	|
 			1 << PG_slab	|
-			1 << PG_maplock |
-			1 << PG_anon    |
 			1 << PG_swapcache |
 			1 << PG_writeback )))
 		bad_page(function, page);
@@ -281,6 +277,8 @@ void __free_pages_ok(struct page *page, unsigned int order)
 	LIST_HEAD(list);
 	int i;
 
+	arch_free_page(page, order);
+
 	mod_page_state(pgfree, 1 << order);
 	for (i = 0 ; i < (1 << order) ; ++i)
 		free_pages_check(__FUNCTION__, page + i);
@@ -352,8 +350,6 @@ static void prep_new_page(struct page *page, int order)
 			1 << PG_active	|
 			1 << PG_dirty	|
 			1 << PG_reclaim	|
-			1 << PG_maplock |
-			1 << PG_anon    |
 			1 << PG_swapcache |
 			1 << PG_writeback )))
 		bad_page(__FUNCTION__, page);
@@ -511,8 +507,12 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
 	struct per_cpu_pages *pcp;
 	unsigned long flags;
 
+	arch_free_page(page, 0);
+
 	kernel_map_pages(page, 1, 0);
 	inc_page_state(pgfree);
+	if (PageAnon(page))
+		page->mapping = NULL;
 	free_pages_check(__FUNCTION__, page);
 	pcp = &zone->pageset[get_cpu()].pcp[cold];
 	local_irq_save(flags);
@@ -602,83 +602,75 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 {
 	const int wait = gfp_mask & __GFP_WAIT;
 	unsigned long min;
-	struct zone **zones;
+	struct zone **zones, *z;
 	struct page *page;
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
 	int i;
 	int alloc_type;
 	int do_retry;
+	int can_try_harder;
 
 	might_sleep_if(wait);
 
+	/*
+	 * The caller may dip into page reserves a bit more if the caller
+	 * cannot run direct reclaim, or is the caller has realtime scheduling
+	 * policy
+	 */
+	can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
+
 	zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
-	if (zones[0] == NULL)	/* no zones in the zonelist */
+
+	if (unlikely(zones[0] == NULL)) {
+		/* Should this ever happen?? */
 		return NULL;
+	}
 
 	alloc_type = zone_idx(zones[0]);
 
 	/* Go through the zonelist once, looking for a zone with enough free */
-	for (i = 0; zones[i] != NULL; i++) {
-		struct zone *z = zones[i];
-
-		min = (1<<order) + z->protection[alloc_type];
+	for (i = 0; (z = zones[i]) != NULL; i++) {
+		min = z->pages_low + (1<<order) + z->protection[alloc_type];
 
-		/*
-		 * We let real-time tasks dip their real-time paws a little
-		 * deeper into reserves.
-		 */
-		if (rt_task(p))
-			min -= z->pages_low >> 1;
+		if (z->free_pages < min)
+			continue;
 
-		if (z->free_pages >= min ||
-				(!wait && z->free_pages >= z->pages_high)) {
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page) {
-				zone_statistics(zonelist, z);
-				goto got_pg;
-			}
-		}
+		page = buffered_rmqueue(z, order, gfp_mask);
+		if (page)
+			goto got_pg;
 	}
 
-	/* we're somewhat low on memory, failed to find what we needed */
-	for (i = 0; zones[i] != NULL; i++)
-		wakeup_kswapd(zones[i]);
-
-	/* Go through the zonelist again, taking __GFP_HIGH into account */
-	for (i = 0; zones[i] != NULL; i++) {
-		struct zone *z = zones[i];
-
-		min = (1<<order) + z->protection[alloc_type];
+	for (i = 0; (z = zones[i]) != NULL; i++)
+		wakeup_kswapd(z);
 
+	/*
+	 * Go through the zonelist again. Let __GFP_HIGH and allocations
+	 * coming from realtime tasks to go deeper into reserves
+	 */
+	for (i = 0; (z = zones[i]) != NULL; i++) {
+		min = z->pages_min;
 		if (gfp_mask & __GFP_HIGH)
-			min -= z->pages_low >> 2;
-		if (rt_task(p))
-			min -= z->pages_low >> 1;
+			min /= 2;
+		if (can_try_harder)
+			min -= min / 4;
+		min += (1<<order) + z->protection[alloc_type];
 
-		if (z->free_pages >= min ||
-				(!wait && z->free_pages >= z->pages_high)) {
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page) {
-				zone_statistics(zonelist, z);
-				goto got_pg;
-			}
-		}
-	}
+		if (z->free_pages < min)
+			continue;
 
-	/* here we're in the low on memory slow path */
+		page = buffered_rmqueue(z, order, gfp_mask);
+		if (page)
+			goto got_pg;
+	}
 
-rebalance:
+	/* This allocation should allow future memory freeing. */
 	if ((p->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) {
 		/* go through the zonelist yet again, ignoring mins */
-		for (i = 0; zones[i] != NULL; i++) {
-			struct zone *z = zones[i];
-
+		for (i = 0; (z = zones[i]) != NULL; i++) {
 			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page) {
-				zone_statistics(zonelist, z);
+			if (page)
 				goto got_pg;
-			}
 		}
 		goto nopage;
 	}
@@ -687,6 +679,8 @@ rebalance:
 	if (!wait)
 		goto nopage;
 
+rebalance:
+	/* We now go into synchronous reclaim */
 	p->flags |= PF_MEMALLOC;
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
@@ -697,27 +691,28 @@ rebalance:
 	p->flags &= ~PF_MEMALLOC;
 
 	/* go through the zonelist yet one more time */
-	for (i = 0; zones[i] != NULL; i++) {
-		struct zone *z = zones[i];
+	for (i = 0; (z = zones[i]) != NULL; i++) {
+		min = z->pages_min;
+		if (gfp_mask & __GFP_HIGH)
+			min /= 2;
+		if (can_try_harder)
+			min -= min / 4;
+		min += (1<<order) + z->protection[alloc_type];
 
-		min = (1UL << order) + z->protection[alloc_type];
+		if (z->free_pages < min)
+			continue;
 
-		if (z->free_pages >= min ||
-				(!wait && z->free_pages >= z->pages_high)) {
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page) {
-				zone_statistics(zonelist, z);
-				goto got_pg;
-			}
-		}
+		page = buffered_rmqueue(z, order, gfp_mask);
+		if (page)
+			goto got_pg;
 	}
 
 	/*
 	 * Don't let big-order allocations loop unless the caller explicitly
 	 * requests that.  Wait for some write requests to complete then retry.
* - * In this implementation, __GFP_REPEAT means __GFP_NOFAIL, but that - * may not be true in other implementations. + * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order + * <= 3, but that may not be true in other implementations. */ do_retry = 0; if (!(gfp_mask & __GFP_NORETRY)) { @@ -740,6 +735,7 @@ nopage: } return NULL; got_pg: + zone_statistics(zonelist, z); kernel_map_pages(page, 1 << order, 1); return page; } @@ -804,8 +800,8 @@ EXPORT_SYMBOL(__free_pages); fastcall void free_pages(unsigned long addr, unsigned int order) { if (addr != 0) { - BUG_ON(!virt_addr_valid(addr)); - __free_pages(virt_to_page(addr), order); + BUG_ON(!virt_addr_valid((void *)addr)); + __free_pages(virt_to_page((void *)addr), order); } } @@ -967,18 +963,36 @@ unsigned long __read_page_state(unsigned offset) return ret; } +void __get_zone_counts(unsigned long *active, unsigned long *inactive, + unsigned long *free, struct pglist_data *pgdat) +{ + struct zone *zones = pgdat->node_zones; + int i; + + *active = 0; + *inactive = 0; + *free = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { + *active += zones[i].nr_active; + *inactive += zones[i].nr_inactive; + *free += zones[i].free_pages; + } +} + void get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free) { - struct zone *zone; + struct pglist_data *pgdat; *active = 0; *inactive = 0; *free = 0; - for_each_zone(zone) { - *active += zone->nr_active; - *inactive += zone->nr_inactive; - *free += zone->free_pages; + for_each_pgdat(pgdat) { + unsigned long l, m, n; + __get_zone_counts(&l, &m, &n, pgdat); + *active += l; + *inactive += m; + *free += n; } } @@ -1383,14 +1397,16 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat, * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. */ -void __init memmap_init_zone(struct page *start, unsigned long size, int nid, - unsigned long zone, unsigned long start_pfn) +void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, + unsigned long start_pfn) { + struct page *start = pfn_to_page(start_pfn); struct page *page; for (page = start; page < (start + size); page++) { set_page_zone(page, NODEZONE(nid, zone)); set_page_count(page, 0); + reset_page_mapcount(page); SetPageReserved(page); INIT_LIST_HEAD(&page->lru); #ifdef WANT_PAGE_VIRTUAL @@ -1402,9 +1418,55 @@ void __init memmap_init_zone(struct page *start, unsigned long size, int nid, } } +/* + * Page buddy system uses "index >> (i+1)", where "index" is + * at most "size-1". + * + * The extra "+3" is to round down to byte size (8 bits per byte + * assumption). Thus we get "(size-1) >> (i+4)" as the last byte + * we can access. + * + * The "+1" is because we want to round the byte allocation up + * rather than down. So we should have had a "+7" before we shifted + * down by three. Also, we have to add one as we actually _use_ the + * last bit (it's [0,n] inclusive, not [0,n[). + * + * So we actually had +7+1 before we shift down by 3. But + * (n+8) >> 3 == (n >> 3) + 1 (modulo overflows, which we do not have). + * + * Finally, we LONG_ALIGN because all bitmap operations are on longs. 
+ */ +unsigned long pages_to_bitmap_size(unsigned long order, unsigned long nr_pages) +{ + unsigned long bitmap_size; + + bitmap_size = (nr_pages-1) >> (order+4); + bitmap_size = LONG_ALIGN(bitmap_size+1); + + return bitmap_size; +} + +void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, unsigned long size) +{ + int order; + for (order = 0; ; order++) { + unsigned long bitmap_size; + + INIT_LIST_HEAD(&zone->free_area[order].free_list); + if (order == MAX_ORDER-1) { + zone->free_area[order].map = NULL; + break; + } + + bitmap_size = pages_to_bitmap_size(order, size); + zone->free_area[order].map = + (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size); + } +} + #ifndef __HAVE_ARCH_MEMMAP_INIT -#define memmap_init(start, size, nid, zone, start_pfn) \ - memmap_init_zone((start), (size), (nid), (zone), (start_pfn)) +#define memmap_init(size, nid, zone, start_pfn) \ + memmap_init_zone((size), (nid), (zone), (start_pfn)) #endif /* @@ -1419,7 +1481,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat, unsigned long i, j; const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); int cpu, nid = pgdat->node_id; - struct page *lmem_map = pgdat->node_mem_map; unsigned long zone_start_pfn = pgdat->node_start_pfn; pgdat->nr_zones = 0; @@ -1507,71 +1568,41 @@ static void __init free_area_init_core(struct pglist_data *pgdat, pgdat->nr_zones = j+1; - zone->zone_mem_map = lmem_map; + zone->zone_mem_map = pfn_to_page(zone_start_pfn); zone->zone_start_pfn = zone_start_pfn; if ((zone_start_pfn) & (zone_required_alignment-1)) printk("BUG: wrong zone alignment, it will crash\n"); - memmap_init(lmem_map, size, nid, j, zone_start_pfn); + memmap_init(size, nid, j, zone_start_pfn); zone_start_pfn += size; - lmem_map += size; - - for (i = 0; ; i++) { - unsigned long bitmap_size; - - INIT_LIST_HEAD(&zone->free_area[i].free_list); - if (i == MAX_ORDER-1) { - zone->free_area[i].map = NULL; - break; - } - /* - * Page buddy system uses "index >> (i+1)", - * where "index" is at most "size-1". - * - * The extra "+3" is to round down to byte - * size (8 bits per byte assumption). Thus - * we get "(size-1) >> (i+4)" as the last byte - * we can access. - * - * The "+1" is because we want to round the - * byte allocation up rather than down. So - * we should have had a "+7" before we shifted - * down by three. Also, we have to add one as - * we actually _use_ the last bit (it's [0,n] - * inclusive, not [0,n[). - * - * So we actually had +7+1 before we shift - * down by 3. But (n+8) >> 3 == (n >> 3) + 1 - * (modulo overflows, which we do not have). - * - * Finally, we LONG_ALIGN because all bitmap - * operations are on longs. 
-			 */
-			bitmap_size = (size-1) >> (i+4);
-			bitmap_size = LONG_ALIGN(bitmap_size+1);
-			zone->free_area[i].map =
-			  (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
-		}
+		zone_init_free_lists(pgdat, zone, zone->spanned_pages);
 	}
 }
 
-void __init free_area_init_node(int nid, struct pglist_data *pgdat,
-		struct page *node_mem_map, unsigned long *zones_size,
-		unsigned long node_start_pfn, unsigned long *zholes_size)
+void __init node_alloc_mem_map(struct pglist_data *pgdat)
 {
 	unsigned long size;
 
+	size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
+	pgdat->node_mem_map = alloc_bootmem_node(pgdat, size);
+#ifndef CONFIG_DISCONTIGMEM
+	mem_map = contig_page_data.node_mem_map;
+#endif
+}
+
+void __init free_area_init_node(int nid, struct pglist_data *pgdat,
+		unsigned long *zones_size, unsigned long node_start_pfn,
+		unsigned long *zholes_size)
+{
 	pgdat->node_id = nid;
 	pgdat->node_start_pfn = node_start_pfn;
 
 	calculate_zone_totalpages(pgdat, zones_size, zholes_size);
-	if (!node_mem_map) {
-		size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
-		node_mem_map = alloc_bootmem_node(pgdat, size);
-	}
-	pgdat->node_mem_map = node_mem_map;
+
+	if (!pfn_to_page(node_start_pfn))
+		node_alloc_mem_map(pgdat);
 
 	free_area_init_core(pgdat, zones_size, zholes_size);
 }
 
@@ -1584,9 +1615,8 @@ EXPORT_SYMBOL(contig_page_data);
 
 void __init free_area_init(unsigned long *zones_size)
 {
-	free_area_init_node(0, &contig_page_data, NULL, zones_size,
+	free_area_init_node(0, &contig_page_data, zones_size,
 			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
-	mem_map = contig_page_data.node_mem_map;
 }
 #endif
 
@@ -1845,11 +1875,11 @@ static void setup_per_zone_protection(void)
 				 * We never protect zones that don't have memory
 				 * in them (j>max_zone) or zones that aren't in
 				 * the zonelists for a certain type of
-				 * allocation (j>i).  We have to assign these to
-				 * zero because the lower zones take
+				 * allocation (j>=i).  We have to assign these
+				 * to zero because the lower zones take
 				 * contributions from the higher zones.
 				 */
-				if (j > max_zone || j > i) {
+				if (j > max_zone || j >= i) {
 					zone->protection[i] = 0;
 					continue;
 				}
@@ -1858,7 +1888,6 @@ static void setup_per_zone_protection(void)
 				 */
 				zone->protection[i] =
 					higherzone_val(zone, max_zone, i);
-				zone->protection[i] += zone->pages_low;
 			}
 		}
 	}
@@ -1993,41 +2022,40 @@ void *__init alloc_large_system_hash(const char *tablename,
 				     unsigned int *_hash_shift,
 				     unsigned int *_hash_mask)
 {
-	unsigned long mem, max, log2qty, size;
+	unsigned long long max;
+	unsigned long log2qty, size;
 	void *table;
 
-	/* round applicable memory size up to nearest megabyte */
-	mem = consider_highmem ? nr_all_pages : nr_kernel_pages;
-	mem += (1UL << (20 - PAGE_SHIFT)) - 1;
-	mem >>= 20 - PAGE_SHIFT;
-	mem <<= 20 - PAGE_SHIFT;
-
-	/* limit to 1 bucket per 2^scale bytes of low memory (rounded up to
-	 * nearest power of 2 in size) */
-	if (scale > PAGE_SHIFT)
-		mem >>= (scale - PAGE_SHIFT);
-	else
-		mem <<= (PAGE_SHIFT - scale);
-
-	mem = 1UL << (long_log2(mem) + 1);
+	/* allow the kernel cmdline to have a say */
+	if (!numentries) {
+		/* round applicable memory size up to nearest megabyte */
+		numentries = consider_highmem ? nr_all_pages : nr_kernel_pages;
+		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
+		numentries >>= 20 - PAGE_SHIFT;
+		numentries <<= 20 - PAGE_SHIFT;
+
+		/* limit to 1 bucket per 2^scale bytes of low memory */
+		if (scale > PAGE_SHIFT)
+			numentries >>= (scale - PAGE_SHIFT);
+		else
+			numentries <<= (PAGE_SHIFT - scale);
+	}
+	/* rounded up to nearest power of 2 in size */
+	numentries = 1UL << (long_log2(numentries) + 1);
 
-	/* limit allocation size */
-	max = (1UL << (PAGE_SHIFT + MAX_SYS_HASH_TABLE_ORDER)) / bucketsize;
-	if (max > mem)
-		max = mem;
+	/* limit allocation size to 1/16 total memory */
+	max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
+	do_div(max, bucketsize);
 
-	/* allow the kernel cmdline to have a say */
-	if (!numentries || numentries > max)
+	if (numentries > max)
 		numentries = max;
 
 	log2qty = long_log2(numentries);
 
 	do {
 		size = bucketsize << log2qty;
-
-		table = (void *) alloc_bootmem(size);
-
-	} while (!table && size > PAGE_SIZE);
+		table = alloc_bootmem(size);
+	} while (!table && size > PAGE_SIZE && --log2qty);
 
 	if (!table)
 		panic("Failed to allocate %s hash table\n", tablename);
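
A note on the reworked watermark test in __alloc_pages() above: each pass now builds a single threshold from the zone's watermark (z->pages_low on the first pass, z->pages_min afterwards), the __GFP_HIGH and can_try_harder discounts, the allocation size (1<<order) and the per-zone lowmem protection value, and simply skips any zone whose free_pages falls below it. The sketch below is not part of the patch; it only replays that arithmetic as a standalone userspace C program with made-up zone numbers.

#include <stdio.h>

/*
 * Standalone illustration of the second-pass watermark calculation used in
 * __alloc_pages() after this patch.  All zone values are hypothetical.
 */
int main(void)
{
	unsigned long pages_min = 1024;	/* hypothetical zone->pages_min */
	unsigned long protection = 256;	/* hypothetical zone->protection[alloc_type] */
	unsigned int order = 2;		/* a 4-page (order-2) allocation */
	int gfp_high = 1;		/* __GFP_HIGH was passed */
	int can_try_harder = 1;		/* realtime task, or !__GFP_WAIT */

	unsigned long min = pages_min;

	if (gfp_high)
		min /= 2;		/* __GFP_HIGH may use half of the reserve */
	if (can_try_harder)
		min -= min / 4;		/* and a further quarter on top of that */
	min += (1UL << order) + protection;

	printf("zone is usable only if free_pages >= %lu\n", min);
	return 0;
}

With these example numbers the threshold is 644 pages; the same allocation without __GFP_HIGH and with can_try_harder == 0 would need 1284 free pages.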
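The comment block above pages_to_bitmap_size() packs several rounding steps into the single expression "(nr_pages-1) >> (order+4)": one bit per pair of order-N buddies, shifted down by 3 to turn bits into bytes, rounded up, then LONG_ALIGNed. The following userspace sketch is not kernel code; it copies that arithmetic (and the usual LONG_ALIGN rounding) and prints the bitmap sizes for a made-up zone of 262144 pages (1 GiB of 4 KiB pages).

#include <stdio.h>

/* same rounding the kernel's LONG_ALIGN() performs */
#define LONG_ALIGN(x) (((x) + sizeof(long) - 1) & ~(sizeof(long) - 1))

/* mirror of pages_to_bitmap_size() from the hunk above */
static unsigned long pages_to_bitmap_size(unsigned long order,
					  unsigned long nr_pages)
{
	/* one bit per buddy pair at this order, rounded up to whole longs */
	unsigned long bitmap_size = (nr_pages - 1) >> (order + 4);

	return LONG_ALIGN(bitmap_size + 1);
}

int main(void)
{
	unsigned long nr_pages = 262144;	/* hypothetical zone->spanned_pages */
	unsigned long order;

	/* MAX_ORDER-1 (order 10 when MAX_ORDER is 11) gets no map at all */
	for (order = 0; order < 10; order++)
		printf("order %2lu: %5lu bytes of buddy bitmap\n",
		       order, pages_to_bitmap_size(order, nr_pages));
	return 0;
}

At order 0 the example zone has 131072 buddy pairs, i.e. 131072 bits, which is the 16384 bytes printed on the first line; every higher order halves that, and LONG_ALIGN keeps each map a whole number of longs.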
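The rewritten alloc_large_system_hash() uses a command-line supplied numentries as-is when it is non-zero; otherwise it derives the entry count from memory size, rounds it up to a power of two, and finally caps the table at 1/16 of total memory. The program below is only a sketch of that sizing arithmetic, with hypothetical values for page size, memory size, bucket size and scale; long_log2() is reimplemented locally rather than taken from the kernel.

#include <stdio.h>

#define PAGE_SHIFT 12				/* assume 4 KiB pages */

/* floor(log2(x)), like the kernel helper of the same name */
static unsigned long long_log2(unsigned long x)
{
	unsigned long r = 0;

	while (x >>= 1)
		r++;
	return r;
}

int main(void)
{
	unsigned long nr_all_pages = 262144;	/* hypothetical: 1 GiB of RAM */
	unsigned long bucketsize = 16;		/* bytes per hash bucket */
	unsigned int scale = 14;		/* one bucket per 16 KiB of memory */
	unsigned long numentries = 0;		/* nothing given on the command line */
	unsigned long long max;

	if (!numentries) {
		/* round memory up to a whole number of megabytes */
		numentries = nr_all_pages;
		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
		numentries >>= 20 - PAGE_SHIFT;
		numentries <<= 20 - PAGE_SHIFT;

		/* one bucket per 2^scale bytes of memory */
		if (scale > PAGE_SHIFT)
			numentries >>= (scale - PAGE_SHIFT);
		else
			numentries <<= (PAGE_SHIFT - scale);
	}
	/* round up to the next power of two */
	numentries = 1UL << (long_log2(numentries) + 1);

	/* never let the table itself consume more than 1/16 of memory */
	max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
	max /= bucketsize;
	if (numentries > max)
		numentries = max;

	printf("hash table: %lu entries, %lu bytes\n",
	       numentries, numentries * bucketsize);
	return 0;
}

For these assumed values the result is a table of 131072 entries (2 MiB), far below the 4194304-entry cap; in the kernel, the new --log2qty in the retry loop additionally halves the table each time a bootmem allocation of that size fails.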