X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=include%2Flinux%2Fmmzone.h;h=b262f47961fbe8a439cc218cd62b20122c5ac957;hb=refs%2Fheads%2Fvserver;hp=7c36a10f6720ab7a051a8a04680df7773077cebc;hpb=c7b5ebbddf7bcd3651947760f423e3783bbe6573;p=linux-2.6.git diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7c36a10f6..b262f4796 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -4,14 +4,17 @@ #ifdef __KERNEL__ #ifndef __ASSEMBLY__ -#include #include #include #include #include #include #include +#include +#include +#include #include +#include /* Free memory management - zoned buddy allocator. */ #ifndef CONFIG_FORCE_MAX_ZONEORDER @@ -19,10 +22,11 @@ #else #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER #endif +#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1)) struct free_area { struct list_head free_list; - unsigned long *map; + unsigned long nr_free; }; struct pglist_data; @@ -35,16 +39,38 @@ struct pglist_data; */ #if defined(CONFIG_SMP) struct zone_padding { - int x; -} ____cacheline_maxaligned_in_smp; + char x[0]; +} ____cacheline_internodealigned_in_smp; #define ZONE_PADDING(name) struct zone_padding name; #else #define ZONE_PADDING(name) #endif +enum zone_stat_item { + NR_ANON_PAGES, /* Mapped anonymous pages */ + NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. + only modified from process context */ + NR_FILE_PAGES, + NR_SLAB_RECLAIMABLE, + NR_SLAB_UNRECLAIMABLE, + NR_PAGETABLE, /* used for pagetables */ + NR_FILE_DIRTY, + NR_WRITEBACK, + NR_UNSTABLE_NFS, /* NFS unstable pages */ + NR_BOUNCE, + NR_VMSCAN_WRITE, +#ifdef CONFIG_NUMA + NUMA_HIT, /* allocated in intended node */ + NUMA_MISS, /* allocated in non intended node */ + NUMA_FOREIGN, /* was intended here, hit elsewhere */ + NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ + NUMA_LOCAL, /* allocation from local node */ + NUMA_OTHER, /* allocation from other node */ +#endif + NR_VM_ZONE_STAT_ITEMS }; + struct per_cpu_pages { int count; /* number of pages in the list */ - int low; /* low watermark, refill needed */ int high; /* high watermark, emptying needed */ int batch; /* chunk size for buddy add/remove */ struct list_head list; /* the list of pages */ @@ -52,84 +78,120 @@ struct per_cpu_pages { struct per_cpu_pageset { struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */ -#ifdef CONFIG_NUMA - unsigned long numa_hit; /* allocated in intended node */ - unsigned long numa_miss; /* allocated in non intended node */ - unsigned long numa_foreign; /* was intended here, hit elsewhere */ - unsigned long interleave_hit; /* interleaver prefered this zone */ - unsigned long local_node; /* allocation from local node */ - unsigned long other_node; /* allocation from other node */ +#ifdef CONFIG_SMP + s8 stat_threshold; + s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; #endif } ____cacheline_aligned_in_smp; -#define ZONE_DMA 0 -#define ZONE_NORMAL 1 -#define ZONE_HIGHMEM 2 - -#define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */ -#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */ +#ifdef CONFIG_NUMA +#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)]) +#else +#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)]) +#endif +enum zone_type { + /* + * ZONE_DMA is used when there are devices that are not able + * to do DMA to all of addressable memory (ZONE_NORMAL). Then we + * carve out the portion of memory that is needed for these devices. + * The range is arch specific. + * + * Some examples + * + * Architecture Limit + * --------------------------- + * parisc, ia64, sparc <4G + * s390 <2G + * arm26 <48M + * arm Various + * alpha Unlimited or 0-16MB. + * + * i386, x86_64 and multiple other arches + * <16M. + */ + ZONE_DMA, +#ifdef CONFIG_ZONE_DMA32 + /* + * x86_64 needs two ZONE_DMAs because it supports devices that are + * only able to do DMA to the lower 16M but also 32 bit devices that + * can only do DMA areas below 4G. + */ + ZONE_DMA32, +#endif + /* + * Normal addressable memory is in ZONE_NORMAL. DMA operations can be + * performed on pages in ZONE_NORMAL if the DMA devices support + * transfers to all addressable memory. + */ + ZONE_NORMAL, +#ifdef CONFIG_HIGHMEM + /* + * A memory area that is only addressable by the kernel through + * mapping portions into its own address space. This is for example + * used by i386 to allow the kernel to address the memory beyond + * 900MB. The kernel will set up special mappings (page + * table entries on i386) for each page that the kernel needs to + * access. + */ + ZONE_HIGHMEM, +#endif + MAX_NR_ZONES +}; /* * When a memory allocation must conform to specific limitations (such * as being suitable for DMA) the caller will pass in hints to the * allocator in the gfp_mask, in the zone modifier bits. These bits * are used to select a priority ordered list of memory zones which - * match the requested limits. GFP_ZONEMASK defines which bits within - * the gfp_mask should be considered as zone modifiers. Each valid - * combination of the zone modifier bits has a corresponding list - * of zones (in node_zonelists). Thus for two zone modifiers there - * will be a maximum of 4 (2 ** 2) zonelists, for 3 modifiers there will - * be 8 (2 ** 3) zonelists. GFP_ZONETYPES defines the number of possible - * combinations of zone modifiers in "zone modifier space". - */ -#define GFP_ZONEMASK 0x03 -/* - * As an optimisation any zone modifier bits which are only valid when - * no other zone modifier bits are set (loners) should be placed in - * the highest order bits of this field. This allows us to reduce the - * extent of the zonelists thus saving space. For example in the case - * of three zone modifier bits, we could require up to eight zonelists. - * If the left most zone modifier is a "loner" then the highest valid - * zonelist would be four allowing us to allocate only five zonelists. - * Use the first form when the left most bit is not a "loner", otherwise - * use the second. + * match the requested limits. See gfp_zone() in include/linux/gfp.h */ -/* #define GFP_ZONETYPES (GFP_ZONEMASK + 1) */ /* Non-loner */ -#define GFP_ZONETYPES ((GFP_ZONEMASK + 1) / 2 + 1) /* Loner */ -/* - * On machines where it is needed (eg PCs) we divide physical memory - * into multiple physical zones. On a PC we have 3 zones: - * - * ZONE_DMA < 16 MB ISA DMA capable memory - * ZONE_NORMAL 16-896 MB direct mapped by the kernel - * ZONE_HIGHMEM > 896 MB only page cache and user processes - */ +#if !defined(CONFIG_ZONE_DMA32) && !defined(CONFIG_HIGHMEM) +#define ZONES_SHIFT 1 +#else +#define ZONES_SHIFT 2 +#endif struct zone { - /* - * Commonly accessed fields: - */ - spinlock_t lock; + /* Fields commonly accessed by the page allocator */ unsigned long free_pages; unsigned long pages_min, pages_low, pages_high; /* - * protection[] is a pre-calculated number of extra pages that must be - * available in a zone in order for __alloc_pages() to allocate memory - * from the zone. i.e., for a GFP_KERNEL alloc of "order" there must - * be "(1<> PAGE_SHIFT */ unsigned long zone_start_pfn; /* - * rarely used fields: + * zone_start_pfn, spanned_pages and present_pages are all + * protected by span_seqlock. It is a seqlock because it has + * to be read outside of zone->lock, and it is done in the main + * allocator path. But, it is written quite infrequently. + * + * The lock is declared along with zone->lock because it is + * frequently read in proximity to zone->lock. It's good to + * give them a chance of being in the same cacheline. */ - char *name; unsigned long spanned_pages; /* total size, including holes */ unsigned long present_pages; /* amount of memory (excluding holes) */ -} ____cacheline_maxaligned_in_smp; + /* + * rarely used fields: + */ + const char *name; +} ____cacheline_internodealigned_in_smp; /* * The "priority" of VM scanning is how much of the queues we will scan in one @@ -222,21 +288,108 @@ struct zone { */ #define DEF_PRIORITY 12 +/* Maximum number of zones on a zonelist */ +#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES) + +#ifdef CONFIG_NUMA +/* + * We cache key information from each zonelist for smaller cache + * footprint when scanning for free pages in get_page_from_freelist(). + * + * 1) The BITMAP fullzones tracks which zones in a zonelist have come + * up short of free memory since the last time (last_fullzone_zap) + * we zero'd fullzones. + * 2) The array z_to_n[] maps each zone in the zonelist to its node + * id, so that we can efficiently evaluate whether that node is + * set in the current tasks mems_allowed. + * + * Both fullzones and z_to_n[] are one-to-one with the zonelist, + * indexed by a zones offset in the zonelist zones[] array. + * + * The get_page_from_freelist() routine does two scans. During the + * first scan, we skip zones whose corresponding bit in 'fullzones' + * is set or whose corresponding node in current->mems_allowed (which + * comes from cpusets) is not set. During the second scan, we bypass + * this zonelist_cache, to ensure we look methodically at each zone. + * + * Once per second, we zero out (zap) fullzones, forcing us to + * reconsider nodes that might have regained more free memory. + * The field last_full_zap is the time we last zapped fullzones. + * + * This mechanism reduces the amount of time we waste repeatedly + * reexaming zones for free memory when they just came up low on + * memory momentarilly ago. + * + * The zonelist_cache struct members logically belong in struct + * zonelist. However, the mempolicy zonelists constructed for + * MPOL_BIND are intentionally variable length (and usually much + * shorter). A general purpose mechanism for handling structs with + * multiple variable length members is more mechanism than we want + * here. We resort to some special case hackery instead. + * + * The MPOL_BIND zonelists don't need this zonelist_cache (in good + * part because they are shorter), so we put the fixed length stuff + * at the front of the zonelist struct, ending in a variable length + * zones[], as is needed by MPOL_BIND. + * + * Then we put the optional zonelist cache on the end of the zonelist + * struct. This optional stuff is found by a 'zlcache_ptr' pointer in + * the fixed length portion at the front of the struct. This pointer + * both enables us to find the zonelist cache, and in the case of + * MPOL_BIND zonelists, (which will just set the zlcache_ptr to NULL) + * to know that the zonelist cache is not there. + * + * The end result is that struct zonelists come in two flavors: + * 1) The full, fixed length version, shown below, and + * 2) The custom zonelists for MPOL_BIND. + * The custom MPOL_BIND zonelists have a NULL zlcache_ptr and no zlcache. + * + * Even though there may be multiple CPU cores on a node modifying + * fullzones or last_full_zap in the same zonelist_cache at the same + * time, we don't lock it. This is just hint data - if it is wrong now + * and then, the allocator will still function, perhaps a bit slower. + */ + + +struct zonelist_cache { + unsigned short z_to_n[MAX_ZONES_PER_ZONELIST]; /* zone->nid */ + DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST); /* zone full? */ + unsigned long last_full_zap; /* when last zap'd (jiffies) */ +}; +#else +struct zonelist_cache; +#endif + /* * One allocation request operates on a zonelist. A zonelist * is a list of zones, the first one is the 'goal' of the * allocation, the other zones are fallback zones, in decreasing * priority. * - * Right now a zonelist takes up less than a cacheline. We never - * modify it apart from boot-up, and only a few indices are used, - * so despite the zonelist table being relatively big, the cache - * footprint of this construct is very small. + * If zlcache_ptr is not NULL, then it is just the address of zlcache, + * as explained above. If zlcache_ptr is NULL, there is no zlcache. */ + struct zonelist { - struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited + struct zonelist_cache *zlcache_ptr; // NULL or &zlcache + struct zone *zones[MAX_ZONES_PER_ZONELIST + 1]; // NULL delimited +#ifdef CONFIG_NUMA + struct zonelist_cache zlcache; // optional ... +#endif }; +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +struct node_active_region { + unsigned long start_pfn; + unsigned long end_pfn; + int nid; +}; +#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ + +#ifndef CONFIG_DISCONTIGMEM +/* The array of struct pages - for discontigmem use pgdat->lmem_map */ +extern struct page *mem_map; +#endif /* * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM @@ -252,98 +405,93 @@ struct zonelist { struct bootmem_data; typedef struct pglist_data { struct zone node_zones[MAX_NR_ZONES]; - struct zonelist node_zonelists[GFP_ZONETYPES]; + struct zonelist node_zonelists[MAX_NR_ZONES]; int nr_zones; +#ifdef CONFIG_FLAT_NODE_MEM_MAP struct page *node_mem_map; +#endif struct bootmem_data *bdata; +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * Must be held any time you expect node_start_pfn, node_present_pages + * or node_spanned_pages stay constant. Holding this will also + * guarantee that any pfn_valid() stays that way. + * + * Nests above zone->lock and zone->size_seqlock. + */ + spinlock_t node_size_lock; +#endif unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; - struct pglist_data *pgdat_next; - wait_queue_head_t kswapd_wait; + wait_queue_head_t kswapd_wait; struct task_struct *kswapd; + int kswapd_max_order; } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) +#ifdef CONFIG_FLAT_NODE_MEM_MAP +#define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) +#else +#define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr)) +#endif +#define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) -extern int numnodes; -extern struct pglist_data *pgdat_list; +#include void __get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free, struct pglist_data *pgdat); void get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free); void build_all_zonelists(void); -void wakeup_kswapd(struct zone *zone); +void wakeup_kswapd(struct zone *zone, int order); +int zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags); +enum memmap_context { + MEMMAP_EARLY, + MEMMAP_HOTPLUG, +}; +extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, + unsigned long size, + enum memmap_context context); + +#ifdef CONFIG_HAVE_MEMORY_PRESENT +void memory_present(int nid, unsigned long start, unsigned long end); +#else +static inline void memory_present(int nid, unsigned long start, unsigned long end) {} +#endif + +#ifdef CONFIG_NEED_NODE_MEMMAP_SIZE +unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); +#endif /* * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. */ #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) -/** - * for_each_pgdat - helper macro to iterate over all nodes - * @pgdat - pointer to a pg_data_t variable - * - * Meant to help with common loops of the form - * pgdat = pgdat_list; - * while(pgdat) { - * ... - * pgdat = pgdat->pgdat_next; - * } - */ -#define for_each_pgdat(pgdat) \ - for (pgdat = pgdat_list; pgdat; pgdat = pgdat->pgdat_next) - -/* - * next_zone - helper magic for for_each_zone() - * Thanks to William Lee Irwin III for this piece of ingenuity. - */ -static inline struct zone *next_zone(struct zone *zone) +static inline int populated_zone(struct zone *zone) { - pg_data_t *pgdat = zone->zone_pgdat; - - if (zone - pgdat->node_zones < MAX_NR_ZONES - 1) - zone++; - else if (pgdat->pgdat_next) { - pgdat = pgdat->pgdat_next; - zone = pgdat->node_zones; - } else - zone = NULL; - - return zone; + return (!!zone->present_pages); } -/** - * for_each_zone - helper macro to iterate over all memory zones - * @zone - pointer to struct zone variable - * - * The user only needs to declare the zone variable, for_each_zone - * fills it in. This basically means for_each_zone() is an - * easier to read version of this piece of code: - * - * for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next) - * for (i = 0; i < MAX_NR_ZONES; ++i) { - * struct zone * z = pgdat->node_zones + i; - * ... - * } - * } - */ -#define for_each_zone(zone) \ - for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone)) - -static inline int is_highmem_idx(int idx) +static inline int is_highmem_idx(enum zone_type idx) { +#ifdef CONFIG_HIGHMEM return (idx == ZONE_HIGHMEM); +#else + return 0; +#endif } -static inline int is_normal_idx(int idx) +static inline int is_normal_idx(enum zone_type idx) { return (idx == ZONE_NORMAL); } + /** * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references @@ -352,12 +500,30 @@ static inline int is_normal_idx(int idx) */ static inline int is_highmem(struct zone *zone) { - return (is_highmem_idx(zone - zone->zone_pgdat->node_zones)); +#ifdef CONFIG_HIGHMEM + return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM; +#else + return 0; +#endif } static inline int is_normal(struct zone *zone) { - return (is_normal_idx(zone - zone->zone_pgdat->node_zones)); + return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL; +} + +static inline int is_dma32(struct zone *zone) +{ +#ifdef CONFIG_ZONE_DMA32 + return zone == zone->zone_pgdat->node_zones + ZONE_DMA32; +#else + return 0; +#endif +} + +static inline int is_dma(struct zone *zone) +{ + return zone == zone->zone_pgdat->node_zones + ZONE_DMA; } /* These two functions are used to setup the per zone pages min values */ @@ -365,80 +531,237 @@ struct ctl_table; struct file; int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); -int lower_zone_protection_sysctl_handler(struct ctl_table *, int, struct file *, +extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; +int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); +int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); +int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, + struct file *, void __user *, size_t *, loff_t *); +int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, + struct file *, void __user *, size_t *, loff_t *); #include /* Returns the number of the current Node. */ -#define numa_node_id() (cpu_to_node(smp_processor_id())) +#ifndef numa_node_id +#define numa_node_id() (cpu_to_node(raw_smp_processor_id())) +#endif -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_NEED_MULTIPLE_NODES extern struct pglist_data contig_page_data; #define NODE_DATA(nid) (&contig_page_data) #define NODE_MEM_MAP(nid) mem_map #define MAX_NODES_SHIFT 1 -#define pfn_to_nid(pfn) (0) -#else /* CONFIG_DISCONTIGMEM */ +#else /* CONFIG_NEED_MULTIPLE_NODES */ #include -#if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED) +#endif /* !CONFIG_NEED_MULTIPLE_NODES */ + +extern struct pglist_data *first_online_pgdat(void); +extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); +extern struct zone *next_zone(struct zone *zone); + +/** + * for_each_pgdat - helper macro to iterate over all nodes + * @pgdat - pointer to a pg_data_t variable + */ +#define for_each_online_pgdat(pgdat) \ + for (pgdat = first_online_pgdat(); \ + pgdat; \ + pgdat = next_online_pgdat(pgdat)) +/** + * for_each_zone - helper macro to iterate over all memory zones + * @zone - pointer to struct zone variable + * + * The user only needs to declare the zone variable, for_each_zone + * fills it in. + */ +#define for_each_zone(zone) \ + for (zone = (first_online_pgdat())->node_zones; \ + zone; \ + zone = next_zone(zone)) + +#ifdef CONFIG_SPARSEMEM +#include +#endif + +#if BITS_PER_LONG == 32 /* - * with 32 bit page->flags field, we reserve 8 bits for node/zone info. - * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes. + * with 32 bit page->flags field, we reserve 9 bits for node/zone info. + * there are 4 zones (3 bits) and this leaves 9-3=6 bits for nodes. */ -#define MAX_NODES_SHIFT 6 +#define FLAGS_RESERVED 9 + #elif BITS_PER_LONG == 64 /* * with 64 bit flags field, there's plenty of room. */ -#define MAX_NODES_SHIFT 10 +#define FLAGS_RESERVED 32 + +#else + +#error BITS_PER_LONG not defined + #endif -#endif /* !CONFIG_DISCONTIGMEM */ +#if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \ + !defined(CONFIG_ARCH_POPULATES_NODE_MAP) +#define early_pfn_to_nid(nid) (0UL) +#endif -#if NODES_SHIFT > MAX_NODES_SHIFT -#error NODES_SHIFT > MAX_NODES_SHIFT +#ifdef CONFIG_FLATMEM +#define pfn_to_nid(pfn) (0) #endif -/* There are currently 3 zones: DMA, Normal & Highmem, thus we need 2 bits */ -#define MAX_ZONES_SHIFT 2 +#define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT) +#define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT) + +#ifdef CONFIG_SPARSEMEM + +/* + * SECTION_SHIFT #bits space required to store a section # + * + * PA_SECTION_SHIFT physical address to/from section number + * PFN_SECTION_SHIFT pfn to/from section number + */ +#define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) + +#define PA_SECTION_SHIFT (SECTION_SIZE_BITS) +#define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) -#if ZONES_SHIFT > MAX_ZONES_SHIFT -#error ZONES_SHIFT > MAX_ZONES_SHIFT +#define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) + +#define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) +#define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) + +#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS +#error Allocator MAX_ORDER exceeds SECTION_SIZE #endif -extern DECLARE_BITMAP(node_online_map, MAX_NUMNODES); +struct page; +struct mem_section { + /* + * This is, logically, a pointer to an array of struct + * pages. However, it is stored with some other magic. + * (see sparse.c::sparse_init_one_section()) + * + * Additionally during early boot we encode node id of + * the location of the section here to guide allocation. + * (see sparse.c::memory_present()) + * + * Making it a UL at least makes someone do a cast + * before using it wrong. + */ + unsigned long section_mem_map; +}; -#if defined(CONFIG_DISCONTIGMEM) || defined(CONFIG_NUMA) +#ifdef CONFIG_SPARSEMEM_EXTREME +#define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) +#else +#define SECTIONS_PER_ROOT 1 +#endif -#define node_online(node) test_bit(node, node_online_map) -#define node_set_online(node) set_bit(node, node_online_map) -#define node_set_offline(node) clear_bit(node, node_online_map) -static inline unsigned int num_online_nodes(void) +#define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) +#define NR_SECTION_ROOTS (NR_MEM_SECTIONS / SECTIONS_PER_ROOT) +#define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) + +#ifdef CONFIG_SPARSEMEM_EXTREME +extern struct mem_section *mem_section[NR_SECTION_ROOTS]; +#else +extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; +#endif + +static inline struct mem_section *__nr_to_section(unsigned long nr) { - int i, num = 0; + if (!mem_section[SECTION_NR_TO_ROOT(nr)]) + return NULL; + return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; +} +extern int __section_nr(struct mem_section* ms); - for(i = 0; i < MAX_NUMNODES; i++){ - if (node_online(i)) - num++; - } - return num; +/* + * We use the lower bits of the mem_map pointer to store + * a little bit of information. There should be at least + * 3 bits here due to 32-bit alignment. + */ +#define SECTION_MARKED_PRESENT (1UL<<0) +#define SECTION_HAS_MEM_MAP (1UL<<1) +#define SECTION_MAP_LAST_BIT (1UL<<2) +#define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) +#define SECTION_NID_SHIFT 2 + +static inline struct page *__section_mem_map_addr(struct mem_section *section) +{ + unsigned long map = section->section_mem_map; + map &= SECTION_MAP_MASK; + return (struct page *)map; } -#else /* !CONFIG_DISCONTIGMEM && !CONFIG_NUMA */ +static inline int valid_section(struct mem_section *section) +{ + return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); +} + +static inline int section_has_mem_map(struct mem_section *section) +{ + return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); +} + +static inline int valid_section_nr(unsigned long nr) +{ + return valid_section(__nr_to_section(nr)); +} + +static inline struct mem_section *__pfn_to_section(unsigned long pfn) +{ + return __nr_to_section(pfn_to_section_nr(pfn)); +} + +static inline int pfn_valid(unsigned long pfn) +{ + if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) + return 0; + return valid_section(__nr_to_section(pfn_to_section_nr(pfn))); +} + +/* + * These are _only_ used during initialisation, therefore they + * can use __initdata ... They could have names to indicate + * this restriction. + */ +#ifdef CONFIG_NUMA +#define pfn_to_nid(pfn) \ +({ \ + unsigned long __pfn_to_nid_pfn = (pfn); \ + page_to_nid(pfn_to_page(__pfn_to_nid_pfn)); \ +}) +#else +#define pfn_to_nid(pfn) (0) +#endif + +#define early_pfn_valid(pfn) pfn_valid(pfn) +void sparse_init(void); +#else +#define sparse_init() do {} while (0) +#define sparse_index_init(_sec, _nid) do {} while (0) +#endif /* CONFIG_SPARSEMEM */ + +#ifdef CONFIG_NODES_SPAN_OTHER_NODES +#define early_pfn_in_nid(pfn, nid) (early_pfn_to_nid(pfn) == (nid)) +#else +#define early_pfn_in_nid(pfn, nid) (1) +#endif + +#ifndef early_pfn_valid +#define early_pfn_valid(pfn) (1) +#endif -#define node_online(node) \ - ({ BUG_ON((node) != 0); test_bit(node, node_online_map); }) -#define node_set_online(node) \ - ({ BUG_ON((node) != 0); set_bit(node, node_online_map); }) -#define node_set_offline(node) \ - ({ BUG_ON((node) != 0); clear_bit(node, node_online_map); }) -#define num_online_nodes() 1 +void memory_present(int nid, unsigned long start, unsigned long end); +unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); -#endif /* CONFIG_DISCONTIGMEM || CONFIG_NUMA */ #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _LINUX_MMZONE_H */