X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=arch%2Fi386%2Fmm%2Finit.c;h=e2e0a9a22c7f01648a2bcdae307fd1cfb786b5fa;hb=refs%2Fheads%2Fvserver;hp=19ba96e640d1e6e8141af825d1b7f0c0c91a874c;hpb=9213980e6a70d8473e0ffd4b39ab5b6caaba9ff5;p=linux-2.6.git diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 19ba96e64..e2e0a9a22 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -6,7 +6,6 @@ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 */ -#include #include #include #include @@ -23,16 +22,19 @@ #include #include #include +#include #include #include #include #include +#include +#include +#include #include #include #include #include -#include #include #include #include @@ -41,10 +43,12 @@ #include #include +unsigned int __VMALLOC_RESERVE = 128 << 20; + DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; -static int do_test_wp_bit(void); +static int noinline do_test_wp_bit(void); /* * Creates a middle page table and puts a pointer to it in the @@ -53,15 +57,18 @@ static int do_test_wp_bit(void); */ static pmd_t * __init one_md_table_init(pgd_t *pgd) { + pud_t *pud; pmd_t *pmd_table; #ifdef CONFIG_X86_PAE pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - if (pmd_table != pmd_offset(pgd, 0)) + pud = pud_offset(pgd, 0); + if (pmd_table != pmd_offset(pud, 0)) BUG(); #else - pmd_table = pmd_offset(pgd, 0); + pud = pud_offset(pgd, 0); + pmd_table = pmd_offset(pud, 0); #endif return pmd_table; @@ -99,6 +106,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) { pgd_t *pgd; + pud_t *pud; pmd_t *pmd; int pgd_idx, pmd_idx; unsigned long vaddr; @@ -111,8 +119,8 @@ static void __init page_table_range_init (unsigned long start, unsigned long end for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { if (pgd_none(*pgd)) one_md_table_init(pgd); - - pmd = pmd_offset(pgd, vaddr); + pud = pud_offset(pgd, vaddr); + pmd = pmd_offset(pud, vaddr); for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { if (pmd_none(*pmd)) one_page_table_init(pmd); @@ -123,6 +131,13 @@ static void __init page_table_range_init (unsigned long start, unsigned long end } } +static inline int is_kernel_text(unsigned long addr) +{ + if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) + return 1; + return 0; +} + /* * This maps the physical memory to kernel virtual address space, a total * of max_low_pfn pages, by creating page tables starting from address @@ -145,18 +160,29 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) if (pfn >= max_low_pfn) continue; for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { + unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET; + /* Map with big pages if possible, otherwise create normal page tables. */ if (cpu_has_pse) { - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); + unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; + + if (is_kernel_text(address) || is_kernel_text(address2)) + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); + else + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); pfn += PTRS_PER_PTE; } else { pte = one_page_table_init(pmd); - for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); + for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) { + if (is_kernel_text(address)) + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); + else + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); + } } } - } + } } static inline int page_kills_ppro(unsigned long pagenr) @@ -166,18 +192,17 @@ static inline int page_kills_ppro(unsigned long pagenr) return 0; } -extern int is_available_memory(efi_memory_desc_t *); - -static inline int page_is_ram(unsigned long pagenr) +int page_is_ram(unsigned long pagenr) { int i; unsigned long addr, end; if (efi_enabled) { efi_memory_desc_t *md; + void *p; - for (i = 0; i < memmap.nr_map; i++) { - md = &memmap.map[i]; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; if (!is_available_memory(md)) continue; addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; @@ -206,17 +231,35 @@ static inline int page_is_ram(unsigned long pagenr) return 0; } +/* + * devmem_is_allowed() checks to see if /dev/mem access to a certain address is + * valid. The argument is a physical page number. + * + * + * On x86, access has to be given to the first megabyte of ram because that area + * contains bios code and data regions used by X and dosemu and similar apps. + * Access has to be given to non-kernel-ram areas as well, these contain the PCI + * mmio resources as well as potential bios/acpi data regions. + */ +int devmem_is_allowed(unsigned long pagenr) +{ + if (pagenr <= 256) + return 1; + if (!page_is_ram(pagenr)) + return 1; + return 0; +} + +EXPORT_SYMBOL_GPL(page_is_ram); + #ifdef CONFIG_HIGHMEM pte_t *kmap_pte; pgprot_t kmap_prot; -EXPORT_SYMBOL(kmap_prot); -EXPORT_SYMBOL(kmap_pte); - #define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)) + pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr)) -void __init kmap_init(void) +static void __init kmap_init(void) { unsigned long kmap_vstart; @@ -227,9 +270,10 @@ void __init kmap_init(void) kmap_prot = PAGE_KERNEL; } -void __init permanent_kmaps_init(pgd_t *pgd_base) +static void __init permanent_kmaps_init(pgd_t *pgd_base) { pgd_t *pgd; + pud_t *pud; pmd_t *pmd; pte_t *pte; unsigned long vaddr; @@ -238,34 +282,63 @@ void __init permanent_kmaps_init(pgd_t *pgd_base) page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); pgd = swapper_pg_dir + pgd_index(vaddr); - pmd = pmd_offset(pgd, vaddr); + pud = pud_offset(pgd, vaddr); + pmd = pmd_offset(pud, vaddr); pte = pte_offset_kernel(pmd, vaddr); pkmap_page_table = pte; } -void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) +static void __meminit free_new_highpage(struct page *page) +{ + init_page_count(page); + __free_page(page); + totalhigh_pages++; +} + +void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) { if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { ClearPageReserved(page); - set_bit(PG_highmem, &page->flags); - set_page_count(page, 1); - __free_page(page); - totalhigh_pages++; + free_new_highpage(page); } else SetPageReserved(page); } -#ifndef CONFIG_DISCONTIGMEM -void __init set_highmem_pages_init(int bad_ppro) +static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn) +{ + free_new_highpage(page); + totalram_pages++; +#ifdef CONFIG_FLATMEM + max_mapnr = max(pfn, max_mapnr); +#endif + num_physpages++; + return 0; +} + +/* + * Not currently handling the NUMA case. + * Assuming single node and all memory that + * has been added dynamically that would be + * onlined here is in HIGHMEM + */ +void __meminit online_page(struct page *page) +{ + ClearPageReserved(page); + add_one_highpage_hotplug(page, page_to_pfn(page)); +} + + +#ifdef CONFIG_NUMA +extern void set_highmem_pages_init(int); +#else +static void __init set_highmem_pages_init(int bad_ppro) { int pfn; for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) - one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); + add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); totalram_pages += totalhigh_pages; } -#else -extern void set_highmem_pages_init(int); -#endif /* !CONFIG_DISCONTIGMEM */ +#endif /* CONFIG_FLATMEM */ #else #define kmap_init() do { } while (0) @@ -273,12 +346,14 @@ extern void set_highmem_pages_init(int); #define set_highmem_pages_init(bad_ppro) do { } while (0) #endif /* CONFIG_HIGHMEM */ -unsigned long __PAGE_KERNEL = _PAGE_KERNEL; +unsigned long long __PAGE_KERNEL = _PAGE_KERNEL; +EXPORT_SYMBOL(__PAGE_KERNEL); +unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; -#ifndef CONFIG_DISCONTIGMEM -#define remap_numa_kva() do {} while (0) -#else +#ifdef CONFIG_NUMA extern void __init remap_numa_kva(void); +#else +#define remap_numa_kva() do {} while (0) #endif static void __init pagetable_init (void) @@ -302,6 +377,7 @@ static void __init pagetable_init (void) if (cpu_has_pge) { set_in_cr4(X86_CR4_PGE); __PAGE_KERNEL |= _PAGE_GLOBAL; + __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL; } kernel_physical_mapping_init(pgd_base); @@ -324,11 +400,11 @@ static void __init pagetable_init (void) * All user-space mappings are explicitly cleared after * SMP startup. */ - pgd_base[0] = pgd_base[USER_PTRS_PER_PGD]; + set_pgd(&pgd_base[0], pgd_base[USER_PTRS_PER_PGD]); #endif } -#if defined(CONFIG_PM_DISK) || defined(CONFIG_SOFTWARE_SUSPEND) +#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP) /* * Swap suspend & friends need this for resume because things like the intel-agp * driver might have split up a kernel 4MB mapping. @@ -367,30 +443,83 @@ void zap_low_mappings (void) flush_tlb_all(); } -#ifndef CONFIG_DISCONTIGMEM -void __init zone_sizes_init(void) +static int disable_nx __initdata = 0; +u64 __supported_pte_mask __read_mostly = ~_PAGE_NX; + +/* + * noexec = on|off + * + * Control non executable mappings. + * + * on Enable + * off Disable (disables exec-shield too) + */ +static int __init noexec_setup(char *str) { - unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; - unsigned int max_dma, high, low; - - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - low = max_low_pfn; - high = highend_pfn; - - if (low < max_dma) - zones_size[ZONE_DMA] = low; - else { - zones_size[ZONE_DMA] = max_dma; - zones_size[ZONE_NORMAL] = low - max_dma; -#ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = high - low; -#endif + if (!str || !strcmp(str, "on")) { + if (cpu_has_nx) { + __supported_pte_mask |= _PAGE_NX; + disable_nx = 0; + } + } else if (!strcmp(str,"off")) { + disable_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + exec_shield = 0; + } else + return -EINVAL; + + return 0; +} +early_param("noexec", noexec_setup); + +int nx_enabled = 0; +#ifdef CONFIG_X86_PAE + +static void __init set_nx(void) +{ + unsigned int v[4], l, h; + + if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { + cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); + if ((v[3] & (1 << 20)) && !disable_nx) { + rdmsr(MSR_EFER, l, h); + l |= EFER_NX; + wrmsr(MSR_EFER, l, h); + nx_enabled = 1; + __supported_pte_mask |= _PAGE_NX; + } } - free_area_init(zones_size); } -#else -extern void zone_sizes_init(void); -#endif /* !CONFIG_DISCONTIGMEM */ + +/* + * Enables/disables executability of a given kernel page and + * returns the previous setting. + */ +int __init set_kernel_exec(unsigned long vaddr, int enable) +{ + pte_t *pte; + int ret = 1; + + if (!nx_enabled) + goto out; + + pte = lookup_address(vaddr); + BUG_ON(!pte); + + if (!pte_exec_kernel(*pte)) + ret = 0; + + if (enable) + pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32)); + else + pte->pte_high |= 1 << (_PAGE_BIT_NX - 32); + pte_update_defer(&init_mm, vaddr, pte); + __flush_tlb_all(); +out: + return ret; +} + +#endif /* * paging_init() sets up the page tables - note that the first 8MB are @@ -401,6 +530,15 @@ extern void zone_sizes_init(void); */ void __init paging_init(void) { +#ifdef CONFIG_X86_PAE + set_nx(); + if (nx_enabled) + printk("NX (Execute Disable) protection: active\n"); + else +#endif + if (exec_shield) + printk("Using x86 segment limits to approximate NX protection\n"); + pagetable_init(); load_cr3(swapper_pg_dir); @@ -416,7 +554,6 @@ void __init paging_init(void) __flush_tlb_all(); kmap_init(); - zone_sizes_init(); } /* @@ -426,7 +563,7 @@ void __init paging_init(void) * but fortunately the switch to using exceptions got rid of all that. */ -void __init test_wp_bit(void) +static void __init test_wp_bit(void) { printk("Checking if this processor honours the WP bit even in supervisor mode... "); @@ -445,22 +582,6 @@ void __init test_wp_bit(void) } } -#ifndef CONFIG_DISCONTIGMEM -static void __init set_max_mapnr_init(void) -{ -#ifdef CONFIG_HIGHMEM - highmem_start_page = pfn_to_page(highstart_pfn); - max_mapnr = num_physpages = highend_pfn; -#else - max_mapnr = num_physpages = max_low_pfn; -#endif -} -#define __free_all_bootmem() free_all_bootmem() -#else -#define __free_all_bootmem() free_all_bootmem_node(NODE_DATA(0)) -extern void set_max_mapnr_init(void); -#endif /* !CONFIG_DISCONTIGMEM */ - static struct kcore_list kcore_mem, kcore_vmalloc; void __init mem_init(void) @@ -470,9 +591,8 @@ void __init mem_init(void) int tmp; int bad_ppro; -#ifndef CONFIG_DISCONTIGMEM - if (!mem_map) - BUG(); +#ifdef CONFIG_FLATMEM + BUG_ON(!mem_map); #endif bad_ppro = ppro_with_ram_bug(); @@ -487,16 +607,8 @@ void __init mem_init(void) } #endif - set_max_mapnr_init(); - -#ifdef CONFIG_HIGHMEM - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE); -#else - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); -#endif - /* this will put all low memory onto the freelists */ - totalram_pages += __free_all_bootmem(); + totalram_pages += free_all_bootmem(); reservedpages = 0; for (tmp = 0; tmp < max_low_pfn; tmp++) @@ -526,6 +638,48 @@ void __init mem_init(void) (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) ); +#if 1 /* double-sanity-check paranoia */ + printk("virtual kernel memory layout:\n" + " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" +#ifdef CONFIG_HIGHMEM + " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" +#endif + " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" + " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" + " .init : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .data : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", + FIXADDR_START, FIXADDR_TOP, + (FIXADDR_TOP - FIXADDR_START) >> 10, + +#ifdef CONFIG_HIGHMEM + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, + (LAST_PKMAP*PAGE_SIZE) >> 10, +#endif + + VMALLOC_START, VMALLOC_END, + (VMALLOC_END - VMALLOC_START) >> 20, + + (unsigned long)__va(0), (unsigned long)high_memory, + ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, + + (unsigned long)&__init_begin, (unsigned long)&__init_end, + ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10, + + (unsigned long)&_etext, (unsigned long)&_edata, + ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, + + (unsigned long)&_text, (unsigned long)&_etext, + ((unsigned long)&_etext - (unsigned long)&_text) >> 10); + +#ifdef CONFIG_HIGHMEM + BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START); + BUG_ON(VMALLOC_END > PKMAP_BASE); +#endif + BUG_ON(VMALLOC_START > VMALLOC_END); + BUG_ON((unsigned long)high_memory > VMALLOC_START); +#endif /* double-sanity-check paranoia */ + #ifdef CONFIG_X86_PAE if (!cpu_has_pae) panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!"); @@ -544,8 +698,26 @@ void __init mem_init(void) #endif } -kmem_cache_t *pgd_cache; -kmem_cache_t *pmd_cache; +#ifdef CONFIG_MEMORY_HOTPLUG +int arch_add_memory(int nid, u64 start, u64 size) +{ + struct pglist_data *pgdata = NODE_DATA(nid); + struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + return __add_pages(zone, start_pfn, nr_pages); +} + +int remove_memory(u64 start, u64 size) +{ + return -EINVAL; +} +EXPORT_SYMBOL_GPL(remove_memory); +#endif + +struct kmem_cache *pgd_cache; +struct kmem_cache *pmd_cache; void __init pgtable_cache_init(void) { @@ -573,7 +745,7 @@ void __init pgtable_cache_init(void) * This function cannot be __init, since exceptions don't work in that * section. Put this after the callers, so that it cannot be inlined. */ -static int do_test_wp_bit(void) +static int noinline do_test_wp_bit(void) { char tmp_reg; int flag; @@ -596,30 +768,53 @@ static int do_test_wp_bit(void) return flag; } -void free_initmem(void) +#ifdef CONFIG_DEBUG_RODATA + +void mark_rodata_ro(void) +{ + unsigned long addr = (unsigned long)__start_rodata; + + for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE) + change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO); + + printk("Write protecting the kernel read-only data: %uk\n", + (__end_rodata - __start_rodata) >> 10); + + /* + * change_page_attr() requires a global_flush_tlb() call after it. + * We do this after the printk so that if something went wrong in the + * change, the printk gets out at least to give a better debug hint + * of who is the culprit. + */ + global_flush_tlb(); +} +#endif + +void free_init_pages(char *what, unsigned long begin, unsigned long end) { unsigned long addr; - addr = (unsigned long)(&__init_begin); - for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { + for (addr = begin; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); + memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); free_page(addr); totalram_pages++; } - printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", (__init_end - __init_begin) >> 10); + printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); +} + +void free_initmem(void) +{ + free_init_pages("unused kernel memory", + (unsigned long)(&__init_begin), + (unsigned long)(&__init_end)); } #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - if (start < end) - printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); - for (; start < end; start += PAGE_SIZE) { - ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); - free_page(start); - totalram_pages++; - } + free_init_pages("initrd memory", start, end); } #endif +