This commit was manufactured by cvs2svn to create tag
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 040862e..2fc263c 100644
@@ -32,7 +32,6 @@
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
-#include <asm/pgalloc.h>
 #include <asm/dma.h>
 #include <asm/fixmap.h>
 #include <asm/e820.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
+#include <asm/desc.h>
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 unsigned long highstart_pfn, highend_pfn;
 
 static int do_test_wp_bit(void);
 
-/*
- * Creates a middle page table and puts a pointer to it in the
- * given global directory entry. This only returns the gd entry
- * in non-PAE compilation mode, since the middle layer is folded.
- */
-static pmd_t * __init one_md_table_init(pgd_t *pgd)
-{
-       pmd_t *pmd_table;
-               
-#ifdef CONFIG_X86_PAE
-       pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
-       set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
-       if (pmd_table != pmd_offset(pgd, 0)) 
-               BUG();
-#else
-       pmd_table = pmd_offset(pgd, 0);
-#endif
-
-       return pmd_table;
-}
-
-/*
- * Create a page table and place a pointer to it in a middle page
- * directory entry.
- */
-static pte_t * __init one_page_table_init(pmd_t *pmd)
-{
-       if (pmd_none(*pmd)) {
-               pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
-               set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
-               if (page_table != pte_offset_kernel(pmd, 0))
-                       BUG();  
-
-               return page_table;
-       }
-       
-       return pte_offset_kernel(pmd, 0);
-}
-
-/*
- * This function initializes a certain range of kernel virtual memory 
- * with new bootmem page tables, everywhere page tables are missing in
- * the given range.
- */
-
-/*
- * NOTE: The pagetables are allocated contiguous on the physical space 
- * so we can cache the place of the first one and move around without 
- * checking the pgd every time.
- */
-static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
-{
-       pgd_t *pgd;
-       pmd_t *pmd;
-       int pgd_idx, pmd_idx;
-       unsigned long vaddr;
-
-       vaddr = start;
-       pgd_idx = pgd_index(vaddr);
-       pmd_idx = pmd_index(vaddr);
-       pgd = pgd_base + pgd_idx;
-
-       for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
-               if (pgd_none(*pgd)) 
-                       one_md_table_init(pgd);
-
-               pmd = pmd_offset(pgd, vaddr);
-               for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
-                       if (pmd_none(*pmd)) 
-                               one_page_table_init(pmd);
-
-                       vaddr += PMD_SIZE;
-               }
-               pmd_idx = 0;
-       }
-}
-
-/*
- * This maps the physical memory to kernel virtual address space, a total 
- * of max_low_pfn pages, by creating page tables starting from address 
- * PAGE_OFFSET.
- */
-static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
-{
-       unsigned long pfn;
-       pgd_t *pgd;
-       pmd_t *pmd;
-       pte_t *pte;
-       int pgd_idx, pmd_idx, pte_ofs;
-
-       pgd_idx = pgd_index(PAGE_OFFSET);
-       pgd = pgd_base + pgd_idx;
-       pfn = 0;
-
-       for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
-               pmd = one_md_table_init(pgd);
-               if (pfn >= max_low_pfn)
-                       continue;
-               for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
-                       /* Map with big pages if possible, otherwise create normal page tables. */
-                       if (cpu_has_pse) {
-                               set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
-                               pfn += PTRS_PER_PTE;
-                       } else {
-                               pte = one_page_table_init(pmd);
-
-                               for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++)
-                                       set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
-                       }
-               }
-       }       
-}
-
 static inline int page_kills_ppro(unsigned long pagenr)
 {
        if (pagenr >= 0x70000 && pagenr <= 0x7003F)
@@ -168,7 +55,7 @@ static inline int page_kills_ppro(unsigned long pagenr)
 
 extern int is_available_memory(efi_memory_desc_t *);
 
-static inline int page_is_ram(unsigned long pagenr)
+int page_is_ram(unsigned long pagenr)
 {
        int i;
        unsigned long addr, end;
@@ -206,11 +93,8 @@ static inline int page_is_ram(unsigned long pagenr)
        return 0;
 }
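
For reference: page_is_ram() loses its `static` here and is exported GPL further down in this diff, so modules can call it. A minimal, hypothetical module-side use (sketch only; 2.6-era module API assumed, the module name and probed pfn are made up):

    #include <linux/module.h>
    #include <linux/init.h>
    #include <linux/kernel.h>

    extern int page_is_ram(unsigned long pagenr);   /* exported below */

    static int __init ram_probe_init(void)
    {
            unsigned long pfn = 0x100;      /* arbitrary frame to probe */

            printk(KERN_INFO "pfn %#lx is %sRAM\n",
                   pfn, page_is_ram(pfn) ? "" : "not ");
            return 0;
    }

    static void __exit ram_probe_exit(void) { }

    module_init(ram_probe_init);
    module_exit(ram_probe_exit);
    MODULE_LICENSE("GPL");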
 
-#ifdef CONFIG_HIGHMEM
 pte_t *kmap_pte;
-pgprot_t kmap_prot;
 
-EXPORT_SYMBOL(kmap_prot);
 EXPORT_SYMBOL(kmap_pte);
 
 #define kmap_get_fixmap_pte(vaddr)                                     \
@@ -218,29 +102,7 @@ EXPORT_SYMBOL(kmap_pte);
 
 void __init kmap_init(void)
 {
-       unsigned long kmap_vstart;
-
-       /* cache the first kmap pte */
-       kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
-       kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
-
-       kmap_prot = PAGE_KERNEL;
-}
-
-void __init permanent_kmaps_init(pgd_t *pgd_base)
-{
-       pgd_t *pgd;
-       pmd_t *pmd;
-       pte_t *pte;
-       unsigned long vaddr;
-
-       vaddr = PKMAP_BASE;
-       page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
-
-       pgd = swapper_pg_dir + pgd_index(vaddr);
-       pmd = pmd_offset(pgd, vaddr);
-       pte = pte_offset_kernel(pmd, vaddr);
-       pkmap_page_table = pte; 
+       kmap_pte = kmap_get_fixmap_pte(__fix_to_virt(FIX_KMAP_BEGIN));
 }
 
 void __init one_highpage_init(struct page *page, int pfn, int bad_ppro)
@@ -255,6 +117,10 @@ void __init one_highpage_init(struct page *page, int pfn, int bad_ppro)
                SetPageReserved(page);
 }
 
+EXPORT_SYMBOL_GPL(page_is_ram);
+
+#ifdef CONFIG_HIGHMEM
+
 #ifndef CONFIG_DISCONTIGMEM
 void __init set_highmem_pages_init(int bad_ppro) 
 {
@@ -266,14 +132,12 @@ void __init set_highmem_pages_init(int bad_ppro)
 #else
 extern void set_highmem_pages_init(int);
 #endif /* !CONFIG_DISCONTIGMEM */
-
 #else
-#define kmap_init() do { } while (0)
-#define permanent_kmaps_init(pgd_base) do { } while (0)
-#define set_highmem_pages_init(bad_ppro) do { } while (0)
-#endif /* CONFIG_HIGHMEM */
+# define set_highmem_pages_init(bad_ppro) do { } while (0)
+#endif
 
-unsigned long __PAGE_KERNEL = _PAGE_KERNEL;
+unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
+unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
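
The widening from unsigned long to unsigned long long matters because _PAGE_NX is bit 63 of a PAE pte, which a 32-bit long cannot hold. A standalone user-space sketch of the truncation (assuming _PAGE_BIT_NX == 63):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long nx = 1ULL << 63;     /* _PAGE_BIT_NX == 63 */
            unsigned long truncated = (unsigned long)nx;

            /* on an ILP32 target the 32-bit copy silently becomes 0 */
            printf("64-bit: %#llx, as 32-bit long: %#lx\n", nx, truncated);
            return 0;
    }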
 
 #ifndef CONFIG_DISCONTIGMEM
 #define remap_numa_kva() do {} while (0)
@@ -281,30 +145,125 @@ unsigned long __PAGE_KERNEL = _PAGE_KERNEL;
 extern void __init remap_numa_kva(void);
 #endif
 
-static void __init pagetable_init (void)
+static __init void prepare_pagetables(pgd_t *pgd_base, unsigned long address)
+{
+       pgd_t *pgd;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       pgd = pgd_base + pgd_index(address);
+       pmd = pmd_offset(pgd, address);
+       if (!pmd_present(*pmd)) {
+               pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+               set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)));
+       }
+}
+
+static void __init fixrange_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
+{
+       unsigned long vaddr;
+
+       for (vaddr = start; vaddr != end; vaddr += PAGE_SIZE)
+               prepare_pagetables(pgd_base, vaddr);
+}
+
+void setup_identity_mappings(pgd_t *pgd_base, unsigned long start, unsigned long end)
 {
        unsigned long vaddr;
-       pgd_t *pgd_base = swapper_pg_dir;
+       pgd_t *pgd;
+       int i, j, k;
+       pmd_t *pmd;
+       pte_t *pte, *pte_base;
+
+       pgd = pgd_base;
+
+       for (i = 0; i < PTRS_PER_PGD; pgd++, i++) {
+               vaddr = i*PGDIR_SIZE;
+               if (end && (vaddr >= end))
+                       break;
+               pmd = pmd_offset(pgd, 0);
+               for (j = 0; j < PTRS_PER_PMD; pmd++, j++) {
+                       vaddr = i*PGDIR_SIZE + j*PMD_SIZE;
+                       if (end && (vaddr >= end))
+                               break;
+                       if (vaddr < start)
+                               continue;
+                       if (cpu_has_pse) {
+                               unsigned long __pe;
+
+                               set_in_cr4(X86_CR4_PSE);
+                               boot_cpu_data.wp_works_ok = 1;
+                               __pe = _KERNPG_TABLE + _PAGE_PSE + vaddr - start;
+                               /* Make it "global" too if supported */
+                               if (cpu_has_pge) {
+                                       set_in_cr4(X86_CR4_PGE);
+#if !defined(CONFIG_X86_SWITCH_PAGETABLES)
+                                       __pe += _PAGE_GLOBAL;
+                                       __PAGE_KERNEL |= _PAGE_GLOBAL;
+#endif
+                               }
+                               set_pmd(pmd, __pmd(__pe));
+                               continue;
+                       }
+                       if (!pmd_present(*pmd))
+                               pte_base = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+                       else
+                               pte_base = pte_offset_kernel(pmd, 0);
+                       pte = pte_base;
+                       for (k = 0; k < PTRS_PER_PTE; pte++, k++) {
+                               vaddr = i*PGDIR_SIZE + j*PMD_SIZE + k*PAGE_SIZE;
+                               if (end && (vaddr >= end))
+                                       break;
+                               if (vaddr < start)
+                                       continue;
+                               *pte = mk_pte_phys(vaddr-start, PAGE_KERNEL);
+                       }
+                       set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base)));
+               }
+       }
+}
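
For illustration, the vaddr = i*PGDIR_SIZE + j*PMD_SIZE + k*PAGE_SIZE arithmetic in the loops above can be reproduced in user space. A sketch with PAE-mode constants assumed (4 pgd entries, 512 pmds, 512 ptes per table; the non-PAE case folds the pmd level away):

    #include <stdio.h>

    #define PAGE_SHIFT      12
    #define PMD_SHIFT       21              /* PAE: 512 ptes per page table */
    #define PGDIR_SHIFT     30              /* PAE: 4 pgd entries */
    #define PAGE_SIZE       (1UL << PAGE_SHIFT)
    #define PMD_SIZE        (1UL << PMD_SHIFT)
    #define PGDIR_SIZE      (1UL << PGDIR_SHIFT)

    int main(void)
    {
            unsigned long vaddr = 0xc0801234UL;     /* example kernel address */
            unsigned long i = vaddr >> PGDIR_SHIFT;
            unsigned long j = (vaddr >> PMD_SHIFT) & 511;
            unsigned long k = (vaddr >> PAGE_SHIFT) & 511;

            /* recomposing recovers the page-aligned address: 0xc0801000 */
            printf("pgd %lu, pmd %lu, pte %lu -> %#lx\n",
                   i, j, k, i*PGDIR_SIZE + j*PMD_SIZE + k*PAGE_SIZE);
            return 0;
    }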
 
+static void __init pagetable_init (void)
+{
+       unsigned long vaddr, end;
+       pgd_t *pgd_base;
 #ifdef CONFIG_X86_PAE
        int i;
-       /* Init entries of the first-level page table to the zero page */
-       for (i = 0; i < PTRS_PER_PGD; i++)
-               set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
 #endif
 
-       /* Enable PSE if available */
-       if (cpu_has_pse) {
-               set_in_cr4(X86_CR4_PSE);
-       }
+       /*
+        * This can be zero as well - no problem, in that case we exit
+        * the loops anyway due to the PTRS_PER_* conditions.
+        */
+       end = (unsigned long)__va(max_low_pfn*PAGE_SIZE);
 
-       /* Enable PGE if available */
-       if (cpu_has_pge) {
-               set_in_cr4(X86_CR4_PGE);
-               __PAGE_KERNEL |= _PAGE_GLOBAL;
+       pgd_base = swapper_pg_dir;
+#ifdef CONFIG_X86_PAE
+       /*
+        * It causes too many problems if there's no proper pmd set up
+        * for all 4 entries of the PGD - so we allocate all of them.
+        * PAE systems will not miss this extra 4-8K anyway ...
+        */
+       for (i = 0; i < PTRS_PER_PGD; i++) {
+               pmd_t *pmd = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+               set_pgd(pgd_base + i, __pgd(__pa(pmd) + 0x1));
        }
+#endif
+       /*
+        * Set up lowmem-sized identity mappings at PAGE_OFFSET:
+        */
+       setup_identity_mappings(pgd_base, PAGE_OFFSET, end);
 
-       kernel_physical_mapping_init(pgd_base);
+       /*
+        * Add flat-mode identity mappings - SMP needs them when
+        * starting up an AP from real mode. (In the non-PAE
+        * case we already have these mappings through head.S.)
+        * All user-space mappings are explicitly cleared after
+        * SMP startup.
+        */
+#if defined(CONFIG_SMP) && defined(CONFIG_X86_PAE)
+       setup_identity_mappings(pgd_base, 0, 16*1024*1024);
+#endif
        remap_numa_kva();
 
        /*
@@ -312,38 +271,64 @@ static void __init pagetable_init (void)
         * created - mappings will be set by set_fixmap():
         */
        vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
-       page_table_range_init(vaddr, 0, pgd_base);
+       fixrange_init(vaddr, 0, pgd_base);
 
-       permanent_kmaps_init(pgd_base);
+#ifdef CONFIG_HIGHMEM
+       {
+               pgd_t *pgd;
+               pmd_t *pmd;
+               pte_t *pte;
 
-#ifdef CONFIG_X86_PAE
-       /*
-        * Add low memory identity-mappings - SMP needs it when
-        * starting up on an AP from real-mode. In the non-PAE
-        * case we already have these mappings through head.S.
-        * All user-space mappings are explicitly cleared after
-        * SMP startup.
-        */
-       pgd_base[0] = pgd_base[USER_PTRS_PER_PGD];
+               /*
+                * Permanent kmaps:
+                */
+               vaddr = PKMAP_BASE;
+               fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
+
+               pgd = swapper_pg_dir + pgd_index(vaddr);
+               pmd = pmd_offset(pgd, vaddr);
+               pte = pte_offset_kernel(pmd, vaddr);
+               pkmap_page_table = pte;
+       }
 #endif
 }
 
-void zap_low_mappings (void)
+/*
+ * Clear kernel pagetables in a PMD_SIZE-aligned range.
+ */
+static void clear_mappings(pgd_t *pgd_base, unsigned long start, unsigned long end)
 {
-       int i;
+       unsigned long vaddr;
+       pgd_t *pgd;
+       pmd_t *pmd;
+       int i, j;
+
+       pgd = pgd_base;
+
+       for (i = 0; i < PTRS_PER_PGD; pgd++, i++) {
+               vaddr = i*PGDIR_SIZE;
+               if (end && (vaddr >= end))
+                       break;
+               pmd = pmd_offset(pgd, 0);
+               for (j = 0; j < PTRS_PER_PMD; pmd++, j++) {
+                       vaddr = i*PGDIR_SIZE + j*PMD_SIZE;
+                       if (end && (vaddr >= end))
+                               break;
+                       if (vaddr < start)
+                               continue;
+                       pmd_clear(pmd);
+               }
+       }
+       flush_tlb_all();
+}
+
+void zap_low_mappings(void)
+{
+       printk("zapping low mappings.\n");
        /*
         * Zap initial low-memory mappings.
-        *
-        * Note that "pgd_clear()" doesn't do it for
-        * us, because pgd_clear() is a no-op on i386.
         */
-       for (i = 0; i < USER_PTRS_PER_PGD; i++)
-#ifdef CONFIG_X86_PAE
-               set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
-#else
-               set_pgd(swapper_pg_dir+i, __pgd(0));
-#endif
-       flush_tlb_all();
+       clear_mappings(swapper_pg_dir, 0, 16*1024*1024);
 }
 
 #ifndef CONFIG_DISCONTIGMEM
@@ -371,6 +356,79 @@ void __init zone_sizes_init(void)
 extern void zone_sizes_init(void);
 #endif /* !CONFIG_DISCONTIGMEM */
 
+static int disable_nx __initdata = 0;
+u64 __supported_pte_mask = ~_PAGE_NX;
+
+/*
+ * noexec = on|off
+ *
+ * Control non-executable mappings.
+ *
+ * on      Enable
+ * off     Disable (disables exec-shield too)
+ */
+static int __init noexec_setup(char *str)
+{
+       if (!strncmp(str, "on",2) && cpu_has_nx) {
+               __supported_pte_mask |= _PAGE_NX;
+               disable_nx = 0;
+       } else if (!strncmp(str,"off",3)) {
+               disable_nx = 1;
+               __supported_pte_mask &= ~_PAGE_NX;
+               exec_shield = 0;
+       }
+       return 1;
+}
+
+__setup("noexec=", noexec_setup);
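
The option parsing is a plain prefix match. A standalone sketch of the same logic (user-space C; note that strncmp(str, "on", 2) also accepts any string that merely starts with "on"):

    #include <stdio.h>
    #include <string.h>

    static int nx_wanted(const char *str)
    {
            if (!strncmp(str, "on", 2))
                    return 1;       /* enable NX if the CPU supports it */
            if (!strncmp(str, "off", 3))
                    return 0;       /* disable NX (and exec-shield) */
            return -1;              /* anything else is ignored */
    }

    int main(void)
    {
            printf("on:%d off:%d onward:%d bogus:%d\n",
                   nx_wanted("on"), nx_wanted("off"),
                   nx_wanted("onward"), nx_wanted("bogus"));
            return 0;
    }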
+
+#ifdef CONFIG_X86_PAE
+int nx_enabled = 0;
+
+static void __init set_nx(void)
+{
+       unsigned int v[4], l, h;
+
+       if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
+               cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
+               if ((v[3] & (1 << 20)) && !disable_nx) {
+                       rdmsr(MSR_EFER, l, h);
+                       l |= EFER_NX;
+                       wrmsr(MSR_EFER, l, h);
+                       nx_enabled = 1;
+                       __supported_pte_mask |= _PAGE_NX;
+               }
+       }
+}
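
A user-space analogue of set_nx()'s feature check, using the compiler-provided <cpuid.h> (detection only; actually setting EFER.NX, as the code above does via rdmsr/wrmsr, requires ring 0):

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            if (__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx) &&
                (edx & (1u << 20)))         /* EDX bit 20: NX */
                    puts("NX supported");
            else
                    puts("NX not reported");
            return 0;
    }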
+/*
+ * Enables/disables executability of a given kernel page and
+ * returns the previous setting.
+ */
+int __init set_kernel_exec(unsigned long vaddr, int enable)
+{
+       pte_t *pte;
+       int ret = 1;
+
+       if (!nx_enabled)
+               goto out;
+
+       pte = lookup_address(vaddr);
+       BUG_ON(!pte);
+
+       if (!pte_exec_kernel(*pte))
+               ret = 0;
+
+       if (enable)
+               pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
+       else
+               pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
+       __flush_tlb_all();
+out:
+       return ret;
+}
+
+#endif
+
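
set_kernel_exec() toggles bit _PAGE_BIT_NX - 32 = 31 of pte_high, i.e. bit 63 of the full 64-bit PAE pte. A toy model of that manipulation (field names mimic the kernel's pte layout; the flag values are made up):

    #include <stdio.h>
    #include <stdint.h>

    struct toy_pte { uint32_t pte_low, pte_high; }; /* PAE pte, split */

    #define TOY_PAGE_BIT_NX 63

    static void toy_set_exec(struct toy_pte *pte, int enable)
    {
            if (enable)
                    pte->pte_high &= ~(1u << (TOY_PAGE_BIT_NX - 32));
            else
                    pte->pte_high |= 1u << (TOY_PAGE_BIT_NX - 32);
    }

    int main(void)
    {
            struct toy_pte pte = { 0x1063, 0 };     /* flags made up */

            toy_set_exec(&pte, 0);  /* non-exec: pte_high -> 0x80000000 */
            printf("%#x\n", pte.pte_high);
            toy_set_exec(&pte, 1);  /* exec again: pte_high -> 0 */
            printf("%#x\n", pte.pte_high);
            return 0;
    }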
 /*
  * paging_init() sets up the page tables - note that the first 8MB are
  * already mapped by head.S.
@@ -380,6 +438,12 @@ extern void zone_sizes_init(void);
  */
 void __init paging_init(void)
 {
+#ifdef CONFIG_X86_PAE
+       set_nx();
+       if (nx_enabled)
+               printk("NX (Execute Disable) protection: active\n");
+#endif
+
        pagetable_init();
 
        load_cr3(swapper_pg_dir);
@@ -393,7 +457,15 @@ void __init paging_init(void)
                set_in_cr4(X86_CR4_PAE);
 #endif
        __flush_tlb_all();
-
+       /*
+        * Subtle. SMP is doing its boot stuff late (because it has to
+        * fork idle threads) - but it also needs low mappings for the
+        * protected-mode entry to work. We zap these entries only after
+        * the WP-bit has been tested.
+        */
+#ifndef CONFIG_SMP
+       zap_low_mappings();
+#endif
        kmap_init();
        zone_sizes_init();
 }
@@ -512,22 +584,18 @@ void __init mem_init(void)
        if (boot_cpu_data.wp_works_ok < 0)
                test_wp_bit();
 
-       /*
-        * Subtle. SMP is doing it's boot stuff late (because it has to
-        * fork idle threads) - but it also needs low mappings for the
-        * protected-mode entry to work. We zap these entries only after
-        * the WP-bit has been tested.
-        */
-#ifndef CONFIG_SMP
-       zap_low_mappings();
-#endif
+       entry_trampoline_setup();
+       default_ldt_page = virt_to_page(default_ldt);
+       load_LDT(&init_mm.context);
 }
 
-kmem_cache_t *pgd_cache;
-kmem_cache_t *pmd_cache;
+kmem_cache_t *pgd_cache, *pmd_cache, *kpmd_cache;
 
 void __init pgtable_cache_init(void)
 {
+       void (*ctor)(void *, kmem_cache_t *, unsigned long);
+       void (*dtor)(void *, kmem_cache_t *, unsigned long);
+
        if (PTRS_PER_PMD > 1) {
                pmd_cache = kmem_cache_create("pmd",
                                        PTRS_PER_PMD*sizeof(pmd_t),
@@ -537,13 +605,36 @@ void __init pgtable_cache_init(void)
                                        NULL);
                if (!pmd_cache)
                        panic("pgtable_cache_init(): cannot create pmd cache");
+
+               if (TASK_SIZE > PAGE_OFFSET) {
+                       kpmd_cache = kmem_cache_create("kpmd",
+                                       PTRS_PER_PMD*sizeof(pmd_t),
+                                       PTRS_PER_PMD*sizeof(pmd_t),
+                                       0,
+                                       kpmd_ctor,
+                                       NULL);
+                       if (!kpmd_cache)
+                               panic("pgtable_cache_init(): "
+                                               "cannot create kpmd cache");
+               }
        }
+
+       if (PTRS_PER_PMD == 1 || TASK_SIZE <= PAGE_OFFSET)
+               ctor = pgd_ctor;
+       else
+               ctor = NULL;
+
+       if (PTRS_PER_PMD == 1 && TASK_SIZE <= PAGE_OFFSET)
+               dtor = pgd_dtor;
+       else
+               dtor = NULL;
+
        pgd_cache = kmem_cache_create("pgd",
                                PTRS_PER_PGD*sizeof(pgd_t),
                                PTRS_PER_PGD*sizeof(pgd_t),
                                0,
-                               pgd_ctor,
-                               PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
+                               ctor,
+                               dtor);
        if (!pgd_cache)
                panic("pgtable_cache_init(): Cannot create pgd cache");
 }
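
The ctor/dtor selection above reduces to a small truth table: pgd_ctor is used unless both a real pmd level exists and TASK_SIZE extends past PAGE_OFFSET, and pgd_dtor only when neither holds. A standalone sketch that prints all four combinations (illustrative only):

    #include <stdio.h>

    static void show(int ptrs_per_pmd, int task_above_offset)
    {
            const char *ctor = (ptrs_per_pmd == 1 || !task_above_offset)
                                    ? "pgd_ctor" : "NULL";
            const char *dtor = (ptrs_per_pmd == 1 && !task_above_offset)
                                    ? "pgd_dtor" : "NULL";

            printf("PTRS_PER_PMD%s1, TASK_SIZE%sPAGE_OFFSET: ctor=%s dtor=%s\n",
                   ptrs_per_pmd == 1 ? "==" : ">",
                   task_above_offset ? ">" : "<=", ctor, dtor);
    }

    int main(void)
    {
            show(1, 0); show(1, 1); show(3, 0); show(3, 1);
            return 0;
    }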
@@ -552,7 +643,7 @@ void __init pgtable_cache_init(void)
  * This function cannot be __init, since exceptions don't work in that
  * section.  Put this after the callers, so that it cannot be inlined.
  */
-static int do_test_wp_bit(void)
+static int noinline do_test_wp_bit(void)
 {
        char tmp_reg;
        int flag;