vserver 2.0 rc7
[linux-2.6.git] / arch / ia64 / mm / init.c
index 87a8016..4eb2f52 100644 (file)
@@ -19,9 +19,9 @@
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/proc_fs.h>
+#include <linux/bitops.h>
 
 #include <asm/a.out.h>
-#include <asm/bitops.h>
 #include <asm/dma.h>
 #include <asm/ia32.h>
 #include <asm/io.h>
@@ -39,6 +39,9 @@
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 
+DEFINE_PER_CPU(unsigned long *, __pgtable_quicklist);
+DEFINE_PER_CPU(long, __pgtable_quicklist_size);
+
 extern void ia64_tlb_init (void);
 
 unsigned long MAX_DMA_ADDRESS = PAGE_OFFSET + 0x100000000UL;
@@ -50,33 +53,59 @@ struct page *vmem_map;
 EXPORT_SYMBOL(vmem_map);
 #endif
 
-static int pgt_cache_water[2] = { 25, 50 };
-
-struct page *zero_page_memmap_ptr;             /* map entry for zero page */
+struct page *zero_page_memmap_ptr;     /* map entry for zero page */
 EXPORT_SYMBOL(zero_page_memmap_ptr);
 
+#define MIN_PGT_PAGES                  25UL
+#define MAX_PGT_FREES_PER_PASS         16L
+#define PGT_FRACTION_OF_NODE_MEM       16
+
+static inline long
+max_pgt_pages(void)
+{
+       u64 node_free_pages, max_pgt_pages;
+
+#ifndef        CONFIG_NUMA
+       node_free_pages = nr_free_pages();
+#else
+       node_free_pages = nr_free_pages_pgdat(NODE_DATA(numa_node_id()));
+#endif
+       max_pgt_pages = node_free_pages / PGT_FRACTION_OF_NODE_MEM;
+       max_pgt_pages = max(max_pgt_pages, MIN_PGT_PAGES);
+       return max_pgt_pages;
+}
+
+static inline long
+min_pages_to_free(void)
+{
+       long pages_to_free;
+
+       pages_to_free = pgtable_quicklist_size - max_pgt_pages();
+       pages_to_free = min(pages_to_free, MAX_PGT_FREES_PER_PASS);
+       return pages_to_free;
+}
+
 void
-check_pgt_cache (void)
+check_pgt_cache(void)
 {
-       int low, high;
+       long pages_to_free;
 
-       low = pgt_cache_water[0];
-       high = pgt_cache_water[1];
+       if (unlikely(pgtable_quicklist_size <= MIN_PGT_PAGES))
+               return;
 
        preempt_disable();
-       if (pgtable_cache_size > (u64) high) {
-               do {
-                       if (pgd_quicklist)
-                               free_page((unsigned long)pgd_alloc_one_fast(NULL));
-                       if (pmd_quicklist)
-                               free_page((unsigned long)pmd_alloc_one_fast(NULL, 0));
-               } while (pgtable_cache_size > (u64) low);
+       while (unlikely((pages_to_free = min_pages_to_free()) > 0)) {
+               while (pages_to_free--) {
+                       free_page((unsigned long)pgtable_quicklist_alloc());
+               }
+               preempt_enable();
+               preempt_disable();
        }
        preempt_enable();
 }
 
 void
-update_mmu_cache (struct vm_area_struct *vma, unsigned long vaddr, pte_t pte)
+lazy_mmu_prot_update (pte_t pte)
 {
        unsigned long addr;
        struct page *page;
@@ -85,7 +114,6 @@ update_mmu_cache (struct vm_area_struct *vma, unsigned long vaddr, pte_t pte)
                return;                         /* not an executable page... */
 
        page = pte_page(pte);
-       /* don't use VADDR: it may not be mapped on this CPU (or may have just been flushed): */
        addr = (unsigned long) page_address(page);
 
        if (test_bit(PG_arch_1, &page->flags))
@@ -98,7 +126,7 @@ update_mmu_cache (struct vm_area_struct *vma, unsigned long vaddr, pte_t pte)
 inline void
 ia64_set_rbs_bot (void)
 {
-       unsigned long stack_size = current->rlim[RLIMIT_STACK].rlim_max & -16;
+       unsigned long stack_size = current->signal->rlim[RLIMIT_STACK].rlim_max & -16;
 
        if (stack_size > MAX_USER_STACK_SIZE)
                stack_size = MAX_USER_STACK_SIZE;
@@ -131,7 +159,13 @@ ia64_init_addr_space (void)
                vma->vm_end = vma->vm_start + PAGE_SIZE;
                vma->vm_page_prot = protection_map[VM_DATA_DEFAULT_FLAGS & 0x7];
                vma->vm_flags = VM_DATA_DEFAULT_FLAGS | VM_GROWSUP;
-               insert_vm_struct(current->mm, vma);
+               down_write(&current->mm->mmap_sem);
+               if (insert_vm_struct(current->mm, vma)) {
+                       up_write(&current->mm->mmap_sem);
+                       kmem_cache_free(vm_area_cachep, vma);
+                       return;
+               }
+               up_write(&current->mm->mmap_sem);
        }
 
        /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */
@@ -143,7 +177,13 @@ ia64_init_addr_space (void)
                        vma->vm_end = PAGE_SIZE;
                        vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT);
                        vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | VM_RESERVED;
-                       insert_vm_struct(current->mm, vma);
+                       down_write(&current->mm->mmap_sem);
+                       if (insert_vm_struct(current->mm, vma)) {
+                               up_write(&current->mm->mmap_sem);
+                               kmem_cache_free(vm_area_cachep, vma);
+                               return;
+                       }
+                       up_write(&current->mm->mmap_sem);
                }
        }
 }
@@ -225,6 +265,7 @@ struct page *
 put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot)
 {
        pgd_t *pgd;
+       pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
 
@@ -236,7 +277,11 @@ put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot)
 
        spin_lock(&init_mm.page_table_lock);
        {
-               pmd = pmd_alloc(&init_mm, pgd, address);
+               pud = pud_alloc(&init_mm, pgd, address);
+               if (!pud)
+                       goto out;
+
+               pmd = pmd_alloc(&init_mm, pud, address);
                if (!pmd)
                        goto out;
                pte = pte_alloc_map(&init_mm, pmd, address);
@@ -260,8 +305,9 @@ setup_gate (void)
        struct page *page;
 
        /*
-        * Map the gate page twice: once read-only to export the ELF headers etc. and once
-        * execute-only page to enable privilege-promotion via "epc":
+        * Map the gate page twice: once read-only to export the ELF
+        * headers etc. and once execute-only page to enable
+        * privilege-promotion via "epc":
         */
        page = virt_to_page(ia64_imva(__start_gate_section));
        put_kernel_page(page, GATE_ADDR, PAGE_READONLY);
@@ -270,6 +316,20 @@ setup_gate (void)
        put_kernel_page(page, GATE_ADDR + PAGE_SIZE, PAGE_GATE);
 #else
        put_kernel_page(page, GATE_ADDR + PERCPU_PAGE_SIZE, PAGE_GATE);
+       /* Fill in the holes (if any) with read-only zero pages: */
+       {
+               unsigned long addr;
+
+               for (addr = GATE_ADDR + PAGE_SIZE;
+                    addr < GATE_ADDR + PERCPU_PAGE_SIZE;
+                    addr += PAGE_SIZE)
+               {
+                       put_kernel_page(ZERO_PAGE(0), addr,
+                                       PAGE_READONLY);
+                       put_kernel_page(ZERO_PAGE(0), addr + PERCPU_PAGE_SIZE,
+                                       PAGE_READONLY);
+               }
+       }
 #endif
        ia64_patch_gate();
 }
@@ -279,7 +339,6 @@ ia64_mmu_init (void *my_cpu_data)
 {
        unsigned long psr, pta, impl_va_bits;
        extern void __devinit tlb_init (void);
-       int cpu;
 
 #ifdef CONFIG_DISABLE_VHPT
 #      define VHPT_ENABLE_BIT  0
@@ -344,20 +403,6 @@ ia64_mmu_init (void *my_cpu_data)
        ia64_set_rr(HPAGE_REGION_BASE, HPAGE_SHIFT << 2);
        ia64_srlz_d();
 #endif
-
-       cpu = smp_processor_id();
-
-       /* mca handler uses cr.lid as key to pick the right entry */
-       ia64_mca_tlb_list[cpu].cr_lid = ia64_getreg(_IA64_REG_CR_LID);
-
-       /* insert this percpu data information into our list for MCA recovery purposes */
-       ia64_mca_tlb_list[cpu].percpu_paddr = pte_val(mk_pte_phys(__pa(my_cpu_data), PAGE_KERNEL));
-       /* Also save per-cpu tlb flush recipe for use in physical mode mca handler */
-       ia64_mca_tlb_list[cpu].ptce_base = local_cpu_data->ptce_base;
-       ia64_mca_tlb_list[cpu].ptce_count[0] = local_cpu_data->ptce_count[0];
-       ia64_mca_tlb_list[cpu].ptce_count[1] = local_cpu_data->ptce_count[1];
-       ia64_mca_tlb_list[cpu].ptce_stride[0] = local_cpu_data->ptce_stride[0];
-       ia64_mca_tlb_list[cpu].ptce_stride[1] = local_cpu_data->ptce_stride[1];
 }
 
 #ifdef CONFIG_VIRTUAL_MEM_MAP
@@ -369,6 +414,7 @@ create_mem_map_page_table (u64 start, u64 end, void *arg)
        struct page *map_start, *map_end;
        int node;
        pgd_t *pgd;
+       pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
 
@@ -383,7 +429,11 @@ create_mem_map_page_table (u64 start, u64 end, void *arg)
                pgd = pgd_offset_k(address);
                if (pgd_none(*pgd))
                        pgd_populate(&init_mm, pgd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
-               pmd = pmd_offset(pgd, address);
+               pud = pud_offset(pgd, address);
+
+               if (pud_none(*pud))
+                       pud_populate(&init_mm, pud, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
+               pmd = pmd_offset(pud, address);
 
                if (pmd_none(*pmd))
                        pmd_populate_kernel(&init_mm, pmd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
@@ -410,7 +460,6 @@ virtual_memmap_init (u64 start, u64 end, void *arg)
        struct page *map_start, *map_end;
 
        args = (struct memmap_init_callback_data *) arg;
-
        map_start = vmem_map + (__pa(start) >> PAGE_SHIFT);
        map_end   = vmem_map + (__pa(end) >> PAGE_SHIFT);
 
@@ -518,11 +567,14 @@ void
 mem_init (void)
 {
        long reserved_pages, codesize, datasize, initsize;
-       unsigned long num_pgt_pages;
        pg_data_t *pgdat;
        int i;
        static struct kcore_list kcore_mem, kcore_vmem, kcore_kernel;
 
+       BUG_ON(PTRS_PER_PGD * sizeof(pgd_t) != PAGE_SIZE);
+       BUG_ON(PTRS_PER_PMD * sizeof(pmd_t) != PAGE_SIZE);
+       BUG_ON(PTRS_PER_PTE * sizeof(pte_t) != PAGE_SIZE);
+
 #ifdef CONFIG_PCI
        /*
         * This needs to be called _after_ the command line has been parsed but _before_
@@ -559,18 +611,6 @@ mem_init (void)
               num_physpages << (PAGE_SHIFT - 10), codesize >> 10,
               reserved_pages << (PAGE_SHIFT - 10), datasize >> 10, initsize >> 10);
 
-       /*
-        * Allow for enough (cached) page table pages so that we can map the entire memory
-        * at least once.  Each task also needs a couple of page tables pages, so add in a
-        * fudge factor for that (don't use "threads-max" here; that would be wrong!).
-        * Don't allow the cache to be more than 10% of total memory, though.
-        */
-#      define NUM_TASKS        500     /* typical number of tasks */
-       num_pgt_pages = nr_free_pages() / PTRS_PER_PGD + NUM_TASKS;
-       if (num_pgt_pages > nr_free_pages() / 10)
-               num_pgt_pages = nr_free_pages() / 10;
-       if (num_pgt_pages > (u64) pgt_cache_water[1])
-               pgt_cache_water[1] = num_pgt_pages;
 
        /*
         * For fsyscall entrpoints with no light-weight handler, use the ordinary