Fedora kernel-2.6.17-1.2142_FC4 patched with stable patch-2.6.17.4-vs2.0.2-rc26.diff
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index c03b398..95f3209 100644
@@ -19,9 +19,9 @@
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/proc_fs.h>
+#include <linux/bitops.h>
 
 #include <asm/a.out.h>
-#include <asm/bitops.h>
 #include <asm/dma.h>
 #include <asm/ia32.h>
 #include <asm/io.h>
@@ -39,6 +39,9 @@
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 
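+/*
+ * Per-CPU cache ("quicklist") of free, zeroed page-table pages, and the
+ * current length of that list in pages.
+ */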
+DEFINE_PER_CPU(unsigned long *, __pgtable_quicklist);
+DEFINE_PER_CPU(long, __pgtable_quicklist_size);
+
 extern void ia64_tlb_init (void);
 
 unsigned long MAX_DMA_ADDRESS = PAGE_OFFSET + 0x100000000UL;
@@ -50,53 +53,86 @@ struct page *vmem_map;
 EXPORT_SYMBOL(vmem_map);
 #endif
 
-static int pgt_cache_water[2] = { 25, 50 };
-
-struct page *zero_page_memmap_ptr;             /* map entry for zero page */
+struct page *zero_page_memmap_ptr;     /* map entry for zero page */
 EXPORT_SYMBOL(zero_page_memmap_ptr);
 
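+/*
+ * Tunables for trimming the per-CPU page-table quicklist: never trim it
+ * below MIN_PGT_PAGES, free at most MAX_PGT_FREES_PER_PASS pages per
+ * trim pass, and cap the list at 1/PGT_FRACTION_OF_NODE_MEM of the
+ * node's free memory.
+ */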
+#define MIN_PGT_PAGES                  25UL
+#define MAX_PGT_FREES_PER_PASS         16L
+#define PGT_FRACTION_OF_NODE_MEM       16
+
+static inline long
+max_pgt_pages(void)
+{
+       u64 node_free_pages, max_pgt_pages;
+
+#ifndef        CONFIG_NUMA
+       node_free_pages = nr_free_pages();
+#else
+       node_free_pages = nr_free_pages_pgdat(NODE_DATA(numa_node_id()));
+#endif
+       max_pgt_pages = node_free_pages / PGT_FRACTION_OF_NODE_MEM;
+       max_pgt_pages = max(max_pgt_pages, MIN_PGT_PAGES);
+       return max_pgt_pages;
+}
+
+static inline long
+min_pages_to_free(void)
+{
+       long pages_to_free;
+
+       pages_to_free = pgtable_quicklist_size - max_pgt_pages();
+       pages_to_free = min(pages_to_free, MAX_PGT_FREES_PER_PASS);
+       return pages_to_free;
+}
+
 void
-check_pgt_cache (void)
+check_pgt_cache(void)
 {
-       int low, high;
-
-       low = pgt_cache_water[0];
-       high = pgt_cache_water[1];
-
-       if (pgtable_cache_size > (u64) high) {
-               do {
-                       if (pgd_quicklist)
-                               free_page((unsigned long)pgd_alloc_one_fast(0));
-                       if (pmd_quicklist)
-                               free_page((unsigned long)pmd_alloc_one_fast(0, 0));
-               } while (pgtable_cache_size > (u64) low);
+       long pages_to_free;
+
+       if (unlikely(pgtable_quicklist_size <= MIN_PGT_PAGES))
+               return;
+
+       preempt_disable();
+       while (unlikely((pages_to_free = min_pages_to_free()) > 0)) {
+               while (pages_to_free--) {
+                       free_page((unsigned long)pgtable_quicklist_alloc());
+               }
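+               /*
+                * Open a preemption window between passes so that trimming
+                * a long list doesn't keep this CPU unpreemptible for long.
+                */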
+               preempt_enable();
+               preempt_disable();
        }
+       preempt_enable();
 }
 
 void
-update_mmu_cache (struct vm_area_struct *vma, unsigned long vaddr, pte_t pte)
+lazy_mmu_prot_update (pte_t pte)
 {
        unsigned long addr;
        struct page *page;
+       unsigned long order;
 
        if (!pte_exec(pte))
                return;                         /* not an executable page... */
 
        page = pte_page(pte);
-       /* don't use VADDR: it may not be mapped on this CPU (or may have just been flushed): */
        addr = (unsigned long) page_address(page);
 
        if (test_bit(PG_arch_1, &page->flags))
                return;                         /* i-cache is already coherent with d-cache */
 
-       flush_icache_range(addr, addr + PAGE_SIZE);
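+       /*
+        * For a compound (e.g. huge) page, this kernel stashes the
+        * allocation order in page[1].lru.prev, so flush the icache over
+        * the whole 1UL << order page range, not just one page.
+        */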
+       if (PageCompound(page)) {
+               order = (unsigned long) (page[1].lru.prev);
+               flush_icache_range(addr, addr + (1UL << order << PAGE_SHIFT));
+       } else
+               flush_icache_range(addr, addr + PAGE_SIZE);
        set_bit(PG_arch_1, &page->flags);       /* mark page as clean */
 }
 
 inline void
 ia64_set_rbs_bot (void)
 {
-       unsigned long stack_size = current->rlim[RLIMIT_STACK].rlim_max & -16;
+       unsigned long stack_size = current->signal->rlim[RLIMIT_STACK].rlim_max & -16;
 
        if (stack_size > MAX_USER_STACK_SIZE)
                stack_size = MAX_USER_STACK_SIZE;
@@ -128,8 +164,14 @@ ia64_init_addr_space (void)
                vma->vm_start = current->thread.rbs_bot & PAGE_MASK;
                vma->vm_end = vma->vm_start + PAGE_SIZE;
                vma->vm_page_prot = protection_map[VM_DATA_DEFAULT_FLAGS & 0x7];
-               vma->vm_flags = VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE|VM_GROWSUP;
-               insert_vm_struct(current->mm, vma);
+               vma->vm_flags = VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT;
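+               /*
+                * insert_vm_struct() can fail; free the vma rather than
+                * leak it, and give up on the register-backing-store
+                * mapping in that case.
+                */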
+               down_write(&current->mm->mmap_sem);
+               if (insert_vm_struct(current->mm, vma)) {
+                       up_write(&current->mm->mmap_sem);
+                       kmem_cache_free(vm_area_cachep, vma);
+                       return;
+               }
+               up_write(&current->mm->mmap_sem);
        }
 
        /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */
@@ -141,7 +183,13 @@ ia64_init_addr_space (void)
                        vma->vm_end = PAGE_SIZE;
                        vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT);
                        vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | VM_RESERVED;
-                       insert_vm_struct(current->mm, vma);
+                       down_write(&current->mm->mmap_sem);
+                       if (insert_vm_struct(current->mm, vma)) {
+                               up_write(&current->mm->mmap_sem);
+                               kmem_cache_free(vm_area_cachep, vma);
+                               return;
+                       }
+                       up_write(&current->mm->mmap_sem);
                }
        }
 }
@@ -155,7 +203,7 @@ free_initmem (void)
        eaddr = (unsigned long) ia64_imva(__init_end);
        while (addr < eaddr) {
                ClearPageReserved(virt_to_page(addr));
-               set_page_count(virt_to_page(addr), 1);
+               init_page_count(virt_to_page(addr));
                free_page(addr);
                ++totalram_pages;
                addr += PAGE_SIZE;
@@ -164,12 +212,12 @@ free_initmem (void)
               (__init_end - __init_begin) >> 10);
 }
 
-void
+void __init
 free_initrd_mem (unsigned long start, unsigned long end)
 {
        struct page *page;
        /*
-        * EFI uses 4KB pages while the kernel can use 4KB  or bigger.
+        * EFI uses 4KB pages while the kernel can use 4KB or bigger.
         * Thus EFI and the kernel may have different page sizes. It is
         * therefore possible to have the initrd share the same page as
         * the end of the kernel (given current setup).
@@ -210,19 +258,27 @@ free_initrd_mem (unsigned long start, unsigned long end)
                        continue;
                page = virt_to_page(start);
                ClearPageReserved(page);
-               set_page_count(page, 1);
+               init_page_count(page);
                free_page(start);
                ++totalram_pages;
        }
 }
 
+int page_is_ram(unsigned long pagenr)
+{
+       /* FIXME: implement properly with an EFI memory-map walk */
+       printk(KERN_DEBUG "%s: pfn %lu assumed to be RAM\n",
+              __FUNCTION__, pagenr);
+       return 1;
+}
+
 /*
  * This installs a clean page in the kernel's page table.
  */
-struct page *
+static struct page * __init
 put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot)
 {
        pgd_t *pgd;
+       pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
 
@@ -232,34 +288,34 @@ put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot)
 
        pgd = pgd_offset_k(address);            /* note: this is NOT pgd_offset()! */
 
-       spin_lock(&init_mm.page_table_lock);
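+       /*
+        * No explicit page_table_lock is needed here any more:
+        * pte_alloc_kernel() (and the pud/pmd allocators, for init_mm)
+        * take init_mm's page_table_lock internally.
+        */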
        {
-               pmd = pmd_alloc(&init_mm, pgd, address);
+               pud = pud_alloc(&init_mm, pgd, address);
+               if (!pud)
+                       goto out;
+               pmd = pmd_alloc(&init_mm, pud, address);
                if (!pmd)
                        goto out;
-               pte = pte_alloc_map(&init_mm, pmd, address);
+               pte = pte_alloc_kernel(pmd, address);
                if (!pte)
                        goto out;
-               if (!pte_none(*pte)) {
-                       pte_unmap(pte);
+               if (!pte_none(*pte))
                        goto out;
-               }
                set_pte(pte, mk_pte(page, pgprot));
-               pte_unmap(pte);
        }
-  out: spin_unlock(&init_mm.page_table_lock);
+  out:
        /* no need for flush_tlb */
        return page;
 }
 
-static void
+static void __init
 setup_gate (void)
 {
        struct page *page;
 
        /*
-        * Map the gate page twice: once read-only to export the ELF headers etc. and once
-        * execute-only page to enable privilege-promotion via "epc":
+        * Map the gate page twice: once read-only to export the ELF
+        * headers etc. and once execute-only page to enable
+        * privilege-promotion via "epc":
         */
        page = virt_to_page(ia64_imva(__start_gate_section));
        put_kernel_page(page, GATE_ADDR, PAGE_READONLY);
@@ -268,6 +324,20 @@ setup_gate (void)
        put_kernel_page(page, GATE_ADDR + PAGE_SIZE, PAGE_GATE);
 #else
        put_kernel_page(page, GATE_ADDR + PERCPU_PAGE_SIZE, PAGE_GATE);
+       /* Fill in the holes (if any) with read-only zero pages: */
+       {
+               unsigned long addr;
+
+               for (addr = GATE_ADDR + PAGE_SIZE;
+                    addr < GATE_ADDR + PERCPU_PAGE_SIZE;
+                    addr += PAGE_SIZE)
+               {
+                       put_kernel_page(ZERO_PAGE(0), addr,
+                                       PAGE_READONLY);
+                       put_kernel_page(ZERO_PAGE(0), addr + PERCPU_PAGE_SIZE,
+                                       PAGE_READONLY);
+               }
+       }
 #endif
        ia64_patch_gate();
 }
@@ -277,7 +347,6 @@ ia64_mmu_init (void *my_cpu_data)
 {
        unsigned long psr, pta, impl_va_bits;
        extern void __devinit tlb_init (void);
-       int cpu;
 
 #ifdef CONFIG_DISABLE_VHPT
 #      define VHPT_ENABLE_BIT  0
@@ -321,13 +390,22 @@ ia64_mmu_init (void *my_cpu_data)
 
        if (impl_va_bits < 51 || impl_va_bits > 61)
                panic("CPU has bogus IMPL_VA_MSB value of %lu!\n", impl_va_bits - 1);
+       /*
+        * mapped_space_bits - PAGE_SHIFT is the total number of ptes we need,
+        * which must fit into "vmlpt_bits - pte_bits" slots. Second half of
+        * the test makes sure that our mapped space doesn't overlap the
+        * unimplemented hole in the middle of the region.
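+        *
+        * For example, assuming the three-level definitions earlier in
+        * this function (pte_bits = 3, mapped_space_bits =
+        * 3*(PAGE_SHIFT - pte_bits) + PAGE_SHIFT, vmlpt_bits =
+        * impl_va_bits - PAGE_SHIFT + pte_bits): with 16KB pages,
+        * mapped_space_bits = 3*11 + 14 = 47, and with impl_va_bits = 51,
+        * vmlpt_bits = 40. The 2^33 ptes we need fit in the 2^37
+        * available slots, and 47 <= 50, so both checks pass.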
+        */
+       if ((mapped_space_bits - PAGE_SHIFT > vmlpt_bits - pte_bits) ||
+           (mapped_space_bits > impl_va_bits - 1))
+               panic("Cannot build a big enough virtual-linear page table"
+                     " to cover mapped address space.\n"
+                     " Try using a smaller page size.\n");
 
        /* place the VMLPT at the end of each page-table mapped region: */
        pta = POW2(61) - POW2(vmlpt_bits);
 
-       if (POW2(mapped_space_bits) >= pta)
-               panic("mm/init: overlap between virtually mapped linear page table and "
-                     "mapped kernel space!");
        /*
         * Set the (virtually mapped linear) page table address.  Bit
         * 8 selects between the short and long format, bits 2-7 the
@@ -342,31 +420,18 @@ ia64_mmu_init (void *my_cpu_data)
        ia64_set_rr(HPAGE_REGION_BASE, HPAGE_SHIFT << 2);
        ia64_srlz_d();
 #endif
-
-       cpu = smp_processor_id();
-
-       /* mca handler uses cr.lid as key to pick the right entry */
-       ia64_mca_tlb_list[cpu].cr_lid = ia64_getreg(_IA64_REG_CR_LID);
-
-       /* insert this percpu data information into our list for MCA recovery purposes */
-       ia64_mca_tlb_list[cpu].percpu_paddr = pte_val(mk_pte_phys(__pa(my_cpu_data), PAGE_KERNEL));
-       /* Also save per-cpu tlb flush recipe for use in physical mode mca handler */
-       ia64_mca_tlb_list[cpu].ptce_base = local_cpu_data->ptce_base;
-       ia64_mca_tlb_list[cpu].ptce_count[0] = local_cpu_data->ptce_count[0];
-       ia64_mca_tlb_list[cpu].ptce_count[1] = local_cpu_data->ptce_count[1];
-       ia64_mca_tlb_list[cpu].ptce_stride[0] = local_cpu_data->ptce_stride[0];
-       ia64_mca_tlb_list[cpu].ptce_stride[1] = local_cpu_data->ptce_stride[1];
 }
 
 #ifdef CONFIG_VIRTUAL_MEM_MAP
 
-int
+int __init
 create_mem_map_page_table (u64 start, u64 end, void *arg)
 {
        unsigned long address, start_page, end_page;
        struct page *map_start, *map_end;
        int node;
        pgd_t *pgd;
+       pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
 
@@ -381,7 +446,11 @@ create_mem_map_page_table (u64 start, u64 end, void *arg)
                pgd = pgd_offset_k(address);
                if (pgd_none(*pgd))
                        pgd_populate(&init_mm, pgd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
-               pmd = pmd_offset(pgd, address);
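+               /* 4-level page tables: populate the new pud level too. */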
+               pud = pud_offset(pgd, address);
+
+               if (pud_none(*pud))
+                       pud_populate(&init_mm, pud, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
+               pmd = pmd_offset(pud, address);
 
                if (pmd_none(*pmd))
                        pmd_populate_kernel(&init_mm, pmd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
@@ -408,7 +477,6 @@ virtual_memmap_init (u64 start, u64 end, void *arg)
        struct page *map_start, *map_end;
 
        args = (struct memmap_init_callback_data *) arg;
-
        map_start = vmem_map + (__pa(start) >> PAGE_SHIFT);
        map_end   = vmem_map + (__pa(end) >> PAGE_SHIFT);
 
@@ -427,20 +495,22 @@ virtual_memmap_init (u64 start, u64 end, void *arg)
                    / sizeof(struct page));
 
        if (map_start < map_end)
-               memmap_init_zone(map_start, (unsigned long) (map_end - map_start),
+               memmap_init_zone((unsigned long)(map_end - map_start),
                                 args->nid, args->zone, page_to_pfn(map_start));
        return 0;
 }
 
 void
-memmap_init (struct page *start, unsigned long size, int nid,
-            unsigned long zone, unsigned long start_pfn)
+memmap_init (unsigned long size, int nid, unsigned long zone,
+            unsigned long start_pfn)
 {
        if (!vmem_map)
-               memmap_init_zone(start, size, nid, zone, start_pfn);
+               memmap_init_zone(size, nid, zone, start_pfn);
        else {
+               struct page *start;
                struct memmap_init_callback_data args;
 
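+               /*
+                * Callers no longer pass the memmap start; derive it from
+                * start_pfn instead.
+                */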
+               start = pfn_to_page(start_pfn);
                args.start = start;
                args.end = start + size;
                args.nid = nid;
@@ -456,13 +526,13 @@ ia64_pfn_valid (unsigned long pfn)
        char byte;
        struct page *pg = pfn_to_page(pfn);
 
-       return     (__get_user(byte, (char *) pg) == 0)
+       return     (__get_user(byte, (char __user *) pg) == 0)
                && ((((u64)pg & PAGE_MASK) == (((u64)(pg + 1) - 1) & PAGE_MASK))
-                       || (__get_user(byte, (char *) (pg + 1) - 1) == 0));
+                       || (__get_user(byte, (char __user *) (pg + 1) - 1) == 0));
 }
 EXPORT_SYMBOL(ia64_pfn_valid);
 
-int
+int __init
 find_largest_hole (u64 start, u64 end, void *arg)
 {
        u64 *max_gap = arg;
@@ -478,7 +548,7 @@ find_largest_hole (u64 start, u64 end, void *arg)
 }
 #endif /* CONFIG_VIRTUAL_MEM_MAP */
 
-static int
+static int __init
 count_reserved_pages (u64 start, u64 end, void *arg)
 {
        unsigned long num_reserved = 0;
@@ -499,7 +569,7 @@ count_reserved_pages (u64 start, u64 end, void *arg)
  * purposes.
  */
 
-static int nolwsys;
+static int nolwsys __initdata;
 
 static int __init
 nolwsys_setup (char *s)
@@ -510,15 +580,18 @@ nolwsys_setup (char *s)
 
 __setup("nolwsys", nolwsys_setup);
 
-void
+void __init
 mem_init (void)
 {
        long reserved_pages, codesize, datasize, initsize;
-       unsigned long num_pgt_pages;
        pg_data_t *pgdat;
        int i;
        static struct kcore_list kcore_mem, kcore_vmem, kcore_kernel;
 
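+       /*
+        * The page-table quicklist hands out one full page per table, so
+        * each level's table must be exactly one page in size.
+        */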
+       BUG_ON(PTRS_PER_PGD * sizeof(pgd_t) != PAGE_SIZE);
+       BUG_ON(PTRS_PER_PMD * sizeof(pmd_t) != PAGE_SIZE);
+       BUG_ON(PTRS_PER_PTE * sizeof(pte_t) != PAGE_SIZE);
+
 #ifdef CONFIG_PCI
        /*
         * This needs to be called _after_ the command line has been parsed but _before_
@@ -528,7 +601,7 @@ mem_init (void)
        platform_dma_init();
 #endif
 
-#ifndef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_FLATMEM
        if (!mem_map)
                BUG();
        max_mapnr = max_low_pfn;
@@ -540,8 +613,9 @@ mem_init (void)
        kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START);
        kclist_add(&kcore_kernel, _stext, _end - _stext);
 
-       for_each_pgdat(pgdat)
-               totalram_pages += free_all_bootmem_node(pgdat);
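+       /* Skip nodes that never got a bootmem map (e.g. memoryless nodes). */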
+       for_each_online_pgdat(pgdat)
+               if (pgdat->bdata->node_bootmem_map)
+                       totalram_pages += free_all_bootmem_node(pgdat);
 
        reserved_pages = 0;
        efi_memmap_walk(count_reserved_pages, &reserved_pages);
@@ -555,18 +629,6 @@ mem_init (void)
               num_physpages << (PAGE_SHIFT - 10), codesize >> 10,
               reserved_pages << (PAGE_SHIFT - 10), datasize >> 10, initsize >> 10);
 
-       /*
-        * Allow for enough (cached) page table pages so that we can map the entire memory
-        * at least once.  Each task also needs a couple of page tables pages, so add in a
-        * fudge factor for that (don't use "threads-max" here; that would be wrong!).
-        * Don't allow the cache to be more than 10% of total memory, though.
-        */
-#      define NUM_TASKS        500     /* typical number of tasks */
-       num_pgt_pages = nr_free_pages() / PTRS_PER_PGD + NUM_TASKS;
-       if (num_pgt_pages > nr_free_pages() / 10)
-               num_pgt_pages = nr_free_pages() / 10;
-       if (num_pgt_pages > (u64) pgt_cache_water[1])
-               pgt_cache_water[1] = num_pgt_pages;
 
        /*
         * For fsyscall entry points with no light-weight handler, use the ordinary
@@ -580,9 +642,45 @@ mem_init (void)
                if (!fsyscall_table[i] || nolwsys)
                        fsyscall_table[i] = sys_call_table[i] | 1;
        }
-       setup_gate();   /* setup gate pages before we free up boot memory... */
+       setup_gate();
 
 #ifdef CONFIG_IA32_SUPPORT
-       ia32_boot_gdt_init();
+       ia32_mem_init();
 #endif
 }
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+void online_page(struct page *page)
+{
+       ClearPageReserved(page);
+       init_page_count(page);
+       __free_page(page);
+       totalram_pages++;
+       num_physpages++;
+}
+
+int add_memory(u64 start, u64 size)
+{
+       pg_data_t *pgdat;
+       struct zone *zone;
+       unsigned long start_pfn = start >> PAGE_SHIFT;
+       unsigned long nr_pages = size >> PAGE_SHIFT;
+       int ret;
+
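+       /* For now, hot-added memory is assumed to belong to node 0. */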
+       pgdat = NODE_DATA(0);
+
+       zone = pgdat->node_zones + ZONE_NORMAL;
+       ret = __add_pages(zone, start_pfn, nr_pages);
+
+       if (ret)
+               printk(KERN_ERR "%s: __add_pages() failed, ret=%d\n",
+                      __FUNCTION__, ret);
+
+       return ret;
+}
+
+int remove_memory(u64 start, u64 size)
+{
+       return -EINVAL;
+}
+#endif
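
A note on the quicklist helpers used above: check_pgt_cache() trims the
per-CPU list by popping pages back off it via pgtable_quicklist_alloc()
and handing them to free_page(). The fast paths themselves live in
include/asm-ia64/pgalloc.h rather than in this file, so they do not show
up in this diff. A minimal sketch of the 2.6.16-era helpers follows,
assuming the __ia64_per_cpu_var() accessor; treat it as illustrative
rather than the exact patched source:

#define pgtable_quicklist       __ia64_per_cpu_var(__pgtable_quicklist)
#define pgtable_quicklist_size  __ia64_per_cpu_var(__pgtable_quicklist_size)

static inline void *pgtable_quicklist_alloc(void)
{
        unsigned long *ret = NULL;

        preempt_disable();

        ret = pgtable_quicklist;
        if (likely(ret != NULL)) {
                /* Pop the head page; its first word links to the next. */
                pgtable_quicklist = (unsigned long *)(*ret);
                ret[0] = 0;     /* re-zero the link word */
                --pgtable_quicklist_size;
                preempt_enable();
        } else {
                preempt_enable();
                ret = (unsigned long *)__get_free_page(GFP_KERNEL |
                                                       __GFP_ZERO);
        }

        return ret;
}

static inline void pgtable_quicklist_free(void *pgtable_entry)
{
        preempt_disable();
        /* Push the page: store the old head in its first word. */
        *(unsigned long *)pgtable_entry = (unsigned long)pgtable_quicklist;
        pgtable_quicklist = (unsigned long *)pgtable_entry;
        ++pgtable_quicklist_size;
        preempt_enable();
}

This also motivates the BUG_ON()s added to mem_init(): the list hands out
whole pages, so pgd/pmd/pte tables must each be exactly one page.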