X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=mm%2Fvmalloc.c;h=266162d2ba28254a9c8ca4366a8262d81d032f3c;hb=16c70f8c1b54b61c3b951b6fb220df250fe09b32;hp=c7f7018278b43deef91770a8eeddfe9dbda02493;hpb=5273a3df6485dc2ad6aa7ddd441b9a21970f003b;p=linux-2.6.git

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index c7f701827..266162d2b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -5,6 +5,7 @@
  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
  *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
  *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
+ *  Numa awareness, Christoph Lameter, SGI, June 2005
  */
 
 #include <linux/mm.h>
@@ -17,197 +18,192 @@
 #include <linux/vmalloc.h>
 
 #include <asm/uaccess.h>
-#include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 
-rwlock_t vmlist_lock = RW_LOCK_UNLOCKED;
+DEFINE_RWLOCK(vmlist_lock);
 struct vm_struct *vmlist;
 
-static void unmap_area_pte(pmd_t *pmd, unsigned long address,
-                                  unsigned long size)
+static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
 {
-        unsigned long end;
         pte_t *pte;
 
-        if (pmd_none(*pmd))
-                return;
-        if (pmd_bad(*pmd)) {
-                pmd_ERROR(*pmd);
-                pmd_clear(pmd);
-                return;
-        }
-
-        pte = pte_offset_kernel(pmd, address);
-        address &= ~PMD_MASK;
-        end = address + size;
-        if (end > PMD_SIZE)
-                end = PMD_SIZE;
-
+        pte = pte_offset_kernel(pmd, addr);
         do {
-                pte_t page;
-                page = ptep_get_and_clear(pte);
-                address += PAGE_SIZE;
-                pte++;
-                if (pte_none(page))
-                        continue;
-                if (pte_present(page))
-                        continue;
-                printk(KERN_CRIT "Whee.. Swapped out page in kernel page table\n");
-        } while (address < end);
+                pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
+                WARN_ON(!pte_none(ptent) && !pte_present(ptent));
+        } while (pte++, addr += PAGE_SIZE, addr != end);
 }
 
-static void unmap_area_pmd(pgd_t *dir, unsigned long address,
-                                  unsigned long size)
+static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr,
+                                                unsigned long end)
 {
-        unsigned long end;
         pmd_t *pmd;
+        unsigned long next;
 
-        if (pgd_none(*dir))
-                return;
-        if (pgd_bad(*dir)) {
-                pgd_ERROR(*dir);
-                pgd_clear(dir);
-                return;
-        }
+        pmd = pmd_offset(pud, addr);
+        do {
+                next = pmd_addr_end(addr, end);
+                if (pmd_none_or_clear_bad(pmd))
+                        continue;
+                vunmap_pte_range(pmd, addr, next);
+        } while (pmd++, addr = next, addr != end);
+}
 
-        pmd = pmd_offset(dir, address);
-        address &= ~PGDIR_MASK;
-        end = address + size;
-        if (end > PGDIR_SIZE)
-                end = PGDIR_SIZE;
+static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr,
+                                                unsigned long end)
+{
+        pud_t *pud;
+        unsigned long next;
 
+        pud = pud_offset(pgd, addr);
         do {
-                unmap_area_pte(pmd, address, end - address);
-                address = (address + PMD_SIZE) & PMD_MASK;
-                pmd++;
-        } while (address < end);
+                next = pud_addr_end(addr, end);
+                if (pud_none_or_clear_bad(pud))
+                        continue;
+                vunmap_pmd_range(pud, addr, next);
+        } while (pud++, addr = next, addr != end);
 }
 
-static int map_area_pte(pte_t *pte, unsigned long address,
-                        unsigned long size, pgprot_t prot,
-                        struct page ***pages)
+void unmap_vm_area(struct vm_struct *area)
 {
-        unsigned long end;
+        pgd_t *pgd;
+        unsigned long next;
+        unsigned long addr = (unsigned long) area->addr;
+        unsigned long end = addr + area->size;
+
+        BUG_ON(addr >= end);
+        pgd = pgd_offset_k(addr);
+        flush_cache_vunmap(addr, end);
+        do {
+                next = pgd_addr_end(addr, end);
+                if (pgd_none_or_clear_bad(pgd))
+                        continue;
+                vunmap_pud_range(pgd, addr, next);
+        } while (pgd++, addr = next, addr != end);
+        flush_tlb_kernel_range((unsigned long) area->addr, end);
+}
 
-        address &= ~PMD_MASK;
-        end = address + size;
-        if (end > PMD_SIZE)
-                end = PMD_SIZE;
 
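The rewritten unmap path above follows the walker idiom that the 4-level page-table code introduced: each level computes the boundary of the current upper-level entry with pgd_addr_end()/pud_addr_end()/pmd_addr_end(), recurses over [addr, next), and advances with "p++, addr = next" until addr == end. For reference, the generic fallback for these helpers lived in include/asm-generic/pgtable.h; the sketch below is quoted from memory of that era, so verify it against the tree at hand. The "- 1" comparisons keep the clamp correct even when end sits at the very top of the address space, where (addr) + PGDIR_SIZE would wrap to 0.

        /*
         * Sketch of the generic boundary helper: the end of the current
         * PGDIR-sized region, or 'end' itself, whichever comes first.
         */
        #define pgd_addr_end(addr, end)                                         \
        ({      unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;  \
                (__boundary - 1 < (end) - 1) ? __boundary : (end);              \
        })
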
+static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
+                        unsigned long end, pgprot_t prot, struct page ***pages)
+{
+        pte_t *pte;
+
+        pte = pte_alloc_kernel(pmd, addr);
+        if (!pte)
+                return -ENOMEM;
         do {
                 struct page *page = **pages;
-                WARN_ON(!pte_none(*pte));
                 if (!page)
                         return -ENOMEM;
-
-                set_pte(pte, mk_pte(page, prot));
-                address += PAGE_SIZE;
-                pte++;
+                set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
                 (*pages)++;
-        } while (address < end);
+        } while (pte++, addr += PAGE_SIZE, addr != end);
         return 0;
 }
 
-static int map_area_pmd(pmd_t *pmd, unsigned long address,
-                        unsigned long size, pgprot_t prot,
-                        struct page ***pages)
+static inline int vmap_pmd_range(pud_t *pud, unsigned long addr,
+                        unsigned long end, pgprot_t prot, struct page ***pages)
 {
-        unsigned long base, end;
-
-        base = address & PGDIR_MASK;
-        address &= ~PGDIR_MASK;
-        end = address + size;
-        if (end > PGDIR_SIZE)
-                end = PGDIR_SIZE;
+        pmd_t *pmd;
+        unsigned long next;
 
+        pmd = pmd_alloc(&init_mm, pud, addr);
+        if (!pmd)
+                return -ENOMEM;
         do {
-                pte_t * pte = pte_alloc_kernel(&init_mm, pmd, base + address);
-                if (!pte)
+                next = pmd_addr_end(addr, end);
+                if (vmap_pte_range(pmd, addr, next, prot, pages))
                         return -ENOMEM;
-                if (map_area_pte(pte, address, end - address, prot, pages))
-                        return -ENOMEM;
-                address = (address + PMD_SIZE) & PMD_MASK;
-                pmd++;
-        } while (address < end);
-
+        } while (pmd++, addr = next, addr != end);
         return 0;
 }
 
-void unmap_vm_area(struct vm_struct *area)
+static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr,
+                        unsigned long end, pgprot_t prot, struct page ***pages)
 {
-        unsigned long address = (unsigned long) area->addr;
-        unsigned long end = (address + area->size);
-        pgd_t *dir;
+        pud_t *pud;
+        unsigned long next;
 
-        dir = pgd_offset_k(address);
-        flush_cache_vunmap(address, end);
+        pud = pud_alloc(&init_mm, pgd, addr);
+        if (!pud)
+                return -ENOMEM;
         do {
-                unmap_area_pmd(dir, address, end - address);
-                address = (address + PGDIR_SIZE) & PGDIR_MASK;
-                dir++;
-        } while (address && (address < end));
-        flush_tlb_kernel_range((unsigned long) area->addr, end);
+                next = pud_addr_end(addr, end);
+                if (vmap_pmd_range(pud, addr, next, prot, pages))
+                        return -ENOMEM;
+        } while (pud++, addr = next, addr != end);
+        return 0;
 }
 
 int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
 {
-        unsigned long address = (unsigned long) area->addr;
-        unsigned long end = address + (area->size-PAGE_SIZE);
-        pgd_t *dir;
-        int err = 0;
-
-        dir = pgd_offset_k(address);
-        spin_lock(&init_mm.page_table_lock);
+        pgd_t *pgd;
+        unsigned long next;
+        unsigned long addr = (unsigned long) area->addr;
+        unsigned long end = addr + area->size - PAGE_SIZE;
+        int err;
+
+        BUG_ON(addr >= end);
+        pgd = pgd_offset_k(addr);
         do {
-                pmd_t *pmd = pmd_alloc(&init_mm, dir, address);
-                if (!pmd) {
-                        err = -ENOMEM;
+                next = pgd_addr_end(addr, end);
+                err = vmap_pud_range(pgd, addr, next, prot, pages);
+                if (err)
                         break;
-                }
-                if (map_area_pmd(pmd, address, end - address, prot, pages)) {
-                        err = -ENOMEM;
-                        break;
-                }
-
-                address = (address + PGDIR_SIZE) & PGDIR_MASK;
-                dir++;
-        } while (address && (address < end));
-
-        spin_unlock(&init_mm.page_table_lock);
+        } while (pgd++, addr = next, addr != end);
         flush_cache_vmap((unsigned long) area->addr, end);
         return err;
 }
 
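map_vm_area() deliberately leaves page allocation to its callers: a caller reserves the virtual range first, then hands in an array of pages to wire up. vmap(), documented further down in this file, is essentially the sketch below (illustrative only, with error handling trimmed). Note that the page-table walk advances the caller's cursor through the array, which is why a separate cursor variable is passed by reference rather than the array itself.

        /* Hypothetical helper modeled on vmap(): make two scattered
         * pages appear virtually contiguous. */
        static void *map_two_pages(struct page **pages)
        {
                struct page **cursor = pages;   /* advanced by map_vm_area() */
                struct vm_struct *area = get_vm_area(2 * PAGE_SIZE, VM_MAP);

                if (!area)
                        return NULL;
                if (map_vm_area(area, PAGE_KERNEL, &cursor)) {
                        vunmap(area->addr);     /* tears down the partial mapping */
                        return NULL;
                }
                return area->addr;              /* undo later with vunmap() */
        }
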
-struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
-                                unsigned long start, unsigned long end)
+struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
+                                unsigned long start, unsigned long end, int node)
 {
         struct vm_struct **p, *tmp, *area;
-        unsigned long addr = start;
+        unsigned long align = 1;
+        unsigned long addr;
+
+        if (flags & VM_IOREMAP) {
+                int bit = fls(size);
 
-        area = kmalloc(sizeof(*area), GFP_KERNEL);
+                if (bit > IOREMAP_MAX_ORDER)
+                        bit = IOREMAP_MAX_ORDER;
+                else if (bit < PAGE_SHIFT)
+                        bit = PAGE_SHIFT;
+
+                align = 1ul << bit;
+        }
+        addr = ALIGN(start, align);
+        size = PAGE_ALIGN(size);
+
+        area = kmalloc_node(sizeof(*area), GFP_KERNEL, node);
         if (unlikely(!area))
                 return NULL;
 
-        /*
-         * We always allocate a guard page.
-         */
-        size += PAGE_SIZE;
         if (unlikely(!size)) {
                 kfree (area);
                 return NULL;
         }
 
+        /*
+         * We always allocate a guard page.
+         */
+        size += PAGE_SIZE;
+
         write_lock(&vmlist_lock);
-        for (p = &vmlist; (tmp = *p) ;p = &tmp->next) {
-                if ((unsigned long)tmp->addr < addr)
+        for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) {
+                if ((unsigned long)tmp->addr < addr) {
+                        if((unsigned long)tmp->addr + tmp->size >= addr)
+                                addr = ALIGN(tmp->size +
+                                             (unsigned long)tmp->addr, align);
                         continue;
+                }
                 if ((size + addr) < addr)
                         goto out;
                 if (size + addr <= (unsigned long)tmp->addr)
                         goto found;
-                addr = tmp->size + (unsigned long)tmp->addr;
+                addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align);
                 if (addr > end - size)
                         goto out;
         }
@@ -229,9 +225,17 @@ found:
 out:
         write_unlock(&vmlist_lock);
         kfree(area);
+        if (printk_ratelimit())
+                printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n");
         return NULL;
 }
 
+struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
+                                unsigned long start, unsigned long end)
+{
+        return __get_vm_area_node(size, flags, start, end, -1);
+}
+
 /**
  * get_vm_area - reserve a contiguous kernel virtual area
  *
@@ -247,34 +251,64 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
         return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END);
 }
 
-/**
- * remove_vm_area - find and remove a contingous kernel virtual area
- *
- * @addr: base address
- *
- * Search for the kernel VM area starting at @addr, and remove it.
- * This function returns the found VM area, but using it is NOT safe
- * on SMP machines.
- */
-struct vm_struct *remove_vm_area(void *addr)
+struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node)
+{
+        return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node);
+}
+
+/* Caller must hold vmlist_lock */
+static struct vm_struct *__find_vm_area(void *addr)
+{
+        struct vm_struct *tmp;
+
+        for (tmp = vmlist; tmp != NULL; tmp = tmp->next) {
+                if (tmp->addr == addr)
+                        break;
+        }
+
+        return tmp;
+}
+
+/* Caller must hold vmlist_lock */
+struct vm_struct *__remove_vm_area(void *addr)
 {
         struct vm_struct **p, *tmp;
 
-        write_lock(&vmlist_lock);
-        for (p = &vmlist ; (tmp = *p) ;p = &tmp->next) {
+        for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) {
                 if (tmp->addr == addr)
                         goto found;
         }
-        write_unlock(&vmlist_lock);
         return NULL;
 
 found:
         unmap_vm_area(tmp);
         *p = tmp->next;
-        write_unlock(&vmlist_lock);
+
+        /*
+         * Remove the guard page.
+         */
+        tmp->size -= PAGE_SIZE;
         return tmp;
 }
 
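The double-underscore variants added here let a caller combine a lookup with an update under one acquisition of vmlist_lock; vmalloc_user() and vmalloc_32_user() below rely on exactly this to tag a fresh allocation with VM_USERMAP. A minimal sketch of the pattern (the helper name is hypothetical, and unlike vmalloc_user() it checks the lookup result defensively):

        static void mark_user_mappable(void *addr)
        {
                struct vm_struct *area;

                write_lock(&vmlist_lock);
                area = __find_vm_area(addr);
                if (area)
                        area->flags |= VM_USERMAP;
                write_unlock(&vmlist_lock);
        }
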
+/**
+ * remove_vm_area - find and remove a contiguous kernel virtual area
+ *
+ * @addr: base address
+ *
+ * Search for the kernel VM area starting at @addr, and remove it.
+ * This function returns the found VM area, but using it is NOT safe
+ * on SMP machines, except for its size or flags.
+ */
+struct vm_struct *remove_vm_area(void *addr)
+{
+        struct vm_struct *v;
+
+        write_lock(&vmlist_lock);
+        v = __remove_vm_area(addr);
+        write_unlock(&vmlist_lock);
+        return v;
+}
+
 void __vunmap(void *addr, int deallocate_pages)
 {
         struct vm_struct *area;
@@ -284,6 +318,7 @@ void __vunmap(void *addr, int deallocate_pages)
 
         if ((PAGE_SIZE-1) & (unsigned long)addr) {
                 printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
+                WARN_ON(1);
                 return;
         }
 
@@ -291,19 +326,24 @@
         if (unlikely(!area)) {
                 printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
                                 addr);
+                WARN_ON(1);
                 return;
         }
-
+
+        debug_check_no_locks_freed(addr, area->size);
+
         if (deallocate_pages) {
                 int i;
 
                 for (i = 0; i < area->nr_pages; i++) {
-                        if (unlikely(!area->pages[i]))
-                                BUG();
+                        BUG_ON(!area->pages[i]);
                         __free_page(area->pages[i]);
                 }
 
-                kfree(area->pages);
+                if (area->flags & VM_VPAGES)
+                        vfree(area->pages);
+                else
+                        kfree(area->pages);
         }
 
         kfree(area);
@@ -316,16 +356,16 @@
  * @addr: memory base address
  *
  * Free the virtually contiguous memory area starting at @addr, as
- * obtained from vmalloc(), vmalloc_32() or __vmalloc().
+ * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
+ * NULL, no operation is performed.
  *
- * May not be called in interrupt context.
+ * Must not be called in interrupt context.
  */
 void vfree(void *addr)
 {
         BUG_ON(in_interrupt());
         __vunmap(addr, 1);
 }
-
 EXPORT_SYMBOL(vfree);
 
 /**
@@ -336,14 +376,13 @@ EXPORT_SYMBOL(vfree);
  * Free the virtually contiguous memory area starting at @addr,
 * which was created from the page array passed to vmap().
  *
- * May not be called in interrupt context.
+ * Must not be called in interrupt context.
  */
 void vunmap(void *addr)
 {
         BUG_ON(in_interrupt());
         __vunmap(addr, 0);
 }
-
 EXPORT_SYMBOL(vunmap);
 
 /**
@@ -375,39 +414,25 @@ void *vmap(struct page **pages, unsigned int count,
 
         return area->addr;
 }
-
 EXPORT_SYMBOL(vmap);
 
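The deallocate_pages argument of __vunmap() is the whole difference between the two public tear-down paths: vfree() passes 1 because the vmalloc family allocated the backing pages itself, while vunmap() passes 0 because the pages belong to whoever called vmap(). A sketch of that ownership rule from the vmap() side (names are illustrative, not from this patch):

        /* The caller allocated pages[0..nr-1] itself and keeps owning them. */
        static void use_and_unmap(struct page **pages, int nr)
        {
                int i;
                void *va = vmap(pages, nr, VM_MAP, PAGE_KERNEL);

                if (va) {
                        /* ... use the virtually contiguous buffer ... */
                        vunmap(va);             /* __vunmap(va, 0): mapping only */
                }
                for (i = 0; i < nr; i++)
                        __free_page(pages[i]);  /* pages stay the caller's to free */
        }
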
-/**
- * __vmalloc - allocate virtually contiguous memory
- *
- * @size: allocation size
- * @gfp_mask: flags for the page level allocator
- * @prot: protection mask for the allocated pages
- *
- * Allocate enough pages to cover @size from the page level
- * allocator with @gfp_mask flags.  Map them into contiguous
- * kernel virtual space, using a pagetable protection of @prot.
- */
-void *__vmalloc(unsigned long size, int gfp_mask, pgprot_t prot)
+void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
+                                pgprot_t prot, int node)
 {
-        struct vm_struct *area;
         struct page **pages;
         unsigned int nr_pages, array_size, i;
 
-        size = PAGE_ALIGN(size);
-        if (!size || (size >> PAGE_SHIFT) > num_physpages)
-                return NULL;
-
-        area = get_vm_area(size, VM_ALLOC);
-        if (!area)
-                return NULL;
-
-        nr_pages = size >> PAGE_SHIFT;
+        nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
         array_size = (nr_pages * sizeof(struct page *));
 
         area->nr_pages = nr_pages;
-        area->pages = pages = kmalloc(array_size, (gfp_mask & ~__GFP_HIGHMEM));
+        /* Please note that the recursion is strictly bounded. */
+        if (array_size > PAGE_SIZE) {
+                pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node);
+                area->flags |= VM_VPAGES;
+        } else
+                pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node);
+        area->pages = pages;
         if (!area->pages) {
                 remove_vm_area(area->addr);
                 kfree(area);
@@ -416,14 +441,17 @@ void *__vmalloc(unsigned long size, int gfp_mask, pgprot_t prot)
         memset(area->pages, 0, array_size);
 
         for (i = 0; i < area->nr_pages; i++) {
-                area->pages[i] = alloc_page(gfp_mask);
+                if (node < 0)
+                        area->pages[i] = alloc_page(gfp_mask);
+                else
+                        area->pages[i] = alloc_pages_node(node, gfp_mask, 0);
                 if (unlikely(!area->pages[i])) {
                         /* Successfully allocated i pages, free them in __vunmap() */
                         area->nr_pages = i;
                         goto fail;
                 }
         }
-
+
         if (map_vm_area(area, prot, &pages))
                 goto fail;
         return area->addr;
@@ -433,6 +461,44 @@ fail:
         return NULL;
 }
 
+void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
+{
+        return __vmalloc_area_node(area, gfp_mask, prot, -1);
+}
+
+/**
+ * __vmalloc_node - allocate virtually contiguous memory
+ *
+ * @size: allocation size
+ * @gfp_mask: flags for the page level allocator
+ * @prot: protection mask for the allocated pages
+ * @node: node to use for allocation or -1
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator with @gfp_mask flags.  Map them into contiguous
+ * kernel virtual space, using a pagetable protection of @prot.
+ */
+void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
+                        int node)
+{
+        struct vm_struct *area;
+
+        size = PAGE_ALIGN(size);
+        if (!size || (size >> PAGE_SHIFT) > num_physpages)
+                return NULL;
+
+        area = get_vm_area_node(size, VM_ALLOC, node);
+        if (!area)
+                return NULL;
+
+        return __vmalloc_area_node(area, gfp_mask, prot, node);
+}
+EXPORT_SYMBOL(__vmalloc_node);
+
+void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
+{
+        return __vmalloc_node(size, gfp_mask, prot, -1);
+}
 EXPORT_SYMBOL(__vmalloc);
 
 /**
@@ -448,11 +514,72 @@ EXPORT_SYMBOL(__vmalloc);
  */
 void *vmalloc(unsigned long size)
 {
-       return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
+        return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
 }
-
 EXPORT_SYMBOL(vmalloc);
 
+/**
+ * vmalloc_user - allocate virtually contiguous memory which has
+ *                been zeroed so it can be mapped to userspace without
+ *                leaking data.
+ *
+ * @size: allocation size
+ */
+void *vmalloc_user(unsigned long size)
+{
+        struct vm_struct *area;
+        void *ret;
+
+        ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
+        write_lock(&vmlist_lock);
+        area = __find_vm_area(ret);
+        area->flags |= VM_USERMAP;
+        write_unlock(&vmlist_lock);
+
+        return ret;
+}
+EXPORT_SYMBOL(vmalloc_user);
+
+/**
+ * vmalloc_node - allocate memory on a specific node
+ *
+ * @size: allocation size
+ * @node: numa node
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+void *vmalloc_node(unsigned long size, int node)
+{
+        return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node);
+}
+EXPORT_SYMBOL(vmalloc_node);
+
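vmalloc_node() is the NUMA payoff of threading the node argument through the chain above: with node >= 0 the backing pages come from alloc_pages_node() on that node, while node == -1 preserves the old any-node behaviour. A hedged usage sketch (RX_RING_BYTES and the fallback policy are hypothetical, not from this patch):

        static void *alloc_rx_ring(void)
        {
                /* Place the ring on the node of the CPU that sets it up. */
                void *ring = vmalloc_node(RX_RING_BYTES, numa_node_id());

                if (!ring)
                        ring = vmalloc(RX_RING_BYTES);  /* any node as fallback */
                return ring;    /* release with vfree() */
        }
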
+#ifndef PAGE_KERNEL_EXEC
+# define PAGE_KERNEL_EXEC PAGE_KERNEL
+#endif
+
+/**
+ * vmalloc_exec - allocate virtually contiguous, executable memory
+ *
+ * @size: allocation size
+ *
+ * Kernel-internal function to allocate enough pages to cover @size
+ * from the page level allocator and map them into contiguous and
+ * executable kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+
+void *vmalloc_exec(unsigned long size)
+{
+        return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
+}
+
 /**
  * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
  *
@@ -465,9 +592,30 @@ void *vmalloc_32(unsigned long size)
 {
         return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
 }
-
 EXPORT_SYMBOL(vmalloc_32);
 
+/**
+ * vmalloc_32_user - allocate virtually contiguous memory (32bit
+ *                   addressable) which is zeroed so it can be
+ *                   mapped to userspace without leaking data.
+ *
+ * @size: allocation size
+ */
+void *vmalloc_32_user(unsigned long size)
+{
+        struct vm_struct *area;
+        void *ret;
+
+        ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
+        write_lock(&vmlist_lock);
+        area = __find_vm_area(ret);
+        area->flags |= VM_USERMAP;
+        write_unlock(&vmlist_lock);
+
+        return ret;
+}
+EXPORT_SYMBOL(vmalloc_32_user);
+
 long vread(char *buf, char *addr, unsigned long count)
 {
         struct vm_struct *tmp;
@@ -542,3 +690,64 @@ finished:
         read_unlock(&vmlist_lock);
         return buf - buf_start;
 }
+
+/**
+ * remap_vmalloc_range - map vmalloc pages to userspace
+ *
+ * @vma: vma to cover (map full range of vma)
+ * @addr: vmalloc memory
+ * @pgoff: number of pages into addr before first page to map
+ * @returns: 0 for success, -Exxx on failure
+ *
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * that it is big enough to cover the vma. Will return failure if
+ * those criteria aren't met.
+ *
+ * Similar to remap_pfn_range (see mm/memory.c)
+ */
+int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
+                                                unsigned long pgoff)
+{
+        struct vm_struct *area;
+        unsigned long uaddr = vma->vm_start;
+        unsigned long usize = vma->vm_end - vma->vm_start;
+        int ret;
+
+        if ((PAGE_SIZE-1) & (unsigned long)addr)
+                return -EINVAL;
+
+        read_lock(&vmlist_lock);
+        area = __find_vm_area(addr);
+        if (!area)
+                goto out_einval_locked;
+
+        if (!(area->flags & VM_USERMAP))
+                goto out_einval_locked;
+
+        if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
+                goto out_einval_locked;
+        read_unlock(&vmlist_lock);
+
+        addr += pgoff << PAGE_SHIFT;
+        do {
+                struct page *page = vmalloc_to_page(addr);
+                ret = vm_insert_page(vma, uaddr, page);
+                if (ret)
+                        return ret;
+
+                uaddr += PAGE_SIZE;
+                addr += PAGE_SIZE;
+                usize -= PAGE_SIZE;
+        } while (usize > 0);
+
+        /* Prevent "things" like memory migration? VM_flags need a cleanup... */
+        vma->vm_flags |= VM_RESERVED;
+
+        return ret;
+
+out_einval_locked:
+        read_unlock(&vmlist_lock);
+        return -EINVAL;
+}
+EXPORT_SYMBOL(remap_vmalloc_range);
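Taken together, vmalloc_user() and remap_vmalloc_range() give drivers a supported way to export a kernel buffer to userspace: the former zeroes the memory and marks the area VM_USERMAP, and the latter refuses to map any area not so marked. A hedged sketch of the intended mmap() handler shape (mydev_buf, mydev_mmap and BUF_BYTES are illustrative names, not from this patch):

        static void *mydev_buf; /* set up elsewhere: mydev_buf = vmalloc_user(BUF_BYTES); */

        static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
        {
                /* Maps the whole vma; vm_pgoff selects the starting page
                 * inside the buffer. Fails with -EINVAL unless the area
                 * carries VM_USERMAP and is big enough. */
                return remap_vmalloc_range(vma, mydev_buf, vma->vm_pgoff);
        }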