X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=mm%2Fmemory.c;h=3e0e9ec8ede09ba8bb11b052811350b7c59d3164;hb=6a77f38946aaee1cd85eeec6cf4229b204c15071;hp=ea40537ba2ebdc7358492c4e8ce057b5e162111f;hpb=87fc8d1bb10cd459024a742c6a10961fefcef18f;p=linux-2.6.git diff --git a/mm/memory.c b/mm/memory.c index ea40537ba..3e0e9ec8e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -34,6 +34,8 @@ * * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG * (Gerhard.Wichert@pdb.siemens.de) + * + * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) */ #include @@ -44,6 +46,7 @@ #include #include #include +#include #include #include @@ -74,83 +77,120 @@ unsigned long num_physpages; * and ZONE_HIGHMEM. */ void * high_memory; -struct page *highmem_start_page; unsigned long vmalloc_earlyreserve; EXPORT_SYMBOL(num_physpages); -EXPORT_SYMBOL(highmem_start_page); EXPORT_SYMBOL(high_memory); EXPORT_SYMBOL(vmalloc_earlyreserve); /* - * We special-case the C-O-W ZERO_PAGE, because it's such - * a common occurrence (no need to read the page to know - * that it's zero - better for the cache and memory subsystem). + * Note: this doesn't free the actual pages themselves. That + * has been handled earlier when unmapping all the memory regions. */ -static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address) +static inline void clear_pmd_range(struct mmu_gather *tlb, pmd_t *pmd, unsigned long start, unsigned long end) { - if (from == ZERO_PAGE(address)) { - clear_user_highpage(to, address); + struct page *page; + + if (pmd_none(*pmd)) return; + if (unlikely(pmd_bad(*pmd))) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + return; + } + if (!((start | end) & ~PMD_MASK)) { + /* Only clear full, aligned ranges */ + page = pmd_page(*pmd); + pmd_clear(pmd); + dec_page_state(nr_page_table_pages); + tlb->mm->nr_ptes--; + pte_free_tlb(tlb, page); } - copy_user_highpage(to, from, address); } -/* - * Note: this doesn't free the actual pages themselves. That - * has been handled earlier when unmapping all the memory regions. 
- */ -static inline void free_one_pmd(struct mmu_gather *tlb, pmd_t * dir) +static inline void clear_pud_range(struct mmu_gather *tlb, pud_t *pud, unsigned long start, unsigned long end) { - struct page *page; + unsigned long addr = start, next; + pmd_t *pmd, *__pmd; - if (pmd_none(*dir)) + if (pud_none(*pud)) return; - if (unlikely(pmd_bad(*dir))) { - pmd_ERROR(*dir); - pmd_clear(dir); + if (unlikely(pud_bad(*pud))) { + pud_ERROR(*pud); + pud_clear(pud); return; } - page = pmd_page(*dir); - pmd_clear(dir); - dec_page_state(nr_page_table_pages); - pte_free_tlb(tlb, page); + + pmd = __pmd = pmd_offset(pud, start); + do { + next = (addr + PMD_SIZE) & PMD_MASK; + if (next > end || next <= addr) + next = end; + + clear_pmd_range(tlb, pmd, addr, next); + pmd++; + addr = next; + } while (addr && (addr < end)); + + if (!((start | end) & ~PUD_MASK)) { + /* Only clear full, aligned ranges */ + pud_clear(pud); + pmd_free_tlb(tlb, __pmd); + } } -static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir) + +static inline void clear_pgd_range(struct mmu_gather *tlb, pgd_t *pgd, unsigned long start, unsigned long end) { - int j; - pmd_t * pmd; + unsigned long addr = start, next; + pud_t *pud, *__pud; - if (pgd_none(*dir)) + if (pgd_none(*pgd)) return; - if (unlikely(pgd_bad(*dir))) { - pgd_ERROR(*dir); - pgd_clear(dir); + if (unlikely(pgd_bad(*pgd))) { + pgd_ERROR(*pgd); + pgd_clear(pgd); return; } - pmd = pmd_offset(dir, 0); - pgd_clear(dir); - for (j = 0; j < PTRS_PER_PMD ; j++) - free_one_pmd(tlb, pmd+j); - pmd_free_tlb(tlb, pmd); + + pud = __pud = pud_offset(pgd, start); + do { + next = (addr + PUD_SIZE) & PUD_MASK; + if (next > end || next <= addr) + next = end; + + clear_pud_range(tlb, pud, addr, next); + pud++; + addr = next; + } while (addr && (addr < end)); + + if (!((start | end) & ~PGDIR_MASK)) { + /* Only clear full, aligned ranges */ + pgd_clear(pgd); + pud_free_tlb(tlb, __pud); + } } /* - * This function clears all user-level page tables of a process - this - * is needed by execve(), so that old pages aren't in the way. + * This function clears user-level page tables of a process. * * Must be called with pagetable lock held. */ -void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr) +void clear_page_range(struct mmu_gather *tlb, unsigned long start, unsigned long end) { - pgd_t * page_dir = tlb->mm->pgd; - - page_dir += first; - do { - free_one_pgd(tlb, page_dir); - page_dir++; - } while (--nr); + unsigned long addr = start, next; + pgd_t * pgd = pgd_offset(tlb->mm, start); + unsigned long i; + + for (i = pgd_index(start); i <= pgd_index(end-1); i++) { + next = (addr + PGDIR_SIZE) & PGDIR_MASK; + if (next > end || next <= addr) + next = end; + + clear_pgd_range(tlb, pgd, addr, next); + pgd++; + addr = next; + } } pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) @@ -163,7 +203,6 @@ pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long a spin_lock(&mm->page_table_lock); if (!new) return NULL; - /* * Because we dropped the lock, we should re-check the * entry, as somebody else could have populated it.. 
@@ -172,6 +211,7 @@ pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long a pte_free(new); goto out; } + mm->nr_ptes++; inc_page_state(nr_page_table_pages); pmd_populate(mm, pmd, new); } @@ -203,161 +243,214 @@ pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned lon out: return pte_offset_kernel(pmd, address); } -#define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t)) -#define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t)) /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range * covered by this vma. * - * 08Jan98 Merged into one routine from several inline routines to reduce - * variable count and make things faster. -jj - * * dst->page_table_lock is held on entry and exit, - * but may be dropped within pmd_alloc() and pte_alloc_map(). + * but may be dropped within p[mg]d_alloc() and pte_alloc_map(). */ -int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma) + +static inline void +copy_swap_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t pte) { - pgd_t * src_pgd, * dst_pgd; - unsigned long address = vma->vm_start; - unsigned long end = vma->vm_end; - unsigned long cow; + if (pte_file(pte)) + return; + swap_duplicate(pte_to_swp_entry(pte)); + if (list_empty(&dst_mm->mmlist)) { + spin_lock(&mmlist_lock); + list_add(&dst_mm->mmlist, &src_mm->mmlist); + spin_unlock(&mmlist_lock); + } +} - if (is_vm_hugetlb_page(vma)) - return copy_hugetlb_page_range(dst, src, vma); +static inline void +copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags, + unsigned long addr) +{ + pte_t pte = *src_pte; + struct page *page; + unsigned long pfn; - cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; - src_pgd = pgd_offset(src, address)-1; - dst_pgd = pgd_offset(dst, address)-1; + /* pte contains position in swap, so copy. */ + if (!pte_present(pte)) { + copy_swap_pte(dst_mm, src_mm, pte); + set_pte(dst_pte, pte); + return; + } + pfn = pte_pfn(pte); + /* the pte points outside of valid memory, the + * mapping is assumed to be good, meaningful + * and not mapped via rmap - duplicate the + * mapping as is. 
+ */ + page = NULL; + if (pfn_valid(pfn)) + page = pfn_to_page(pfn); - for (;;) { - pmd_t * src_pmd, * dst_pmd; + if (!page || PageReserved(page)) { + set_pte(dst_pte, pte); + return; + } - src_pgd++; dst_pgd++; - - /* copy_pmd_range */ - - if (pgd_none(*src_pgd)) - goto skip_copy_pmd_range; - if (unlikely(pgd_bad(*src_pgd))) { - pgd_ERROR(*src_pgd); - pgd_clear(src_pgd); -skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK; - if (!address || (address >= end)) - goto out; + /* + * If it's a COW mapping, write protect it both + * in the parent and the child + */ + if ((vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE) { + ptep_set_wrprotect(src_pte); + pte = *src_pte; + } + + /* + * If it's a shared mapping, mark it clean in + * the child + */ + if (vm_flags & VM_SHARED) + pte = pte_mkclean(pte); + pte = pte_mkold(pte); + get_page(page); + // dst_mm->rss++; + vx_rsspages_inc(dst_mm); + if (PageAnon(page)) + // dst_mm->anon_rss++; + vx_anonpages_inc(dst_mm); + set_pte(dst_pte, pte); + page_dup_rmap(page); +} + +static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pte_t *src_pte, *dst_pte; + pte_t *s, *d; + unsigned long vm_flags = vma->vm_flags; + + d = dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr); + if (!dst_pte) + return -ENOMEM; + + spin_lock(&src_mm->page_table_lock); + s = src_pte = pte_offset_map_nested(src_pmd, addr); + for (; addr < end; addr += PAGE_SIZE, s++, d++) { + if (pte_none(*s)) + continue; + copy_one_pte(dst_mm, src_mm, d, s, vm_flags, addr); + } + pte_unmap_nested(src_pte); + pte_unmap(dst_pte); + spin_unlock(&src_mm->page_table_lock); + cond_resched_lock(&dst_mm->page_table_lock); + return 0; +} + +static int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pmd_t *src_pmd, *dst_pmd; + int err = 0; + unsigned long next; + + src_pmd = pmd_offset(src_pud, addr); + dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); + if (!dst_pmd) + return -ENOMEM; + + for (; addr < end; addr = next, src_pmd++, dst_pmd++) { + next = (addr + PMD_SIZE) & PMD_MASK; + if (next > end || next <= addr) + next = end; + if (pmd_none(*src_pmd)) + continue; + if (pmd_bad(*src_pmd)) { + pmd_ERROR(*src_pmd); + pmd_clear(src_pmd); continue; } + err = copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, + vma, addr, next); + if (err) + break; + } + return err; +} - src_pmd = pmd_offset(src_pgd, address); - dst_pmd = pmd_alloc(dst, dst_pgd, address); - if (!dst_pmd) - goto nomem; +static int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pud_t *src_pud, *dst_pud; + int err = 0; + unsigned long next; + + src_pud = pud_offset(src_pgd, addr); + dst_pud = pud_alloc(dst_mm, dst_pgd, addr); + if (!dst_pud) + return -ENOMEM; + + for (; addr < end; addr = next, src_pud++, dst_pud++) { + next = (addr + PUD_SIZE) & PUD_MASK; + if (next > end || next <= addr) + next = end; + if (pud_none(*src_pud)) + continue; + if (pud_bad(*src_pud)) { + pud_ERROR(*src_pud); + pud_clear(src_pud); + continue; + } + err = copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, + vma, addr, next); + if (err) + break; + } + return err; +} - do { - pte_t * src_pte, * dst_pte; - - /* copy_pte_range */ - - if (pmd_none(*src_pmd)) - goto skip_copy_pte_range; - 
if (unlikely(pmd_bad(*src_pmd))) { - pmd_ERROR(*src_pmd); - pmd_clear(src_pmd); -skip_copy_pte_range: - address = (address + PMD_SIZE) & PMD_MASK; - if (address >= end) - goto out; - goto cont_copy_pmd_range; - } +int copy_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma) +{ + pgd_t *src_pgd, *dst_pgd; + unsigned long addr, start, end, next; + int err = 0; - dst_pte = pte_alloc_map(dst, dst_pmd, address); - if (!dst_pte) - goto nomem; - spin_lock(&src->page_table_lock); - src_pte = pte_offset_map_nested(src_pmd, address); - do { - pte_t pte = *src_pte; - struct page *page; - unsigned long pfn; - - if (!vx_rsspages_avail(dst, 1)) { - spin_unlock(&src->page_table_lock); - goto nomem; - } - /* copy_one_pte */ - - if (pte_none(pte)) - goto cont_copy_pte_range_noset; - /* pte contains position in swap, so copy. */ - if (!pte_present(pte)) { - if (!pte_file(pte)) - swap_duplicate(pte_to_swp_entry(pte)); - set_pte(dst_pte, pte); - goto cont_copy_pte_range_noset; - } - pfn = pte_pfn(pte); - /* the pte points outside of valid memory, the - * mapping is assumed to be good, meaningful - * and not mapped via rmap - duplicate the - * mapping as is. - */ - page = NULL; - if (pfn_valid(pfn)) - page = pfn_to_page(pfn); + if (is_vm_hugetlb_page(vma)) + return copy_hugetlb_page_range(dst, src, vma); - if (!page || PageReserved(page)) { - set_pte(dst_pte, pte); - goto cont_copy_pte_range_noset; - } + start = vma->vm_start; + src_pgd = pgd_offset(src, start); + dst_pgd = pgd_offset(dst, start); - /* - * If it's a COW mapping, write protect it both - * in the parent and the child - */ - if (cow) { - ptep_set_wrprotect(src_pte); - pte = *src_pte; - } + end = vma->vm_end; + addr = start; + while (addr && (addr < end-1)) { + next = (addr + PGDIR_SIZE) & PGDIR_MASK; + if (next > end || next <= addr) + next = end; + if (pgd_none(*src_pgd)) + goto next_pgd; + if (pgd_bad(*src_pgd)) { + pgd_ERROR(*src_pgd); + pgd_clear(src_pgd); + goto next_pgd; + } + err = copy_pud_range(dst, src, dst_pgd, src_pgd, + vma, addr, next); + if (err) + break; - /* - * If it's a shared mapping, mark it clean in - * the child - */ - if (vma->vm_flags & VM_SHARED) - pte = pte_mkclean(pte); - pte = pte_mkold(pte); - get_page(page); - // dst->rss++; - vx_rsspages_inc(dst); - set_pte(dst_pte, pte); - page_dup_rmap(page); -cont_copy_pte_range_noset: - address += PAGE_SIZE; - if (address >= end) { - pte_unmap_nested(src_pte); - pte_unmap(dst_pte); - goto out_unlock; - } - src_pte++; - dst_pte++; - } while ((unsigned long)src_pte & PTE_TABLE_MASK); - pte_unmap_nested(src_pte-1); - pte_unmap(dst_pte-1); - spin_unlock(&src->page_table_lock); - cond_resched_lock(&dst->page_table_lock); -cont_copy_pmd_range: - src_pmd++; - dst_pmd++; - } while ((unsigned long)src_pmd & PMD_TABLE_MASK); +next_pgd: + src_pgd++; + dst_pgd++; + addr = next; } -out_unlock: - spin_unlock(&src->page_table_lock); -out: - return 0; -nomem: - return -ENOMEM; + + return err; } static void zap_pte_range(struct mmu_gather *tlb, @@ -421,7 +514,10 @@ static void zap_pte_range(struct mmu_gather *tlb, set_pte(ptep, pgoff_to_pte(page->index)); if (pte_dirty(pte)) set_page_dirty(page); - if (pte_young(pte) && !PageAnon(page)) + if (PageAnon(page)) + // tlb->mm->anon_rss--; + vx_anonpages_dec(tlb->mm); + else if (pte_young(pte)) mark_page_accessed(page); tlb->freed++; page_remove_rmap(page); @@ -442,23 +538,23 @@ static void zap_pte_range(struct mmu_gather *tlb, } static void zap_pmd_range(struct mmu_gather *tlb, - pgd_t * dir, unsigned long address, 
+ pud_t *pud, unsigned long address, unsigned long size, struct zap_details *details) { pmd_t * pmd; unsigned long end; - if (pgd_none(*dir)) + if (pud_none(*pud)) return; - if (unlikely(pgd_bad(*dir))) { - pgd_ERROR(*dir); - pgd_clear(dir); + if (unlikely(pud_bad(*pud))) { + pud_ERROR(*pud); + pud_clear(pud); return; } - pmd = pmd_offset(dir, address); + pmd = pmd_offset(pud, address); end = address + size; - if (end > ((address + PGDIR_SIZE) & PGDIR_MASK)) - end = ((address + PGDIR_SIZE) & PGDIR_MASK); + if (end > ((address + PUD_SIZE) & PUD_MASK)) + end = ((address + PUD_SIZE) & PUD_MASK); do { zap_pte_range(tlb, pmd, address, end - address, details); address = (address + PMD_SIZE) & PMD_MASK; @@ -466,36 +562,54 @@ static void zap_pmd_range(struct mmu_gather *tlb, } while (address && (address < end)); } +static void zap_pud_range(struct mmu_gather *tlb, + pgd_t * pgd, unsigned long address, + unsigned long end, struct zap_details *details) +{ + pud_t * pud; + + if (pgd_none(*pgd)) + return; + if (unlikely(pgd_bad(*pgd))) { + pgd_ERROR(*pgd); + pgd_clear(pgd); + return; + } + pud = pud_offset(pgd, address); + do { + zap_pmd_range(tlb, pud, address, end - address, details); + address = (address + PUD_SIZE) & PUD_MASK; + pud++; + } while (address && (address < end)); +} + static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long address, unsigned long end, struct zap_details *details) { - pgd_t * dir; + unsigned long next; + pgd_t *pgd; + int i; BUG_ON(address >= end); - dir = pgd_offset(vma->vm_mm, address); + pgd = pgd_offset(vma->vm_mm, address); tlb_start_vma(tlb, vma); - do { - zap_pmd_range(tlb, dir, address, end - address, details); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - dir++; - } while (address && (address < end)); + for (i = pgd_index(address); i <= pgd_index(end-1); i++) { + next = (address + PGDIR_SIZE) & PGDIR_MASK; + if (next <= address || next > end) + next = end; + zap_pud_range(tlb, pgd, address, next, details); + address = next; + pgd++; + } tlb_end_vma(tlb, vma); } -/* Dispose of an entire struct mmu_gather per rescheduling point */ -#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) -#define ZAP_BLOCK_SIZE (FREE_PTE_NR * PAGE_SIZE) -#endif - -/* For UP, 256 pages at a time gives nice low latency */ -#if !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) -#define ZAP_BLOCK_SIZE (256 * PAGE_SIZE) -#endif - +#ifdef CONFIG_PREEMPT +# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) +#else /* No preempt: go for improved straight-line efficiency */ -#if !defined(CONFIG_PREEMPT) -#define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) +# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) #endif /** @@ -534,7 +648,8 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, unsigned long tlb_start = 0; /* For tlb_finish_mmu */ int tlb_start_valid = 0; int ret = 0; - int atomic = details && details->atomic; + spinlock_t *i_mmap_lock = details? 
details->i_mmap_lock: NULL; + int fullmm = tlb_is_full_mm(*tlbp); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { unsigned long start; @@ -572,16 +687,29 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, zap_bytes -= block; if ((long)zap_bytes > 0) continue; - if (!atomic && need_resched()) { - int fullmm = tlb_is_full_mm(*tlbp); - tlb_finish_mmu(*tlbp, tlb_start, start); - cond_resched_lock(&mm->page_table_lock); - *tlbp = tlb_gather_mmu(mm, fullmm); - tlb_start_valid = 0; + + tlb_finish_mmu(*tlbp, tlb_start, start); + + if (need_resched() || + need_lockbreak(&mm->page_table_lock) || + (i_mmap_lock && need_lockbreak(i_mmap_lock))) { + if (i_mmap_lock) { + /* must reset count of rss freed */ + *tlbp = tlb_gather_mmu(mm, fullmm); + details->break_addr = start; + goto out; + } + spin_unlock(&mm->page_table_lock); + cond_resched(); + spin_lock(&mm->page_table_lock); } + + *tlbp = tlb_gather_mmu(mm, fullmm); + tlb_start_valid = 0; zap_bytes = ZAP_BLOCK_SIZE; } } +out: return ret; } @@ -610,6 +738,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long address, tlb = tlb_gather_mmu(mm, 0); unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); tlb_finish_mmu(tlb, address, end); + acct_update_integrals(); spin_unlock(&mm->page_table_lock); } @@ -617,10 +746,11 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long address, * Do a quick page-table lookup for a single page. * mm->page_table_lock must be held. */ -struct page * -follow_page(struct mm_struct *mm, unsigned long address, int write) +static struct page * +__follow_page(struct mm_struct *mm, unsigned long address, int read, int write) { pgd_t *pgd; + pud_t *pud; pmd_t *pmd; pte_t *ptep, pte; unsigned long pfn; @@ -634,13 +764,15 @@ follow_page(struct mm_struct *mm, unsigned long address, int write) if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) goto out; - pmd = pmd_offset(pgd, address); - if (pmd_none(*pmd)) + pud = pud_offset(pgd, address); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + goto out; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) goto out; if (pmd_huge(*pmd)) return follow_huge_pmd(mm, address, pmd, write); - if (unlikely(pmd_bad(*pmd))) - goto out; ptep = pte_offset_map(pmd, address); if (!ptep) @@ -651,6 +783,8 @@ follow_page(struct mm_struct *mm, unsigned long address, int write) if (pte_present(pte)) { if (write && !pte_write(pte)) goto out; + if (read && !pte_read(pte)) + goto out; pfn = pte_pfn(pte); if (pfn_valid(pfn)) { page = pfn_to_page(pfn); @@ -665,6 +799,20 @@ out: return NULL; } +struct page * +follow_page(struct mm_struct *mm, unsigned long address, int write) +{ + return __follow_page(mm, address, /*read*/0, write); +} + +int +check_user_page_readable(struct mm_struct *mm, unsigned long address) +{ + return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL; +} + +EXPORT_SYMBOL(check_user_page_readable); + /* * Given a physical address, is there a useful struct page pointing to * it? This may become more complex in the future if we start dealing @@ -684,6 +832,7 @@ untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma, unsigned long address) { pgd_t *pgd; + pud_t *pud; pmd_t *pmd; /* Check if the vma is for an anonymous mapping. 
*/ @@ -695,8 +844,12 @@ untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma, if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) return 1; + pud = pud_offset(pgd, address); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + return 1; + /* Check if page middle directory entry exists. */ - pmd = pmd_offset(pgd, address); + pmd = pmd_offset(pud, address); if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) return 1; @@ -728,23 +881,22 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long pg = start & PAGE_MASK; struct vm_area_struct *gate_vma = get_gate_vma(tsk); pgd_t *pgd; + pud_t *pud; pmd_t *pmd; pte_t *pte; if (write) /* user gate pages are read-only */ return i ? : -EFAULT; - pgd = pgd_offset_gate(mm, pg); - if (!pgd) - return i ? : -EFAULT; - pmd = pmd_offset(pgd, pg); - if (!pmd) - return i ? : -EFAULT; + if (pg > TASK_SIZE) + pgd = pgd_offset_k(pg); + else + pgd = pgd_offset_gate(mm, pg); + BUG_ON(pgd_none(*pgd)); + pud = pud_offset(pgd, pg); + BUG_ON(pud_none(*pud)); + pmd = pmd_offset(pud, pg); + BUG_ON(pmd_none(*pmd)); pte = pte_offset_map(pmd, pg); - if (!pte) - return i ? : -EFAULT; - if (!pte_present(*pte)) { - pte_unmap(pte); - return i ? : -EFAULT; - } + BUG_ON(pte_none(*pte)); if (pages) { pages[i] = pte_page(*pte); get_page(pages[i]); @@ -758,7 +910,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, continue; } - if (!vma || (pages && (vma->vm_flags & VM_IO)) + if (!vma || (vma->vm_flags & VM_IO) || !(flags & vma->vm_flags)) return i ? : -EFAULT; @@ -771,6 +923,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, do { struct page *map; int lookup_write = write; + + cond_resched_lock(&mm->page_table_lock); while (!(map = follow_page(mm, start, lookup_write))) { /* * Shortcut for anonymous pages. 
We don't want @@ -854,16 +1008,16 @@ static void zeromap_pte_range(pte_t * pte, unsigned long address, } while (address && (address < end)); } -static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, - unsigned long size, pgprot_t prot) +static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, + unsigned long address, unsigned long size, pgprot_t prot) { unsigned long base, end; - base = address & PGDIR_MASK; - address &= ~PGDIR_MASK; + base = address & PUD_MASK; + address &= ~PUD_MASK; end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; + if (end > PUD_SIZE) + end = PUD_SIZE; do { pte_t * pte = pte_alloc_map(mm, pmd, base + address); if (!pte) @@ -876,31 +1030,65 @@ static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned return 0; } -int zeromap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, pgprot_t prot) +static inline int zeromap_pud_range(struct mm_struct *mm, pud_t * pud, + unsigned long address, + unsigned long size, pgprot_t prot) { + unsigned long base, end; int error = 0; - pgd_t * dir; + + base = address & PGDIR_MASK; + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + do { + pmd_t * pmd = pmd_alloc(mm, pud, base + address); + error = -ENOMEM; + if (!pmd) + break; + error = zeromap_pmd_range(mm, pmd, base + address, + end - address, prot); + if (error) + break; + address = (address + PUD_SIZE) & PUD_MASK; + pud++; + } while (address && (address < end)); + return 0; +} + +int zeromap_page_range(struct vm_area_struct *vma, unsigned long address, + unsigned long size, pgprot_t prot) +{ + int i; + int error = 0; + pgd_t * pgd; unsigned long beg = address; unsigned long end = address + size; + unsigned long next; struct mm_struct *mm = vma->vm_mm; - dir = pgd_offset(mm, address); + pgd = pgd_offset(mm, address); flush_cache_range(vma, beg, end); - if (address >= end) - BUG(); + BUG_ON(address >= end); + BUG_ON(end > vma->vm_end); spin_lock(&mm->page_table_lock); - do { - pmd_t *pmd = pmd_alloc(mm, dir, address); + for (i = pgd_index(address); i <= pgd_index(end-1); i++) { + pud_t *pud = pud_alloc(mm, pgd, address); error = -ENOMEM; - if (!pmd) + if (!pud) break; - error = zeromap_pmd_range(mm, pmd, address, end - address, prot); + next = (address + PGDIR_SIZE) & PGDIR_MASK; + if (next <= beg || next > end) + next = end; + error = zeromap_pud_range(mm, pud, address, + next - address, prot); if (error) break; - address = (address + PGDIR_SIZE) & PGDIR_MASK; - dir++; - } while (address && (address < end)); + address = next; + pgd++; + } /* * Why flush? zeromap_pte_range has a BUG_ON for !pte_none() */ @@ -914,17 +1102,16 @@ int zeromap_page_range(struct vm_area_struct *vma, unsigned long address, unsign * mappings are removed. 
any references to nonexistent pages results * in null mappings (currently treated as "copy-on-access") */ -static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size, - unsigned long phys_addr, pgprot_t prot) +static inline void +remap_pte_range(pte_t * pte, unsigned long address, unsigned long size, + unsigned long pfn, pgprot_t prot) { unsigned long end; - unsigned long pfn; address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) end = PMD_SIZE; - pfn = phys_addr >> PAGE_SHIFT; do { BUG_ON(!pte_none(*pte)); if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) @@ -935,22 +1122,24 @@ static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned } while (address && (address < end)); } -static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size, - unsigned long phys_addr, pgprot_t prot) +static inline int +remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, + unsigned long size, unsigned long pfn, pgprot_t prot) { unsigned long base, end; - base = address & PGDIR_MASK; - address &= ~PGDIR_MASK; + base = address & PUD_MASK; + address &= ~PUD_MASK; end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - phys_addr -= address; + if (end > PUD_SIZE) + end = PUD_SIZE; + pfn -= (address >> PAGE_SHIFT); do { pte_t * pte = pte_alloc_map(mm, pmd, base + address); if (!pte) return -ENOMEM; - remap_pte_range(pte, base + address, end - address, address + phys_addr, prot); + remap_pte_range(pte, base + address, end - address, + (address >> PAGE_SHIFT) + pfn, prot); pte_unmap(pte); address = (address + PMD_SIZE) & PMD_MASK; pmd++; @@ -958,42 +1147,87 @@ static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned lo return 0; } +static inline int remap_pud_range(struct mm_struct *mm, pud_t * pud, + unsigned long address, unsigned long size, + unsigned long pfn, pgprot_t prot) +{ + unsigned long base, end; + int error; + + base = address & PGDIR_MASK; + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + pfn -= address >> PAGE_SHIFT; + do { + pmd_t *pmd = pmd_alloc(mm, pud, base+address); + error = -ENOMEM; + if (!pmd) + break; + error = remap_pmd_range(mm, pmd, base + address, end - address, + (address >> PAGE_SHIFT) + pfn, prot); + if (error) + break; + address = (address + PUD_SIZE) & PUD_MASK; + pud++; + } while (address && (address < end)); + return error; +} + /* Note: this is only safe if the mm semaphore is held when called. */ -int remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot) +int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, + unsigned long pfn, unsigned long size, pgprot_t prot) { int error = 0; - pgd_t * dir; + pgd_t *pgd; unsigned long beg = from; unsigned long end = from + size; + unsigned long next; struct mm_struct *mm = vma->vm_mm; + int i; - phys_addr -= from; - dir = pgd_offset(mm, from); + pfn -= from >> PAGE_SHIFT; + pgd = pgd_offset(mm, from); flush_cache_range(vma, beg, end); - if (from >= end) - BUG(); + BUG_ON(from >= end); + + /* + * Physically remapped pages are special. Tell the + * rest of the world about it: + * VM_IO tells people not to look at these pages + * (accesses can have side effects). + * VM_RESERVED tells swapout not to try to touch + * this region. 
+ */ + vma->vm_flags |= VM_IO | VM_RESERVED; spin_lock(&mm->page_table_lock); - do { - pmd_t *pmd = pmd_alloc(mm, dir, from); + for (i = pgd_index(beg); i <= pgd_index(end-1); i++) { + pud_t *pud = pud_alloc(mm, pgd, from); error = -ENOMEM; - if (!pmd) + if (!pud) break; - error = remap_pmd_range(mm, pmd, from, end - from, phys_addr + from, prot); + next = (from + PGDIR_SIZE) & PGDIR_MASK; + if (next > end || next <= from) + next = end; + error = remap_pud_range(mm, pud, from, end - from, + pfn + (from >> PAGE_SHIFT), prot); if (error) break; - from = (from + PGDIR_SIZE) & PGDIR_MASK; - dir++; - } while (from && (from < end)); + from = next; + pgd++; + } /* * Why flush? remap_pte_range has a BUG_ON for !pte_none() */ flush_tlb_range(vma, beg, end); spin_unlock(&mm->page_table_lock); + return error; } -EXPORT_SYMBOL(remap_page_range); +EXPORT_SYMBOL(remap_pfn_range); /* * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when @@ -1089,21 +1323,31 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, if (unlikely(anon_vma_prepare(vma))) goto no_new_page; - new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); - if (!new_page) - goto no_new_page; - copy_cow_page(old_page,new_page,address); - + if (old_page == ZERO_PAGE(address)) { + new_page = alloc_zeroed_user_highpage(vma, address); + if (!new_page) + goto no_new_page; + } else { + new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); + if (!new_page) + goto no_new_page; + copy_user_highpage(new_page, old_page, address); + } /* * Re-check the pte - we dropped the lock */ spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); if (likely(pte_same(*page_table, pte))) { - if (PageReserved(old_page)) + if (PageAnon(old_page)) + // mm->anon_rss--; + vx_anonpages_dec(mm); + if (PageReserved(old_page)) { // ++mm->rss; vx_rsspages_inc(mm); - else + acct_update_integrals(); + update_mem_hiwater(); + } else page_remove_rmap(old_page); break_cow(vma, new_page, address, page_table); lru_cache_add_active(new_page); @@ -1124,17 +1368,112 @@ no_new_page: } /* - * Helper function for unmap_mapping_range(). + * Helper functions for unmap_mapping_range(). + * + * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __ + * + * We have to restart searching the prio_tree whenever we drop the lock, + * since the iterator is only valid while the lock is held, and anyway + * a later vma might be split and reinserted earlier while lock dropped. + * + * The list of nonlinear vmas could be handled more efficiently, using + * a placeholder, but handle it in the same way until a need is shown. + * It is important to search the prio_tree before nonlinear list: a vma + * may become nonlinear and be shifted from prio_tree to nonlinear list + * while the lock is dropped; but never shifted from list to prio_tree. + * + * In order to make forward progress despite restarting the search, + * vm_truncate_count is used to mark a vma as now dealt with, so we can + * quickly skip it next time around. Since the prio_tree search only + * shows us those vmas affected by unmapping the range in question, we + * can't efficiently keep all vmas in step with mapping->truncate_count: + * so instead reset them all whenever it wraps back to 0 (then go to 1). + * mapping->truncate_count and vma->vm_truncate_count are protected by + * i_mmap_lock. 
+ * + * In order to make forward progress despite repeatedly restarting some + * large vma, note the break_addr set by unmap_vmas when it breaks out: + * and restart from that address when we reach that vma again. It might + * have been split or merged, shrunk or extended, but never shifted: so + * restart_addr remains valid so long as it remains in the vma's range. + * unmap_mapping_range forces truncate_count to leap over page-aligned + * values so we can save vma's restart_addr in its truncate_count field. */ -static inline void unmap_mapping_range_list(struct prio_tree_root *root, +#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK)) + +static void reset_vma_truncate_counts(struct address_space *mapping) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) + vma->vm_truncate_count = 0; + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + vma->vm_truncate_count = 0; +} + +static int unmap_mapping_range_vma(struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr, + struct zap_details *details) +{ + unsigned long restart_addr; + int need_break; + +again: + restart_addr = vma->vm_truncate_count; + if (is_restart_addr(restart_addr) && start_addr < restart_addr) { + start_addr = restart_addr; + if (start_addr >= end_addr) { + /* Top of vma has been split off since last time */ + vma->vm_truncate_count = details->truncate_count; + return 0; + } + } + + details->break_addr = end_addr; + zap_page_range(vma, start_addr, end_addr - start_addr, details); + + /* + * We cannot rely on the break test in unmap_vmas: + * on the one hand, we don't want to restart our loop + * just because that broke out for the page_table_lock; + * on the other hand, it does no test when vma is small. + */ + need_break = need_resched() || + need_lockbreak(details->i_mmap_lock); + + if (details->break_addr >= end_addr) { + /* We have now completed this vma: mark it so */ + vma->vm_truncate_count = details->truncate_count; + if (!need_break) + return 0; + } else { + /* Note restart_addr in vma's truncate_count field */ + vma->vm_truncate_count = details->break_addr; + if (!need_break) + goto again; + } + + spin_unlock(details->i_mmap_lock); + cond_resched(); + spin_lock(details->i_mmap_lock); + return -EINTR; +} + +static inline void unmap_mapping_range_tree(struct prio_tree_root *root, struct zap_details *details) { struct vm_area_struct *vma; struct prio_tree_iter iter; pgoff_t vba, vea, zba, zea; +restart: vma_prio_tree_foreach(vma, &iter, root, details->first_index, details->last_index) { + /* Skip quickly over those we have already dealt with */ + if (vma->vm_truncate_count == details->truncate_count) + continue; + vba = vma->vm_pgoff; vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ @@ -1144,9 +1483,35 @@ static inline void unmap_mapping_range_list(struct prio_tree_root *root, zea = details->last_index; if (zea > vea) zea = vea; - zap_page_range(vma, + + if (unmap_mapping_range_vma(vma, ((zba - vba) << PAGE_SHIFT) + vma->vm_start, - (zea - zba + 1) << PAGE_SHIFT, details); + ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, + details) < 0) + goto restart; + } +} + +static inline void unmap_mapping_range_list(struct list_head *head, + struct zap_details *details) +{ + struct vm_area_struct *vma; + + /* + * In nonlinear VMAs there is no correspondence between virtual address + * offset and file offset. 
So we must perform an exhaustive search + * across *all* the pages in each nonlinear VMA, not just the pages + * whose virtual address lies outside the file truncation point. + */ +restart: + list_for_each_entry(vma, head, shared.vm_set.list) { + /* Skip quickly over those we have already dealt with */ + if (vma->vm_truncate_count == details->truncate_count) + continue; + details->nonlinear_vma = vma; + if (unmap_mapping_range_vma(vma, vma->vm_start, + vma->vm_end, details) < 0) + goto restart; } } @@ -1185,32 +1550,34 @@ void unmap_mapping_range(struct address_space *mapping, details.nonlinear_vma = NULL; details.first_index = hba; details.last_index = hba + hlen - 1; - details.atomic = 1; /* A spinlock is held */ if (details.last_index < details.first_index) details.last_index = ULONG_MAX; + details.i_mmap_lock = &mapping->i_mmap_lock; spin_lock(&mapping->i_mmap_lock); - /* Protect against page fault */ - atomic_inc(&mapping->truncate_count); - - if (unlikely(!prio_tree_empty(&mapping->i_mmap))) - unmap_mapping_range_list(&mapping->i_mmap, &details); + /* serialize i_size write against truncate_count write */ + smp_wmb(); + /* Protect against page faults, and endless unmapping loops */ + mapping->truncate_count++; /* - * In nonlinear VMAs there is no correspondence between virtual address - * offset and file offset. So we must perform an exhaustive search - * across *all* the pages in each nonlinear VMA, not just the pages - * whose virtual address lies outside the file truncation point. + * For archs where spin_lock has inclusive semantics like ia64 + * this smp_mb() will prevent to read pagetable contents + * before the truncate_count increment is visible to + * other cpus. */ - if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) { - struct vm_area_struct *vma; - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.vm_set.list) { - details.nonlinear_vma = vma; - zap_page_range(vma, vma->vm_start, - vma->vm_end - vma->vm_start, &details); - } + smp_mb(); + if (unlikely(is_restart_addr(mapping->truncate_count))) { + if (mapping->truncate_count == 0) + reset_vma_truncate_counts(mapping); + mapping->truncate_count++; } + details.truncate_count = mapping->truncate_count; + + if (unlikely(!prio_tree_empty(&mapping->i_mmap))) + unmap_mapping_range_tree(&mapping->i_mmap, &details); + if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) + unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); spin_unlock(&mapping->i_mmap_lock); } EXPORT_SYMBOL(unmap_mapping_range); @@ -1242,7 +1609,7 @@ int vmtruncate(struct inode * inode, loff_t offset) goto out_truncate; do_expand: - limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; if (limit != RLIM_INFINITY && offset > limit) goto out_sig; if (offset > inode->i_sb->s_maxbytes) @@ -1390,6 +1757,9 @@ static int do_swap_page(struct mm_struct * mm, // mm->rss++; vx_rsspages_inc(mm); + acct_update_integrals(); + update_mem_hiwater(); + pte = mk_pte(page, vma->vm_page_prot); if (write_access && can_share_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -1438,15 +1808,13 @@ do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap(page_table); spin_unlock(&mm->page_table_lock); - if (unlikely(anon_vma_prepare(vma))) - goto no_mem; if (!vx_rsspages_avail(mm, 1)) goto no_mem; - - page = alloc_page_vma(GFP_HIGHUSER, vma, addr); + if (unlikely(anon_vma_prepare(vma))) + goto no_mem; + page = alloc_zeroed_user_highpage(vma, addr); if (!page) goto no_mem; - 
clear_user_highpage(page, addr); spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, addr); @@ -1459,11 +1827,13 @@ do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, } // mm->rss++; vx_rsspages_inc(mm); + acct_update_integrals(); + update_mem_hiwater(); entry = maybe_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)), vma); lru_cache_add_active(page); - mark_page_accessed(page); + SetPageReferenced(page); page_add_anon_rmap(page, vma, addr); } @@ -1498,7 +1868,7 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page * new_page; struct address_space *mapping = NULL; pte_t entry; - int sequence = 0; + unsigned int sequence = 0; int ret = VM_FAULT_MINOR; int anon = 0; @@ -1510,19 +1880,28 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, if (vma->vm_file) { mapping = vma->vm_file->f_mapping; - sequence = atomic_read(&mapping->truncate_count); + sequence = mapping->truncate_count; + smp_rmb(); /* serializes i_size against truncate_count */ } - smp_rmb(); /* Prevent CPU from reordering lock-free ->nopage() */ retry: + cond_resched(); + /* FIXME: is that check useful here? */ + if (!vx_rsspages_avail(mm, 1)) + return VM_FAULT_OOM; new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); + /* + * No smp_rmb is needed here as long as there's a full + * spin_lock/unlock sequence inside the ->nopage callback + * (for the pagecache lookup) that acts as an implicit + * smp_mb() and prevents the i_size read to happen + * after the next truncate_count read. + */ /* no page was available -- either SIGBUS or OOM */ if (new_page == NOPAGE_SIGBUS) return VM_FAULT_SIGBUS; if (new_page == NOPAGE_OOM) return VM_FAULT_OOM; - if (!vx_rsspages_avail(mm, 1)) - return VM_FAULT_OOM; /* * Should we do an early C-O-W break? @@ -1547,9 +1926,8 @@ retry: * invalidated this page. If unmap_mapping_range got called, * retry getting the page. */ - if (mapping && - (unlikely(sequence != atomic_read(&mapping->truncate_count)))) { - sequence = atomic_read(&mapping->truncate_count); + if (mapping && unlikely(sequence != mapping->truncate_count)) { + sequence = mapping->truncate_count; spin_unlock(&mm->page_table_lock); page_cache_release(new_page); goto retry; @@ -1571,6 +1949,9 @@ retry: if (!PageReserved(new_page)) // ++mm->rss; vx_rsspages_inc(mm); + acct_update_integrals(); + update_mem_hiwater(); + flush_icache_page(vma, new_page); entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) @@ -1695,13 +2076,14 @@ static inline int handle_pte_fault(struct mm_struct *mm, * By the time we get here, we already hold the mm semaphore */ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, - unsigned long address, int write_access) + unsigned long address, int write_access) { pgd_t *pgd; + pud_t *pud; pmd_t *pmd; + pte_t *pte; __set_current_state(TASK_RUNNING); - pgd = pgd_offset(mm, address); inc_page_state(pgfault); @@ -1712,18 +2094,61 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, * We need the page table lock to synchronize with kswapd * and the SMP-safe atomic PTE updates. 
*/ + pgd = pgd_offset(mm, address); spin_lock(&mm->page_table_lock); - pmd = pmd_alloc(mm, pgd, address); - if (pmd) { - pte_t * pte = pte_alloc_map(mm, pmd, address); - if (pte) - return handle_pte_fault(mm, vma, address, write_access, pte, pmd); - } + pud = pud_alloc(mm, pgd, address); + if (!pud) + goto oom; + + pmd = pmd_alloc(mm, pud, address); + if (!pmd) + goto oom; + + pte = pte_alloc_map(mm, pmd, address); + if (!pte) + goto oom; + + return handle_pte_fault(mm, vma, address, write_access, pte, pmd); + + oom: spin_unlock(&mm->page_table_lock); return VM_FAULT_OOM; } +#ifndef __ARCH_HAS_4LEVEL_HACK +/* + * Allocate page upper directory. + * + * We've already handled the fast-path in-line, and we own the + * page table lock. + * + * On a two-level or three-level page table, this ends up actually being + * entirely optimized away. + */ +pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + pud_t *new; + + spin_unlock(&mm->page_table_lock); + new = pud_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (pgd_present(*pgd)) { + pud_free(new); + goto out; + } + pgd_populate(mm, pgd, new); + out: + return pud_offset(pgd, address); +} + /* * Allocate page middle directory. * @@ -1733,7 +2158,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, * On a two-level page table, this ends up actually being entirely * optimized away. */ -pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { pmd_t *new; @@ -1747,14 +2172,38 @@ pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr * Because we dropped the lock, we should re-check the * entry, as somebody else could have populated it.. */ - if (pgd_present(*pgd)) { + if (pud_present(*pud)) { pmd_free(new); goto out; } - pgd_populate(mm, pgd, new); + pud_populate(mm, pud, new); + out: + return pmd_offset(pud, address); +} +#else +pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) +{ + pmd_t *new; + + spin_unlock(&mm->page_table_lock); + new = pmd_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. 
+ */ + if (pgd_present(*pud)) { + pmd_free(new); + goto out; + } + pgd_populate(mm, pud, new); out: - return pmd_offset(pgd, address); + return pmd_offset(pud, address); } +#endif int make_pages_present(unsigned long addr, unsigned long end) { @@ -1785,19 +2234,21 @@ struct page * vmalloc_to_page(void * vmalloc_addr) unsigned long addr = (unsigned long) vmalloc_addr; struct page *page = NULL; pgd_t *pgd = pgd_offset_k(addr); + pud_t *pud; pmd_t *pmd; pte_t *ptep, pte; if (!pgd_none(*pgd)) { - pmd = pmd_offset(pgd, addr); - if (!pmd_none(*pmd)) { - preempt_disable(); - ptep = pte_offset_map(pmd, addr); - pte = *ptep; - if (pte_present(pte)) - page = pte_page(pte); - pte_unmap(ptep); - preempt_enable(); + pud = pud_offset(pgd, addr); + if (!pud_none(*pud)) { + pmd = pmd_offset(pud, addr); + if (!pmd_none(*pmd)) { + ptep = pte_offset_map(pmd, addr); + pte = *ptep; + if (pte_present(pte)) + page = pte_page(pte); + pte_unmap(ptep); + } } } return page; @@ -1805,7 +2256,33 @@ struct page * vmalloc_to_page(void * vmalloc_addr) EXPORT_SYMBOL(vmalloc_to_page); -#if !defined(CONFIG_ARCH_GATE_AREA) +/* + * Map a vmalloc()-space virtual address to the physical page frame number. + */ +unsigned long vmalloc_to_pfn(void * vmalloc_addr) +{ + return page_to_pfn(vmalloc_to_page(vmalloc_addr)); +} + +EXPORT_SYMBOL(vmalloc_to_pfn); + +/* + * update_mem_hiwater + * - update per process rss and vm high water data + */ +void update_mem_hiwater(void) +{ + struct task_struct *tsk = current; + + if (tsk->mm) { + if (tsk->mm->hiwater_rss < tsk->mm->rss) + tsk->mm->hiwater_rss = tsk->mm->rss; + if (tsk->mm->hiwater_vm < tsk->mm->total_vm) + tsk->mm->hiwater_vm = tsk->mm->total_vm; + } +} + +#if !defined(__HAVE_ARCH_GATE_AREA) #if defined(AT_SYSINFO_EHDR) struct vm_area_struct gate_vma; @@ -1831,7 +2308,7 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk) #endif } -int in_gate_area(struct task_struct *task, unsigned long addr) +int in_gate_area_no_task(unsigned long addr) { #ifdef AT_SYSINFO_EHDR if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) @@ -1840,4 +2317,4 @@ int in_gate_area(struct task_struct *task, unsigned long addr) return 0; } -#endif +#endif /* __HAVE_ARCH_GATE_AREA */