X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=mm%2Fmemory.c;h=5d07c87792aea2d8b57f8ec5a81a3892a425fb28;hb=8e8ece46a861c84343256819eaec77e608ff9217;hp=576d4c97e928b4e093e7231f6ca1f0fbca33e764;hpb=a8e794ca871505c8ea96cc102f4ad555c5231d7f;p=linux-2.6.git diff --git a/mm/memory.c b/mm/memory.c index 576d4c97e..5d07c8779 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -34,6 +34,8 @@ * * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG * (Gerhard.Wichert@pdb.siemens.de) + * + * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) */ #include @@ -44,11 +46,11 @@ #include #include #include +#include #include #include #include -#include #include #include #include @@ -67,82 +69,128 @@ EXPORT_SYMBOL(mem_map); #endif unsigned long num_physpages; +/* + * A number of key systems in x86 including ioremap() rely on the assumption + * that high_memory defines the upper bound on direct map memory, then end + * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and + * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL + * and ZONE_HIGHMEM. + */ void * high_memory; -struct page *highmem_start_page; +unsigned long vmalloc_earlyreserve; EXPORT_SYMBOL(num_physpages); -EXPORT_SYMBOL(highmem_start_page); EXPORT_SYMBOL(high_memory); +EXPORT_SYMBOL(vmalloc_earlyreserve); /* - * We special-case the C-O-W ZERO_PAGE, because it's such - * a common occurrence (no need to read the page to know - * that it's zero - better for the cache and memory subsystem). + * Note: this doesn't free the actual pages themselves. That + * has been handled earlier when unmapping all the memory regions. */ -static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address) +static inline void clear_pmd_range(struct mmu_gather *tlb, pmd_t *pmd, unsigned long start, unsigned long end) { - if (from == ZERO_PAGE(address)) { - clear_user_highpage(to, address); + struct page *page; + + if (pmd_none(*pmd)) + return; + if (unlikely(pmd_bad(*pmd))) { + pmd_ERROR(*pmd); + pmd_clear(pmd); return; } - copy_user_highpage(to, from, address); + if (!((start | end) & ~PMD_MASK)) { + /* Only clear full, aligned ranges */ + page = pmd_page(*pmd); + pmd_clear(pmd); + dec_page_state(nr_page_table_pages); + tlb->mm->nr_ptes--; + pte_free_tlb(tlb, page); + } } -/* - * Note: this doesn't free the actual pages themselves. That - * has been handled earlier when unmapping all the memory regions. 
- */ -static inline void free_one_pmd(struct mmu_gather *tlb, pmd_t * dir) +static inline void clear_pud_range(struct mmu_gather *tlb, pud_t *pud, unsigned long start, unsigned long end) { - struct page *page; + unsigned long addr = start, next; + pmd_t *pmd, *__pmd; - if (pmd_none(*dir)) + if (pud_none(*pud)) return; - if (pmd_bad(*dir)) { - pmd_ERROR(*dir); - pmd_clear(dir); + if (unlikely(pud_bad(*pud))) { + pud_ERROR(*pud); + pud_clear(pud); return; } - page = pmd_page(*dir); - pmd_clear(dir); - pgtable_remove_rmap(page); - pte_free_tlb(tlb, page); + + pmd = __pmd = pmd_offset(pud, start); + do { + next = (addr + PMD_SIZE) & PMD_MASK; + if (next > end || next <= addr) + next = end; + + clear_pmd_range(tlb, pmd, addr, next); + pmd++; + addr = next; + } while (addr && (addr < end)); + + if (!((start | end) & ~PUD_MASK)) { + /* Only clear full, aligned ranges */ + pud_clear(pud); + pmd_free_tlb(tlb, __pmd); + } } -static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir) + +static inline void clear_pgd_range(struct mmu_gather *tlb, pgd_t *pgd, unsigned long start, unsigned long end) { - int j; - pmd_t * pmd; + unsigned long addr = start, next; + pud_t *pud, *__pud; - if (pgd_none(*dir)) + if (pgd_none(*pgd)) return; - if (pgd_bad(*dir)) { - pgd_ERROR(*dir); - pgd_clear(dir); + if (unlikely(pgd_bad(*pgd))) { + pgd_ERROR(*pgd); + pgd_clear(pgd); return; } - pmd = pmd_offset(dir, 0); - pgd_clear(dir); - for (j = 0; j < PTRS_PER_PMD ; j++) - free_one_pmd(tlb, pmd+j); - pmd_free_tlb(tlb, pmd); + + pud = __pud = pud_offset(pgd, start); + do { + next = (addr + PUD_SIZE) & PUD_MASK; + if (next > end || next <= addr) + next = end; + + clear_pud_range(tlb, pud, addr, next); + pud++; + addr = next; + } while (addr && (addr < end)); + + if (!((start | end) & ~PGDIR_MASK)) { + /* Only clear full, aligned ranges */ + pgd_clear(pgd); + pud_free_tlb(tlb, __pud); + } } /* - * This function clears all user-level page tables of a process - this - * is needed by execve(), so that old pages aren't in the way. + * This function clears user-level page tables of a process. * * Must be called with pagetable lock held. */ -void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr) +void clear_page_range(struct mmu_gather *tlb, unsigned long start, unsigned long end) { - pgd_t * page_dir = tlb->mm->pgd; - - page_dir += first; - do { - free_one_pgd(tlb, page_dir); - page_dir++; - } while (--nr); + unsigned long addr = start, next; + pgd_t * pgd = pgd_offset(tlb->mm, start); + unsigned long i; + + for (i = pgd_index(start); i <= pgd_index(end-1); i++) { + next = (addr + PGDIR_SIZE) & PGDIR_MASK; + if (next > end || next <= addr) + next = end; + + clear_pgd_range(tlb, pgd, addr, next); + pgd++; + addr = next; + } } pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) @@ -155,7 +203,6 @@ pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long a spin_lock(&mm->page_table_lock); if (!new) return NULL; - /* * Because we dropped the lock, we should re-check the * entry, as somebody else could have populated it.. 
@@ -164,7 +211,8 @@ pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long a pte_free(new); goto out; } - pgtable_add_rmap(new, mm, address); + mm->nr_ptes++; + inc_page_state(nr_page_table_pages); pmd_populate(mm, pmd, new); } out: @@ -190,203 +238,217 @@ pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned lon pte_free_kernel(new); goto out; } - pgtable_add_rmap(virt_to_page(new), mm, address); pmd_populate_kernel(mm, pmd, new); } out: return pte_offset_kernel(pmd, address); } -#define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t)) -#define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t)) /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range * covered by this vma. * - * 08Jan98 Merged into one routine from several inline routines to reduce - * variable count and make things faster. -jj - * * dst->page_table_lock is held on entry and exit, - * but may be dropped within pmd_alloc() and pte_alloc_map(). + * but may be dropped within p[mg]d_alloc() and pte_alloc_map(). */ -int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma) + +static inline void +copy_swap_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t pte) { - pgd_t * src_pgd, * dst_pgd; - unsigned long address = vma->vm_start; - unsigned long end = vma->vm_end; - unsigned long cow; - struct pte_chain *pte_chain = NULL; + if (pte_file(pte)) + return; + swap_duplicate(pte_to_swp_entry(pte)); + if (list_empty(&dst_mm->mmlist)) { + spin_lock(&mmlist_lock); + list_add(&dst_mm->mmlist, &src_mm->mmlist); + spin_unlock(&mmlist_lock); + } +} - if (is_vm_hugetlb_page(vma)) - return copy_hugetlb_page_range(dst, src, vma); +static inline void +copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags, + unsigned long addr) +{ + pte_t pte = *src_pte; + struct page *page; + unsigned long pfn; - pte_chain = pte_chain_alloc(GFP_ATOMIC | __GFP_NOWARN); - if (!pte_chain) { - spin_unlock(&dst->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - spin_lock(&dst->page_table_lock); - if (!pte_chain) - goto nomem; + /* pte contains position in swap, so copy. */ + if (!pte_present(pte)) { + copy_swap_pte(dst_mm, src_mm, pte); + set_pte(dst_pte, pte); + return; + } + pfn = pte_pfn(pte); + /* the pte points outside of valid memory, the + * mapping is assumed to be good, meaningful + * and not mapped via rmap - duplicate the + * mapping as is. 
+ */ + page = NULL; + if (pfn_valid(pfn)) + page = pfn_to_page(pfn); + + if (!page || PageReserved(page)) { + set_pte(dst_pte, pte); + return; } - - cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; - src_pgd = pgd_offset(src, address)-1; - dst_pgd = pgd_offset(dst, address)-1; - for (;;) { - pmd_t * src_pmd, * dst_pmd; + /* + * If it's a COW mapping, write protect it both + * in the parent and the child + */ + if ((vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE) { + ptep_set_wrprotect(src_pte); + pte = *src_pte; + } - src_pgd++; dst_pgd++; - - /* copy_pmd_range */ - - if (pgd_none(*src_pgd)) - goto skip_copy_pmd_range; - if (pgd_bad(*src_pgd)) { - pgd_ERROR(*src_pgd); - pgd_clear(src_pgd); -skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK; - if (!address || (address >= end)) - goto out; + /* + * If it's a shared mapping, mark it clean in + * the child + */ + if (vm_flags & VM_SHARED) + pte = pte_mkclean(pte); + pte = pte_mkold(pte); + get_page(page); + vx_rsspages_inc(dst_mm); + if (PageAnon(page)) + vx_anonpages_inc(dst_mm); + set_pte(dst_pte, pte); + page_dup_rmap(page); +} + +static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pte_t *src_pte, *dst_pte; + pte_t *s, *d; + unsigned long vm_flags = vma->vm_flags; + + d = dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr); + if (!dst_pte) + return -ENOMEM; + + spin_lock(&src_mm->page_table_lock); + s = src_pte = pte_offset_map_nested(src_pmd, addr); + for (; addr < end; addr += PAGE_SIZE, s++, d++) { + if (pte_none(*s)) continue; - } + copy_one_pte(dst_mm, src_mm, d, s, vm_flags, addr); + } + pte_unmap_nested(src_pte); + pte_unmap(dst_pte); + spin_unlock(&src_mm->page_table_lock); + cond_resched_lock(&dst_mm->page_table_lock); + return 0; +} - src_pmd = pmd_offset(src_pgd, address); - dst_pmd = pmd_alloc(dst, dst_pgd, address); - if (!dst_pmd) - goto nomem; +static int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pmd_t *src_pmd, *dst_pmd; + int err = 0; + unsigned long next; + + src_pmd = pmd_offset(src_pud, addr); + dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); + if (!dst_pmd) + return -ENOMEM; + + for (; addr < end; addr = next, src_pmd++, dst_pmd++) { + next = (addr + PMD_SIZE) & PMD_MASK; + if (next > end || next <= addr) + next = end; + if (pmd_none(*src_pmd)) + continue; + if (pmd_bad(*src_pmd)) { + pmd_ERROR(*src_pmd); + pmd_clear(src_pmd); + continue; + } + err = copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, + vma, addr, next); + if (err) + break; + } + return err; +} - do { - pte_t * src_pte, * dst_pte; - - /* copy_pte_range */ - - if (pmd_none(*src_pmd)) - goto skip_copy_pte_range; - if (pmd_bad(*src_pmd)) { - pmd_ERROR(*src_pmd); - pmd_clear(src_pmd); -skip_copy_pte_range: - address = (address + PMD_SIZE) & PMD_MASK; - if (address >= end) - goto out; - goto cont_copy_pmd_range; - } +static int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pud_t *src_pud, *dst_pud; + int err = 0; + unsigned long next; + + src_pud = pud_offset(src_pgd, addr); + dst_pud = pud_alloc(dst_mm, dst_pgd, addr); + if (!dst_pud) + return -ENOMEM; + + for (; addr < end; addr = next, src_pud++, dst_pud++) { + next = (addr + PUD_SIZE) & 
PUD_MASK; + if (next > end || next <= addr) + next = end; + if (pud_none(*src_pud)) + continue; + if (pud_bad(*src_pud)) { + pud_ERROR(*src_pud); + pud_clear(src_pud); + continue; + } + err = copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, + vma, addr, next); + if (err) + break; + } + return err; +} - dst_pte = pte_alloc_map(dst, dst_pmd, address); - if (!dst_pte) - goto nomem; - spin_lock(&src->page_table_lock); - src_pte = pte_offset_map_nested(src_pmd, address); - do { - pte_t pte = *src_pte; - struct page *page; - unsigned long pfn; - - if (!vx_rsspages_avail(dst, 1)) { - spin_unlock(&src->page_table_lock); - goto nomem; - } - /* copy_one_pte */ - - if (pte_none(pte)) - goto cont_copy_pte_range_noset; - /* pte contains position in swap, so copy. */ - if (!pte_present(pte)) { - if (!pte_file(pte)) - swap_duplicate(pte_to_swp_entry(pte)); - set_pte(dst_pte, pte); - goto cont_copy_pte_range_noset; - } - pfn = pte_pfn(pte); - /* the pte points outside of valid memory, the - * mapping is assumed to be good, meaningful - * and not mapped via rmap - duplicate the - * mapping as is. - */ - page = NULL; - if (pfn_valid(pfn)) - page = pfn_to_page(pfn); +int copy_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma) +{ + pgd_t *src_pgd, *dst_pgd; + unsigned long addr, start, end, next; + int err = 0; - if (!page || PageReserved(page)) { - set_pte(dst_pte, pte); - goto cont_copy_pte_range_noset; - } + if (is_vm_hugetlb_page(vma)) + return copy_hugetlb_page_range(dst, src, vma); - /* - * If it's a COW mapping, write protect it both - * in the parent and the child - */ - if (cow) { - ptep_set_wrprotect(src_pte); - pte = *src_pte; - } + start = vma->vm_start; + src_pgd = pgd_offset(src, start); + dst_pgd = pgd_offset(dst, start); - /* - * If it's a shared mapping, mark it clean in - * the child - */ - if (vma->vm_flags & VM_SHARED) - pte = pte_mkclean(pte); - pte = pte_mkold(pte); - get_page(page); - // dst->rss++; - vx_rsspages_inc(dst); - - set_pte(dst_pte, pte); - pte_chain = page_add_rmap(page, dst_pte, - pte_chain); - if (pte_chain) - goto cont_copy_pte_range_noset; - pte_chain = pte_chain_alloc(GFP_ATOMIC | __GFP_NOWARN); - if (pte_chain) - goto cont_copy_pte_range_noset; + end = vma->vm_end; + addr = start; + while (addr && (addr < end-1)) { + next = (addr + PGDIR_SIZE) & PGDIR_MASK; + if (next > end || next <= addr) + next = end; + if (pgd_none(*src_pgd)) + goto next_pgd; + if (pgd_bad(*src_pgd)) { + pgd_ERROR(*src_pgd); + pgd_clear(src_pgd); + goto next_pgd; + } + err = copy_pud_range(dst, src, dst_pgd, src_pgd, + vma, addr, next); + if (err) + break; - /* - * pte_chain allocation failed, and we need to - * run page reclaim. 
- */ - pte_unmap_nested(src_pte); - pte_unmap(dst_pte); - spin_unlock(&src->page_table_lock); - spin_unlock(&dst->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - spin_lock(&dst->page_table_lock); - if (!pte_chain) - goto nomem; - spin_lock(&src->page_table_lock); - dst_pte = pte_offset_map(dst_pmd, address); - src_pte = pte_offset_map_nested(src_pmd, - address); -cont_copy_pte_range_noset: - address += PAGE_SIZE; - if (address >= end) { - pte_unmap_nested(src_pte); - pte_unmap(dst_pte); - goto out_unlock; - } - src_pte++; - dst_pte++; - } while ((unsigned long)src_pte & PTE_TABLE_MASK); - pte_unmap_nested(src_pte-1); - pte_unmap(dst_pte-1); - spin_unlock(&src->page_table_lock); - -cont_copy_pmd_range: - src_pmd++; - dst_pmd++; - } while ((unsigned long)src_pmd & PMD_TABLE_MASK); +next_pgd: + src_pgd++; + dst_pgd++; + addr = next; } -out_unlock: - spin_unlock(&src->page_table_lock); -out: - pte_chain_free(pte_chain); - return 0; -nomem: - pte_chain_free(pte_chain); - return -ENOMEM; + + return err; } static void zap_pte_range(struct mmu_gather *tlb, @@ -398,7 +460,7 @@ static void zap_pte_range(struct mmu_gather *tlb, if (pmd_none(*pmd)) return; - if (pmd_bad(*pmd)) { + if (unlikely(pmd_bad(*pmd))) { pmd_ERROR(*pmd); pmd_clear(pmd); return; @@ -408,6 +470,8 @@ static void zap_pte_range(struct mmu_gather *tlb, if (offset + size > PMD_SIZE) size = PMD_SIZE - offset; size &= PAGE_MASK; + if (details && !details->check_mapping && !details->nonlinear_vma) + details = NULL; for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) { pte_t pte = *ptep; if (pte_none(pte)) @@ -448,10 +512,12 @@ static void zap_pte_range(struct mmu_gather *tlb, set_pte(ptep, pgoff_to_pte(page->index)); if (pte_dirty(pte)) set_page_dirty(page); - if (pte_young(pte) && page_mapping(page)) + if (PageAnon(page)) + vx_anonpages_dec(tlb->mm); + else if (pte_young(pte)) mark_page_accessed(page); tlb->freed++; - page_remove_rmap(page, ptep); + page_remove_rmap(page); tlb_remove_page(tlb, page); continue; } @@ -469,60 +535,78 @@ static void zap_pte_range(struct mmu_gather *tlb, } static void zap_pmd_range(struct mmu_gather *tlb, - pgd_t * dir, unsigned long address, + pud_t *pud, unsigned long address, unsigned long size, struct zap_details *details) { pmd_t * pmd; unsigned long end; - if (pgd_none(*dir)) + if (pud_none(*pud)) return; - if (pgd_bad(*dir)) { - pgd_ERROR(*dir); - pgd_clear(dir); + if (unlikely(pud_bad(*pud))) { + pud_ERROR(*pud); + pud_clear(pud); return; } - pmd = pmd_offset(dir, address); + pmd = pmd_offset(pud, address); end = address + size; - if (end > ((address + PGDIR_SIZE) & PGDIR_MASK)) - end = ((address + PGDIR_SIZE) & PGDIR_MASK); + if (end > ((address + PUD_SIZE) & PUD_MASK)) + end = ((address + PUD_SIZE) & PUD_MASK); do { zap_pte_range(tlb, pmd, address, end - address, details); address = (address + PMD_SIZE) & PMD_MASK; pmd++; - } while (address < end); + } while (address && (address < end)); +} + +static void zap_pud_range(struct mmu_gather *tlb, + pgd_t * pgd, unsigned long address, + unsigned long end, struct zap_details *details) +{ + pud_t * pud; + + if (pgd_none(*pgd)) + return; + if (unlikely(pgd_bad(*pgd))) { + pgd_ERROR(*pgd); + pgd_clear(pgd); + return; + } + pud = pud_offset(pgd, address); + do { + zap_pmd_range(tlb, pud, address, end - address, details); + address = (address + PUD_SIZE) & PUD_MASK; + pud++; + } while (address && (address < end)); } static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long address, unsigned long end, 
struct zap_details *details) { - pgd_t * dir; + unsigned long next; + pgd_t *pgd; + int i; BUG_ON(address >= end); - dir = pgd_offset(vma->vm_mm, address); + pgd = pgd_offset(vma->vm_mm, address); tlb_start_vma(tlb, vma); - do { - zap_pmd_range(tlb, dir, address, end - address, details); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - dir++; - } while (address && (address < end)); + for (i = pgd_index(address); i <= pgd_index(end-1); i++) { + next = (address + PGDIR_SIZE) & PGDIR_MASK; + if (next <= address || next > end) + next = end; + zap_pud_range(tlb, pgd, address, next, details); + address = next; + pgd++; + } tlb_end_vma(tlb, vma); } -/* Dispose of an entire struct mmu_gather per rescheduling point */ -#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) -#define ZAP_BLOCK_SIZE (FREE_PTE_NR * PAGE_SIZE) -#endif - -/* For UP, 256 pages at a time gives nice low latency */ -#if !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) -#define ZAP_BLOCK_SIZE (256 * PAGE_SIZE) -#endif - +#ifdef CONFIG_PREEMPT +# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) +#else /* No preempt: go for improved straight-line efficiency */ -#if !defined(CONFIG_PREEMPT) -#define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) +# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) #endif /** @@ -561,6 +645,8 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, unsigned long tlb_start = 0; /* For tlb_finish_mmu */ int tlb_start_valid = 0; int ret = 0; + spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; + int fullmm = tlb_is_full_mm(*tlbp); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { unsigned long start; @@ -598,16 +684,29 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, zap_bytes -= block; if ((long)zap_bytes > 0) continue; - if (need_resched()) { - int fullmm = tlb_is_full_mm(*tlbp); - tlb_finish_mmu(*tlbp, tlb_start, start); - cond_resched_lock(&mm->page_table_lock); - *tlbp = tlb_gather_mmu(mm, fullmm); - tlb_start_valid = 0; + + tlb_finish_mmu(*tlbp, tlb_start, start); + + if (need_resched() || + need_lockbreak(&mm->page_table_lock) || + (i_mmap_lock && need_lockbreak(i_mmap_lock))) { + if (i_mmap_lock) { + /* must reset count of rss freed */ + *tlbp = tlb_gather_mmu(mm, fullmm); + details->break_addr = start; + goto out; + } + spin_unlock(&mm->page_table_lock); + cond_resched(); + spin_lock(&mm->page_table_lock); } + + *tlbp = tlb_gather_mmu(mm, fullmm); + tlb_start_valid = 0; zap_bytes = ZAP_BLOCK_SIZE; } } +out: return ret; } @@ -626,8 +725,6 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long end = address + size; unsigned long nr_accounted = 0; - might_sleep(); - if (is_vm_hugetlb_page(vma)) { zap_hugepage_range(vma, address, size); return; @@ -638,6 +735,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long address, tlb = tlb_gather_mmu(mm, 0); unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); tlb_finish_mmu(tlb, address, end); + acct_update_integrals(); spin_unlock(&mm->page_table_lock); } @@ -645,10 +743,11 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long address, * Do a quick page-table lookup for a single page. * mm->page_table_lock must be held. 
*/ -struct page * -follow_page(struct mm_struct *mm, unsigned long address, int write) +static struct page * +__follow_page(struct mm_struct *mm, unsigned long address, int read, int write) { pgd_t *pgd; + pud_t *pud; pmd_t *pmd; pte_t *ptep, pte; unsigned long pfn; @@ -659,16 +758,18 @@ follow_page(struct mm_struct *mm, unsigned long address, int write) return page; pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || pgd_bad(*pgd)) + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) goto out; - pmd = pmd_offset(pgd, address); - if (pmd_none(*pmd)) + pud = pud_offset(pgd, address); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + goto out; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) goto out; if (pmd_huge(*pmd)) return follow_huge_pmd(mm, address, pmd, write); - if (pmd_bad(*pmd)) - goto out; ptep = pte_offset_map(pmd, address); if (!ptep) @@ -679,15 +780,13 @@ follow_page(struct mm_struct *mm, unsigned long address, int write) if (pte_present(pte)) { if (write && !pte_write(pte)) goto out; - if (write && !pte_dirty(pte)) { - struct page *page = pte_page(pte); - if (!PageDirty(page)) - set_page_dirty(page); - } + if (read && !pte_read(pte)) + goto out; pfn = pte_pfn(pte); if (pfn_valid(pfn)) { - struct page *page = pfn_to_page(pfn); - + page = pfn_to_page(pfn); + if (write && !pte_dirty(pte) && !PageDirty(page)) + set_page_dirty(page); mark_page_accessed(page); return page; } @@ -697,6 +796,20 @@ out: return NULL; } +struct page * +follow_page(struct mm_struct *mm, unsigned long address, int write) +{ + return __follow_page(mm, address, /*read*/0, write); +} + +int +check_user_page_readable(struct mm_struct *mm, unsigned long address) +{ + return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL; +} + +EXPORT_SYMBOL(check_user_page_readable); + /* * Given a physical address, is there a useful struct page pointing to * it? This may become more complex in the future if we start dealing @@ -706,7 +819,7 @@ out: static inline struct page *get_page_map(struct page *page) { if (!pfn_valid(page_to_pfn(page))) - return 0; + return NULL; return page; } @@ -716,6 +829,7 @@ untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma, unsigned long address) { pgd_t *pgd; + pud_t *pud; pmd_t *pmd; /* Check if the vma is for an anonymous mapping. */ @@ -724,12 +838,16 @@ untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma, /* Check if page directory entry exists. */ pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || pgd_bad(*pgd)) + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + return 1; + + pud = pud_offset(pgd, address); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) return 1; /* Check if page middle directory entry exists. */ - pmd = pmd_offset(pgd, address); - if (pmd_none(*pmd) || pmd_bad(*pmd)) + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) return 1; /* There is a pte slot for 'address' in 'mm'. */ @@ -760,23 +878,27 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long pg = start & PAGE_MASK; struct vm_area_struct *gate_vma = get_gate_vma(tsk); pgd_t *pgd; + pud_t *pud; pmd_t *pmd; pte_t *pte; if (write) /* user gate pages are read-only */ return i ? : -EFAULT; - pgd = pgd_offset_k(pg); - if (!pgd) - return i ? : -EFAULT; - pmd = pmd_offset(pgd, pg); - if (!pmd) - return i ? : -EFAULT; - pte = pte_offset_kernel(pmd, pg); - if (!pte || !pte_present(*pte)) - return i ? 
: -EFAULT; + if (pg > TASK_SIZE) + pgd = pgd_offset_k(pg); + else + pgd = pgd_offset_gate(mm, pg); + BUG_ON(pgd_none(*pgd)); + pud = pud_offset(pgd, pg); + BUG_ON(pud_none(*pud)); + pmd = pmd_offset(pud, pg); + BUG_ON(pmd_none(*pmd)); + pte = pte_offset_map(pmd, pg); + BUG_ON(pte_none(*pte)); if (pages) { pages[i] = pte_page(*pte); get_page(pages[i]); } + pte_unmap(pte); if (vmas) vmas[i] = gate_vma; i++; @@ -785,7 +907,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, continue; } - if (!vma || (pages && (vma->vm_flags & VM_IO)) + if (!vma || (vma->vm_flags & VM_IO) || !(flags & vma->vm_flags)) return i ? : -EFAULT; @@ -798,6 +920,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, do { struct page *map; int lookup_write = write; + + cond_resched_lock(&mm->page_table_lock); while (!(map = follow_page(mm, start, lookup_write))) { /* * Shortcut for anonymous pages. We don't want @@ -881,16 +1005,16 @@ static void zeromap_pte_range(pte_t * pte, unsigned long address, } while (address && (address < end)); } -static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, - unsigned long size, pgprot_t prot) +static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, + unsigned long address, unsigned long size, pgprot_t prot) { unsigned long base, end; - base = address & PGDIR_MASK; - address &= ~PGDIR_MASK; + base = address & PUD_MASK; + address &= ~PUD_MASK; end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; + if (end > PUD_SIZE) + end = PUD_SIZE; do { pte_t * pte = pte_alloc_map(mm, pmd, base + address); if (!pte) @@ -903,31 +1027,65 @@ static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned return 0; } -int zeromap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, pgprot_t prot) +static inline int zeromap_pud_range(struct mm_struct *mm, pud_t * pud, + unsigned long address, + unsigned long size, pgprot_t prot) +{ + unsigned long base, end; + int error = 0; + + base = address & PGDIR_MASK; + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + do { + pmd_t * pmd = pmd_alloc(mm, pud, base + address); + error = -ENOMEM; + if (!pmd) + break; + error = zeromap_pmd_range(mm, pmd, base + address, + end - address, prot); + if (error) + break; + address = (address + PUD_SIZE) & PUD_MASK; + pud++; + } while (address && (address < end)); + return 0; +} + +int zeromap_page_range(struct vm_area_struct *vma, unsigned long address, + unsigned long size, pgprot_t prot) { + int i; int error = 0; - pgd_t * dir; + pgd_t * pgd; unsigned long beg = address; unsigned long end = address + size; + unsigned long next; struct mm_struct *mm = vma->vm_mm; - dir = pgd_offset(mm, address); + pgd = pgd_offset(mm, address); flush_cache_range(vma, beg, end); - if (address >= end) - BUG(); + BUG_ON(address >= end); + BUG_ON(end > vma->vm_end); spin_lock(&mm->page_table_lock); - do { - pmd_t *pmd = pmd_alloc(mm, dir, address); + for (i = pgd_index(address); i <= pgd_index(end-1); i++) { + pud_t *pud = pud_alloc(mm, pgd, address); error = -ENOMEM; - if (!pmd) + if (!pud) break; - error = zeromap_pmd_range(mm, pmd, address, end - address, prot); + next = (address + PGDIR_SIZE) & PGDIR_MASK; + if (next <= beg || next > end) + next = end; + error = zeromap_pud_range(mm, pud, address, + next - address, prot); if (error) break; - address = (address + PGDIR_SIZE) & PGDIR_MASK; - dir++; - } while (address && (address < end)); + address = 
next; + pgd++; + } /* * Why flush? zeromap_pte_range has a BUG_ON for !pte_none() */ @@ -941,17 +1099,16 @@ int zeromap_page_range(struct vm_area_struct *vma, unsigned long address, unsign * mappings are removed. any references to nonexistent pages results * in null mappings (currently treated as "copy-on-access") */ -static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size, - unsigned long phys_addr, pgprot_t prot) +static inline void +remap_pte_range(pte_t * pte, unsigned long address, unsigned long size, + unsigned long pfn, pgprot_t prot) { unsigned long end; - unsigned long pfn; address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) end = PMD_SIZE; - pfn = phys_addr >> PAGE_SHIFT; do { BUG_ON(!pte_none(*pte)); if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) @@ -962,22 +1119,24 @@ static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned } while (address && (address < end)); } -static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size, - unsigned long phys_addr, pgprot_t prot) +static inline int +remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, + unsigned long size, unsigned long pfn, pgprot_t prot) { unsigned long base, end; - base = address & PGDIR_MASK; - address &= ~PGDIR_MASK; + base = address & PUD_MASK; + address &= ~PUD_MASK; end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - phys_addr -= address; + if (end > PUD_SIZE) + end = PUD_SIZE; + pfn -= (address >> PAGE_SHIFT); do { pte_t * pte = pte_alloc_map(mm, pmd, base + address); if (!pte) return -ENOMEM; - remap_pte_range(pte, base + address, end - address, address + phys_addr, prot); + remap_pte_range(pte, base + address, end - address, + (address >> PAGE_SHIFT) + pfn, prot); pte_unmap(pte); address = (address + PMD_SIZE) & PMD_MASK; pmd++; @@ -985,42 +1144,87 @@ static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned lo return 0; } +static inline int remap_pud_range(struct mm_struct *mm, pud_t * pud, + unsigned long address, unsigned long size, + unsigned long pfn, pgprot_t prot) +{ + unsigned long base, end; + int error; + + base = address & PGDIR_MASK; + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + pfn -= address >> PAGE_SHIFT; + do { + pmd_t *pmd = pmd_alloc(mm, pud, base+address); + error = -ENOMEM; + if (!pmd) + break; + error = remap_pmd_range(mm, pmd, base + address, end - address, + (address >> PAGE_SHIFT) + pfn, prot); + if (error) + break; + address = (address + PUD_SIZE) & PUD_MASK; + pud++; + } while (address && (address < end)); + return error; +} + /* Note: this is only safe if the mm semaphore is held when called. */ -int remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot) +int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, + unsigned long pfn, unsigned long size, pgprot_t prot) { int error = 0; - pgd_t * dir; + pgd_t *pgd; unsigned long beg = from; unsigned long end = from + size; + unsigned long next; struct mm_struct *mm = vma->vm_mm; + int i; - phys_addr -= from; - dir = pgd_offset(mm, from); + pfn -= from >> PAGE_SHIFT; + pgd = pgd_offset(mm, from); flush_cache_range(vma, beg, end); - if (from >= end) - BUG(); + BUG_ON(from >= end); + + /* + * Physically remapped pages are special. 
Tell the + * rest of the world about it: + * VM_IO tells people not to look at these pages + * (accesses can have side effects). + * VM_RESERVED tells swapout not to try to touch + * this region. + */ + vma->vm_flags |= VM_IO | VM_RESERVED; spin_lock(&mm->page_table_lock); - do { - pmd_t *pmd = pmd_alloc(mm, dir, from); + for (i = pgd_index(beg); i <= pgd_index(end-1); i++) { + pud_t *pud = pud_alloc(mm, pgd, from); error = -ENOMEM; - if (!pmd) + if (!pud) break; - error = remap_pmd_range(mm, pmd, from, end - from, phys_addr + from, prot); + next = (from + PGDIR_SIZE) & PGDIR_MASK; + if (next > end || next <= from) + next = end; + error = remap_pud_range(mm, pud, from, end - from, + pfn + (from >> PAGE_SHIFT), prot); if (error) break; - from = (from + PGDIR_SIZE) & PGDIR_MASK; - dir++; - } while (from && (from < end)); + from = next; + pgd++; + } /* * Why flush? remap_pte_range has a BUG_ON for !pte_none() */ flush_tlb_range(vma, beg, end); spin_unlock(&mm->page_table_lock); + return error; } -EXPORT_SYMBOL(remap_page_range); +EXPORT_SYMBOL(remap_pfn_range); /* * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when @@ -1075,7 +1279,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, { struct page *old_page, *new_page; unsigned long pfn = pte_pfn(pte); - struct pte_chain *pte_chain; pte_t entry; if (unlikely(!pfn_valid(pfn))) { @@ -1099,7 +1302,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, flush_cache_page(vma, address); entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)), vma); - ptep_establish(vma, address, page_table, entry); + ptep_set_access_flags(vma, address, page_table, entry, 1); update_mmu_cache(vma, address, entry); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); @@ -1111,30 +1314,39 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, /* * Ok, we need to copy. Oh, well.. */ - page_cache_get(old_page); + if (!PageReserved(old_page)) + page_cache_get(old_page); spin_unlock(&mm->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto no_pte_chain; - new_page = alloc_page(GFP_HIGHUSER); - if (!new_page) + if (unlikely(anon_vma_prepare(vma))) goto no_new_page; - copy_cow_page(old_page,new_page,address); - + if (old_page == ZERO_PAGE(address)) { + new_page = alloc_zeroed_user_highpage(vma, address); + if (!new_page) + goto no_new_page; + } else { + new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); + if (!new_page) + goto no_new_page; + copy_user_highpage(new_page, old_page, address); + } /* * Re-check the pte - we dropped the lock */ spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); - if (pte_same(*page_table, pte)) { - if (PageReserved(old_page)) - // ++mm->rss; + if (likely(pte_same(*page_table, pte))) { + if (PageAnon(old_page)) + vx_anonpages_dec(mm); + if (PageReserved(old_page)) { vx_rsspages_inc(mm); - page_remove_rmap(old_page, page_table); + acct_update_integrals(); + update_mem_hiwater(); + } else + page_remove_rmap(old_page); break_cow(vma, new_page, address, page_table); - pte_chain = page_add_rmap(new_page, page_table, pte_chain); lru_cache_add_active(new_page); + page_add_anon_rmap(new_page, vma, address); /* Free the old page.. 
*/ new_page = old_page; @@ -1143,48 +1355,158 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, page_cache_release(new_page); page_cache_release(old_page); spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); return VM_FAULT_MINOR; no_new_page: - pte_chain_free(pte_chain); -no_pte_chain: page_cache_release(old_page); return VM_FAULT_OOM; } /* - * Helper function for unmap_mapping_range(). + * Helper functions for unmap_mapping_range(). + * + * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __ + * + * We have to restart searching the prio_tree whenever we drop the lock, + * since the iterator is only valid while the lock is held, and anyway + * a later vma might be split and reinserted earlier while lock dropped. + * + * The list of nonlinear vmas could be handled more efficiently, using + * a placeholder, but handle it in the same way until a need is shown. + * It is important to search the prio_tree before nonlinear list: a vma + * may become nonlinear and be shifted from prio_tree to nonlinear list + * while the lock is dropped; but never shifted from list to prio_tree. + * + * In order to make forward progress despite restarting the search, + * vm_truncate_count is used to mark a vma as now dealt with, so we can + * quickly skip it next time around. Since the prio_tree search only + * shows us those vmas affected by unmapping the range in question, we + * can't efficiently keep all vmas in step with mapping->truncate_count: + * so instead reset them all whenever it wraps back to 0 (then go to 1). + * mapping->truncate_count and vma->vm_truncate_count are protected by + * i_mmap_lock. + * + * In order to make forward progress despite repeatedly restarting some + * large vma, note the break_addr set by unmap_vmas when it breaks out: + * and restart from that address when we reach that vma again. It might + * have been split or merged, shrunk or extended, but never shifted: so + * restart_addr remains valid so long as it remains in the vma's range. + * unmap_mapping_range forces truncate_count to leap over page-aligned + * values so we can save vma's restart_addr in its truncate_count field. */ -static void unmap_mapping_range_list(struct list_head *head, - struct zap_details *details) +#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK)) + +static void reset_vma_truncate_counts(struct address_space *mapping) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) + vma->vm_truncate_count = 0; + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + vma->vm_truncate_count = 0; +} + +static int unmap_mapping_range_vma(struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr, + struct zap_details *details) +{ + unsigned long restart_addr; + int need_break; + +again: + restart_addr = vma->vm_truncate_count; + if (is_restart_addr(restart_addr) && start_addr < restart_addr) { + start_addr = restart_addr; + if (start_addr >= end_addr) { + /* Top of vma has been split off since last time */ + vma->vm_truncate_count = details->truncate_count; + return 0; + } + } + + details->break_addr = end_addr; + zap_page_range(vma, start_addr, end_addr - start_addr, details); + + /* + * We cannot rely on the break test in unmap_vmas: + * on the one hand, we don't want to restart our loop + * just because that broke out for the page_table_lock; + * on the other hand, it does no test when vma is small. 
+ */ + need_break = need_resched() || + need_lockbreak(details->i_mmap_lock); + + if (details->break_addr >= end_addr) { + /* We have now completed this vma: mark it so */ + vma->vm_truncate_count = details->truncate_count; + if (!need_break) + return 0; + } else { + /* Note restart_addr in vma's truncate_count field */ + vma->vm_truncate_count = details->break_addr; + if (!need_break) + goto again; + } + + spin_unlock(details->i_mmap_lock); + cond_resched(); + spin_lock(details->i_mmap_lock); + return -EINTR; +} + +static inline void unmap_mapping_range_tree(struct prio_tree_root *root, + struct zap_details *details) { struct vm_area_struct *vma; + struct prio_tree_iter iter; pgoff_t vba, vea, zba, zea; - list_for_each_entry(vma, head, shared) { - if (unlikely(vma->vm_flags & VM_NONLINEAR)) { - details->nonlinear_vma = vma; - zap_page_range(vma, vma->vm_start, - vma->vm_end - vma->vm_start, details); - details->nonlinear_vma = NULL; +restart: + vma_prio_tree_foreach(vma, &iter, root, + details->first_index, details->last_index) { + /* Skip quickly over those we have already dealt with */ + if (vma->vm_truncate_count == details->truncate_count) continue; - } + vba = vma->vm_pgoff; vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ - if (vba > details->last_index || vea < details->first_index) - continue; /* Mapping disjoint from hole. */ zba = details->first_index; if (zba < vba) zba = vba; zea = details->last_index; if (zea > vea) zea = vea; - zap_page_range(vma, + + if (unmap_mapping_range_vma(vma, ((zba - vba) << PAGE_SHIFT) + vma->vm_start, - (zea - zba + 1) << PAGE_SHIFT, - details->check_mapping? details: NULL); + ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, + details) < 0) + goto restart; + } +} + +static inline void unmap_mapping_range_list(struct list_head *head, + struct zap_details *details) +{ + struct vm_area_struct *vma; + + /* + * In nonlinear VMAs there is no correspondence between virtual address + * offset and file offset. So we must perform an exhaustive search + * across *all* the pages in each nonlinear VMA, not just the pages + * whose virtual address lies outside the file truncation point. + */ +restart: + list_for_each_entry(vma, head, shared.vm_set.list) { + /* Skip quickly over those we have already dealt with */ + if (vma->vm_truncate_count == details->truncate_count) + continue; + details->nonlinear_vma = vma; + if (unmap_mapping_range_vma(vma, vma->vm_start, + vma->vm_end, details) < 0) + goto restart; } } @@ -1205,7 +1527,7 @@ static void unmap_mapping_range_list(struct list_head *head, * but 0 when invalidating pagecache, don't throw away private data. 
*/ void unmap_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen, int even_cows) + loff_t const holebegin, loff_t const holelen, int even_cows) { struct zap_details details; pgoff_t hba = holebegin >> PAGE_SHIFT; @@ -1225,19 +1547,33 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = hba + hlen - 1; if (details.last_index < details.first_index) details.last_index = ULONG_MAX; + details.i_mmap_lock = &mapping->i_mmap_lock; - down(&mapping->i_shared_sem); - /* Protect against page fault */ - atomic_inc(&mapping->truncate_count); - if (unlikely(!list_empty(&mapping->i_mmap))) - unmap_mapping_range_list(&mapping->i_mmap, &details); + spin_lock(&mapping->i_mmap_lock); - /* Don't waste time to check mapping on fully shared vmas */ - details.check_mapping = NULL; + /* serialize i_size write against truncate_count write */ + smp_wmb(); + /* Protect against page faults, and endless unmapping loops */ + mapping->truncate_count++; + /* + * For archs where spin_lock has inclusive semantics like ia64 + * this smp_mb() will prevent to read pagetable contents + * before the truncate_count increment is visible to + * other cpus. + */ + smp_mb(); + if (unlikely(is_restart_addr(mapping->truncate_count))) { + if (mapping->truncate_count == 0) + reset_vma_truncate_counts(mapping); + mapping->truncate_count++; + } + details.truncate_count = mapping->truncate_count; - if (unlikely(!list_empty(&mapping->i_mmap_shared))) - unmap_mapping_range_list(&mapping->i_mmap_shared, &details); - up(&mapping->i_shared_sem); + if (unlikely(!prio_tree_empty(&mapping->i_mmap))) + unmap_mapping_range_tree(&mapping->i_mmap, &details); + if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) + unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); + spin_unlock(&mapping->i_mmap_lock); } EXPORT_SYMBOL(unmap_mapping_range); @@ -1256,17 +1592,23 @@ int vmtruncate(struct inode * inode, loff_t offset) if (inode->i_size < offset) goto do_expand; + /* + * truncation of in-use swapfiles is disallowed - it would cause + * subsequent swapout to scribble on the now-freed blocks. + */ + if (IS_SWAPFILE(inode)) + goto out_busy; i_size_write(inode, offset); unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); truncate_inode_pages(mapping, offset); goto out_truncate; do_expand: - limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; if (limit != RLIM_INFINITY && offset > limit) goto out_sig; if (offset > inode->i_sb->s_maxbytes) - goto out; + goto out_big; i_size_write(inode, offset); out_truncate: @@ -1275,8 +1617,10 @@ out_truncate: return 0; out_sig: send_sig(SIGXFSZ, current, 0); -out: +out_big: return -EFBIG; +out_busy: + return -ETXTBSY; } EXPORT_SYMBOL(vmtruncate); @@ -1286,9 +1630,17 @@ EXPORT_SYMBOL(vmtruncate); * (1 << page_cluster) entries in the swap area. This method is chosen * because it doesn't cost us any seek time. We also make sure to queue * the 'original' request together with the readahead ones... + * + * This has been extended to use the NUMA policies from the mm triggering + * the readahead. + * + * Caller must hold down_read on the vma->vm_mm if vma is not NULL. */ -void swapin_readahead(swp_entry_t entry) +void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma) { +#ifdef CONFIG_NUMA + struct vm_area_struct *next_vma = vma ? 
vma->vm_next : NULL; +#endif int i, num; struct page *new_page; unsigned long offset; @@ -1300,10 +1652,31 @@ void swapin_readahead(swp_entry_t entry) for (i = 0; i < num; offset++, i++) { /* Ok, do the async read-ahead now */ new_page = read_swap_cache_async(swp_entry(swp_type(entry), - offset)); + offset), vma, addr); if (!new_page) break; page_cache_release(new_page); +#ifdef CONFIG_NUMA + /* + * Find the next applicable VMA for the NUMA policy. + */ + addr += PAGE_SIZE; + if (addr == 0) + vma = NULL; + if (vma) { + if (addr >= vma->vm_end) { + vma = next_vma; + next_vma = vma ? vma->vm_next : NULL; + } + if (vma && addr < vma->vm_start) + vma = NULL; + } else { + if (next_vma && addr >= next_vma->vm_start) { + vma = next_vma; + next_vma = vma->vm_next; + } + } +#endif } lru_add_drain(); /* Push any new pages onto the LRU now */ } @@ -1320,14 +1693,13 @@ static int do_swap_page(struct mm_struct * mm, swp_entry_t entry = pte_to_swp_entry(orig_pte); pte_t pte; int ret = VM_FAULT_MINOR; - struct pte_chain *pte_chain = NULL; pte_unmap(page_table); spin_unlock(&mm->page_table_lock); page = lookup_swap_cache(entry); if (!page) { - swapin_readahead(entry); - page = read_swap_cache_async(entry); + swapin_readahead(entry, address, vma); + page = read_swap_cache_async(entry, vma, address); if (!page) { /* * Back out if somebody else faulted in this pte while @@ -1335,7 +1707,7 @@ static int do_swap_page(struct mm_struct * mm, */ spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); - if (pte_same(*page_table, orig_pte)) + if (likely(pte_same(*page_table, orig_pte))) ret = VM_FAULT_OOM; else ret = VM_FAULT_MINOR; @@ -1347,6 +1719,7 @@ static int do_swap_page(struct mm_struct * mm, /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; inc_page_state(pgmajfault); + grab_swap_token(); } if (!vx_rsspages_avail(mm, 1)) { @@ -1354,11 +1727,6 @@ static int do_swap_page(struct mm_struct * mm, goto out; } mark_page_accessed(page); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) { - ret = VM_FAULT_OOM; - goto out; - } lock_page(page); /* @@ -1367,7 +1735,7 @@ static int do_swap_page(struct mm_struct * mm, */ spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); - if (!pte_same(*page_table, orig_pte)) { + if (unlikely(!pte_same(*page_table, orig_pte))) { pte_unmap(page_table); spin_unlock(&mm->page_table_lock); unlock_page(page); @@ -1382,23 +1750,33 @@ static int do_swap_page(struct mm_struct * mm, if (vm_swap_full()) remove_exclusive_swap_page(page); - // mm->rss++; vx_rsspages_inc(mm); + acct_update_integrals(); + update_mem_hiwater(); + pte = mk_pte(page, vma->vm_page_prot); - if (write_access && can_share_swap_page(page)) + if (write_access && can_share_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); + write_access = 0; + } unlock_page(page); flush_icache_page(vma, page); set_pte(page_table, pte); - pte_chain = page_add_rmap(page, page_table, pte_chain); + page_add_anon_rmap(page, vma, address); + + if (write_access) { + if (do_wp_page(mm, vma, address, + page_table, pmd, pte) == VM_FAULT_OOM) + ret = VM_FAULT_OOM; + goto out; + } /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); out: - pte_chain_free(pte_chain); return ret; } @@ -1414,25 +1792,7 @@ do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, { pte_t entry; struct page * page = ZERO_PAGE(addr); - struct pte_chain *pte_chain; - int ret; - 
if (!vx_rsspages_avail(mm, 1)) { - spin_unlock(&mm->page_table_lock); - return VM_FAULT_OOM; - } - - pte_chain = pte_chain_alloc(GFP_ATOMIC | __GFP_NOWARN); - if (!pte_chain) { - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto no_mem; - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, addr); - } - /* Read-only mapping of ZERO_PAGE. */ entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); @@ -1442,10 +1802,13 @@ do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap(page_table); spin_unlock(&mm->page_table_lock); - page = alloc_page(GFP_HIGHUSER); + if (!vx_rsspages_avail(mm, 1)) + goto no_mem; + if (unlikely(anon_vma_prepare(vma))) + goto no_mem; + page = alloc_zeroed_user_highpage(vma, addr); if (!page) goto no_mem; - clear_user_highpage(page, addr); spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, addr); @@ -1454,34 +1817,29 @@ do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap(page_table); page_cache_release(page); spin_unlock(&mm->page_table_lock); - ret = VM_FAULT_MINOR; goto out; } - // mm->rss++; vx_rsspages_inc(mm); + acct_update_integrals(); + update_mem_hiwater(); entry = maybe_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)), vma); lru_cache_add_active(page); - mark_page_accessed(page); + SetPageReferenced(page); + page_add_anon_rmap(page, vma, addr); } set_pte(page_table, entry); - /* ignores ZERO_PAGE */ - pte_chain = page_add_rmap(page, page_table, pte_chain); pte_unmap(page_table); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); spin_unlock(&mm->page_table_lock); - ret = VM_FAULT_MINOR; - goto out; - -no_mem: - ret = VM_FAULT_OOM; out: - pte_chain_free(pte_chain); - return ret; + return VM_FAULT_MINOR; +no_mem: + return VM_FAULT_OOM; } /* @@ -1503,9 +1861,9 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page * new_page; struct address_space *mapping = NULL; pte_t entry; - struct pte_chain *pte_chain; - int sequence = 0; + unsigned int sequence = 0; int ret = VM_FAULT_MINOR; + int anon = 0; if (!vma->vm_ops || !vma->vm_ops->nopage) return do_anonymous_page(mm, vma, page_table, @@ -1515,35 +1873,44 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, if (vma->vm_file) { mapping = vma->vm_file->f_mapping; - sequence = atomic_read(&mapping->truncate_count); + sequence = mapping->truncate_count; + smp_rmb(); /* serializes i_size against truncate_count */ } - smp_rmb(); /* Prevent CPU from reordering lock-free ->nopage() */ retry: + cond_resched(); + /* FIXME: is that check useful here? */ + if (!vx_rsspages_avail(mm, 1)) + return VM_FAULT_OOM; new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); + /* + * No smp_rmb is needed here as long as there's a full + * spin_lock/unlock sequence inside the ->nopage callback + * (for the pagecache lookup) that acts as an implicit + * smp_mb() and prevents the i_size read to happen + * after the next truncate_count read. + */ /* no page was available -- either SIGBUS or OOM */ if (new_page == NOPAGE_SIGBUS) return VM_FAULT_SIGBUS; if (new_page == NOPAGE_OOM) return VM_FAULT_OOM; - if (!vx_rsspages_avail(mm, 1)) - return VM_FAULT_OOM; - - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto oom; /* * Should we do an early C-O-W break? 
*/ if (write_access && !(vma->vm_flags & VM_SHARED)) { - struct page * page = alloc_page(GFP_HIGHUSER); + struct page *page; + + if (unlikely(anon_vma_prepare(vma))) + goto oom; + page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!page) goto oom; copy_user_highpage(page, new_page, address); page_cache_release(new_page); - lru_cache_add_active(page); new_page = page; + anon = 1; } spin_lock(&mm->page_table_lock); @@ -1552,12 +1919,10 @@ retry: * invalidated this page. If unmap_mapping_range got called, * retry getting the page. */ - if (mapping && - (unlikely(sequence != atomic_read(&mapping->truncate_count)))) { - sequence = atomic_read(&mapping->truncate_count); + if (mapping && unlikely(sequence != mapping->truncate_count)) { + sequence = mapping->truncate_count; spin_unlock(&mm->page_table_lock); page_cache_release(new_page); - pte_chain_free(pte_chain); goto retry; } page_table = pte_offset_map(pmd, address); @@ -1575,14 +1940,20 @@ retry: /* Only go through if we didn't race with anybody else... */ if (pte_none(*page_table)) { if (!PageReserved(new_page)) - // ++mm->rss; vx_rsspages_inc(mm); + acct_update_integrals(); + update_mem_hiwater(); + flush_icache_page(vma, new_page); entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) entry = maybe_mkwrite(pte_mkdirty(entry), vma); set_pte(page_table, entry); - pte_chain = page_add_rmap(new_page, page_table, pte_chain); + if (anon) { + lru_cache_add_active(new_page); + page_add_anon_rmap(new_page, vma, address); + } else + page_add_file_rmap(new_page); pte_unmap(page_table); } else { /* One of our sibling threads was faster, back out. */ @@ -1595,13 +1966,12 @@ retry: /* no need to invalidate: a not-present page shouldn't be cached */ update_mmu_cache(vma, address, entry); spin_unlock(&mm->page_table_lock); - goto out; +out: + return ret; oom: page_cache_release(new_page); ret = VM_FAULT_OOM; -out: - pte_chain_free(pte_chain); - return ret; + goto out; } /* @@ -1687,7 +2057,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - ptep_establish(vma, address, pte, entry); + ptep_set_access_flags(vma, address, pte, entry, write_access); update_mmu_cache(vma, address, entry); pte_unmap(pte); spin_unlock(&mm->page_table_lock); @@ -1698,13 +2068,14 @@ static inline int handle_pte_fault(struct mm_struct *mm, * By the time we get here, we already hold the mm semaphore */ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, - unsigned long address, int write_access) + unsigned long address, int write_access) { pgd_t *pgd; + pud_t *pud; pmd_t *pmd; + pte_t *pte; __set_current_state(TASK_RUNNING); - pgd = pgd_offset(mm, address); inc_page_state(pgfault); @@ -1715,18 +2086,61 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, * We need the page table lock to synchronize with kswapd * and the SMP-safe atomic PTE updates. 
*/ + pgd = pgd_offset(mm, address); spin_lock(&mm->page_table_lock); - pmd = pmd_alloc(mm, pgd, address); - if (pmd) { - pte_t * pte = pte_alloc_map(mm, pmd, address); - if (pte) - return handle_pte_fault(mm, vma, address, write_access, pte, pmd); - } + pud = pud_alloc(mm, pgd, address); + if (!pud) + goto oom; + + pmd = pmd_alloc(mm, pud, address); + if (!pmd) + goto oom; + + pte = pte_alloc_map(mm, pmd, address); + if (!pte) + goto oom; + + return handle_pte_fault(mm, vma, address, write_access, pte, pmd); + + oom: spin_unlock(&mm->page_table_lock); return VM_FAULT_OOM; } +#ifndef __ARCH_HAS_4LEVEL_HACK +/* + * Allocate page upper directory. + * + * We've already handled the fast-path in-line, and we own the + * page table lock. + * + * On a two-level or three-level page table, this ends up actually being + * entirely optimized away. + */ +pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + pud_t *new; + + spin_unlock(&mm->page_table_lock); + new = pud_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (pgd_present(*pgd)) { + pud_free(new); + goto out; + } + pgd_populate(mm, pgd, new); + out: + return pud_offset(pgd, address); +} + /* * Allocate page middle directory. * @@ -1736,7 +2150,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, * On a two-level page table, this ends up actually being entirely * optimized away. */ -pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { pmd_t *new; @@ -1750,14 +2164,38 @@ pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr * Because we dropped the lock, we should re-check the * entry, as somebody else could have populated it.. */ - if (pgd_present(*pgd)) { + if (pud_present(*pud)) { pmd_free(new); goto out; } - pgd_populate(mm, pgd, new); + pud_populate(mm, pud, new); + out: + return pmd_offset(pud, address); +} +#else +pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) +{ + pmd_t *new; + + spin_unlock(&mm->page_table_lock); + new = pmd_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. 
+ */ + if (pgd_present(*pud)) { + pmd_free(new); + goto out; + } + pgd_populate(mm, pud, new); out: - return pmd_offset(pgd, address); + return pmd_offset(pud, address); } +#endif int make_pages_present(unsigned long addr, unsigned long end) { @@ -1765,6 +2203,8 @@ int make_pages_present(unsigned long addr, unsigned long end) struct vm_area_struct * vma; vma = find_vma(current->mm, addr); + if (!vma) + return -1; write = (vma->vm_flags & VM_WRITE) != 0; if (addr >= end) BUG(); @@ -1786,19 +2226,21 @@ struct page * vmalloc_to_page(void * vmalloc_addr) unsigned long addr = (unsigned long) vmalloc_addr; struct page *page = NULL; pgd_t *pgd = pgd_offset_k(addr); + pud_t *pud; pmd_t *pmd; pte_t *ptep, pte; if (!pgd_none(*pgd)) { - pmd = pmd_offset(pgd, addr); - if (!pmd_none(*pmd)) { - preempt_disable(); - ptep = pte_offset_map(pmd, addr); - pte = *ptep; - if (pte_present(pte)) - page = pte_page(pte); - pte_unmap(ptep); - preempt_enable(); + pud = pud_offset(pgd, addr); + if (!pud_none(*pud)) { + pmd = pmd_offset(pud, addr); + if (!pmd_none(*pmd)) { + ptep = pte_offset_map(pmd, addr); + pte = *ptep; + if (pte_present(pte)) + page = pte_page(pte); + pte_unmap(ptep); + } } } return page; @@ -1806,7 +2248,33 @@ struct page * vmalloc_to_page(void * vmalloc_addr) EXPORT_SYMBOL(vmalloc_to_page); -#if !defined(CONFIG_ARCH_GATE_AREA) +/* + * Map a vmalloc()-space virtual address to the physical page frame number. + */ +unsigned long vmalloc_to_pfn(void * vmalloc_addr) +{ + return page_to_pfn(vmalloc_to_page(vmalloc_addr)); +} + +EXPORT_SYMBOL(vmalloc_to_pfn); + +/* + * update_mem_hiwater + * - update per process rss and vm high water data + */ +void update_mem_hiwater(void) +{ + struct task_struct *tsk = current; + + if (tsk->mm) { + if (tsk->mm->hiwater_rss < tsk->mm->rss) + tsk->mm->hiwater_rss = tsk->mm->rss; + if (tsk->mm->hiwater_vm < tsk->mm->total_vm) + tsk->mm->hiwater_vm = tsk->mm->total_vm; + } +} + +#if !defined(__HAVE_ARCH_GATE_AREA) #if defined(AT_SYSINFO_EHDR) struct vm_area_struct gate_vma; @@ -1828,11 +2296,11 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk) #ifdef AT_SYSINFO_EHDR return &gate_vma; #else - return 0; + return NULL; #endif } -int in_gate_area(struct task_struct *task, unsigned long addr) +int in_gate_area_no_task(unsigned long addr) { #ifdef AT_SYSINFO_EHDR if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) @@ -1841,4 +2309,4 @@ int in_gate_area(struct task_struct *task, unsigned long addr) return 0; } -#endif +#endif /* __HAVE_ARCH_GATE_AREA */
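
For reference, the pattern this patch threads through copy_page_range(), the zap_*_range() helpers, __follow_page(), handle_mm_fault() and vmalloc_to_page() is the same four-level descent, pgd -> pud -> pmd -> pte. The sketch below gathers that walk in one place; it is illustrative only and not part of the patch. The function name walk_to_page() is made up for illustration, the headers are the usual 2.6-era <linux/mm.h> and <asm/pgtable.h>, and the caller is assumed to hold mm->page_table_lock, as follow_page() requires.

#include <linux/mm.h>
#include <asm/pgtable.h>

/*
 * Illustrative sketch, not part of the patch: resolve a user virtual
 * address to its struct page using the four-level walk this change
 * introduces.  Caller must hold mm->page_table_lock.
 */
static struct page *walk_to_page(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	unsigned long pfn;
	struct page *page = NULL;

	pgd = pgd_offset(mm, address);		/* top level, always allocated */
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		return NULL;

	pud = pud_offset(pgd, address);		/* the level added by this patch */
	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
		return NULL;

	pmd = pmd_offset(pud, address);		/* the real __follow_page() also
						 * handles pmd_huge() here */
	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
		return NULL;

	pte = pte_offset_map(pmd, address);	/* may kmap the pte page */
	if (pte_present(*pte)) {
		pfn = pte_pfn(*pte);
		if (pfn_valid(pfn))
			page = pfn_to_page(pfn);
	}
	pte_unmap(pte);
	return page;
}

On two- or three-level architectures the pud (and possibly pmd) accessors fold away at compile time, which is why, as the __pud_alloc() comment in the patch notes, the extra level ends up entirely optimized away there.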
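
The other externally visible change is the replacement of remap_page_range() with remap_pfn_range(), which takes a page frame number rather than a physical address and now marks the vma VM_IO | VM_RESERVED itself. A driver mmap method would be converted roughly as sketched below; this is not from the patch, mydrv_mmap and phys_base are hypothetical names, and <linux/fs.h>/<linux/mm.h> are assumed.

#include <linux/fs.h>
#include <linux/mm.h>

/*
 * Illustrative sketch of a driver mmap handler after this patch.
 * phys_base is a hypothetical, page-aligned physical address that
 * the driver owns; note the shift to a pfn before the call.
 */
static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	if (remap_pfn_range(vma, vma->vm_start, phys_base >> PAGE_SHIFT,
			    size, vma->vm_page_prot))
		return -EAGAIN;
	return 0;
}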