X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=mm%2Fmemory.c;h=c24a375ae5a5dab78e580778fe85bc13604cd78a;hb=refs%2Fheads%2Fvserver;hp=dbce9edd30619e1210d87b09693644ad9d6dc58f;hpb=76828883507a47dae78837ab5dec5a5b4513c667;p=linux-2.6.git diff --git a/mm/memory.c b/mm/memory.c index dbce9edd3..c24a375ae 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -47,7 +47,9 @@ #include #include #include +#include #include +#include #include #include @@ -87,7 +89,7 @@ int randomize_va_space __read_mostly = 1; static int __init disable_randmaps(char *s) { randomize_va_space = 0; - return 0; + return 1; } __setup("norandmaps", disable_randmaps); @@ -126,7 +128,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) pmd_clear(pmd); pte_lock_deinit(page); pte_free_tlb(tlb, page); - dec_page_state(nr_page_table_pages); + dec_zone_page_state(page, NR_PAGETABLE); tlb->mm->nr_ptes--; } @@ -277,7 +279,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, anon_vma_unlink(vma); unlink_file_vma(vma); - if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { + if (is_vm_hugetlb_page(vma)) { hugetlb_free_pgd_range(tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); } else { @@ -285,8 +287,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, * Optimization: gather nearby vmas into one call down */ while (next && next->vm_start <= vma->vm_end + PMD_SIZE - && !is_hugepage_only_range(vma->vm_mm, next->vm_start, - HPAGE_SIZE)) { + && !is_vm_hugetlb_page(next)) { vma = next; next = vma->vm_next; anon_vma_unlink(vma); @@ -312,7 +313,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) pte_free(new); } else { mm->nr_ptes++; - inc_page_state(nr_page_table_pages); + inc_zone_page_state(new, NR_PAGETABLE); pmd_populate(mm, pmd, new); } spin_unlock(&mm->page_table_lock); @@ -388,7 +389,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ { unsigned long pfn = pte_pfn(pte); - if (vma->vm_flags & VM_PFNMAP) { + if (unlikely(vma->vm_flags & VM_PFNMAP)) { unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; if (pfn == vma->vm_pgoff + off) return NULL; @@ -401,11 +402,10 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ * we should just do "return pfn_to_page(pfn)", but * in the meantime we check that we get a valid pfn, * and that the resulting page looks ok. - * - * Remove this test eventually! */ if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, pte, addr); + if (!(vma->vm_flags & VM_RESERVED)) + print_bad_pte(vma, pte, addr); return NULL; } @@ -437,7 +437,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, /* pte contains position in swap or file, so copy. */ if (unlikely(!pte_present(pte))) { if (!pte_file(pte)) { - swap_duplicate(pte_to_swp_entry(pte)); + swp_entry_t entry = pte_to_swp_entry(pte); + + swap_duplicate(entry); /* make sure dst_mm is on swapoff's mmlist. */ if (unlikely(list_empty(&dst_mm->mmlist))) { spin_lock(&mmlist_lock); @@ -446,6 +448,16 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, &src_mm->mmlist); spin_unlock(&mmlist_lock); } + if (is_write_migration_entry(entry) && + is_cow_mapping(vm_flags)) { + /* + * COW mappings require pages in both parent + * and child to be set to read. + */ + make_migration_entry_read(&entry); + pte = swp_entry_to_pte(entry); + set_pte_at(src_mm, addr, src_pte, pte); + } } goto out_set_pte; } @@ -456,7 +468,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ if (is_cow_mapping(vm_flags)) { ptep_set_wrprotect(src_mm, addr, src_pte); - pte = *src_pte; + pte = pte_wrprotect(pte); } /* @@ -487,6 +499,9 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, int progress = 0; int rss[2]; + if (!vx_rss_avail(dst_mm, ((end - addr)/PAGE_SIZE + 1))) + return -ENOMEM; + again: rss[1] = rss[0] = 0; dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); @@ -494,7 +509,8 @@ again: return -ENOMEM; src_pte = pte_offset_map_nested(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); - spin_lock(src_ptl); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + arch_enter_lazy_mmu_mode(); do { /* @@ -516,6 +532,7 @@ again: progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); pte_unmap_nested(src_pte - 1); add_mm_rss(dst_mm, rss[0], rss[1]); @@ -617,6 +634,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, int anon_rss = 0; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); do { pte_t ptent = *pte; if (pte_none(ptent)) { @@ -667,7 +685,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, mark_page_accessed(page); file_rss--; } - page_remove_rmap(page); + page_remove_rmap(page, vma); tlb_remove_page(tlb, page); continue; } @@ -679,10 +697,11 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, continue; if (!pte_file(ptent)) free_swap_and_cache(pte_to_swp_entry(ptent)); - pte_clear_full(mm, addr, pte, tlb->fullmm); + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); add_mm_rss(mm, file_rss, anon_rss); + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); return addr; @@ -881,6 +900,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, tlb_finish_mmu(tlb, address, end); return end; } +EXPORT_SYMBOL(zap_page_range); /* * Do a quick page-table lookup for a single page. @@ -1020,6 +1040,26 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, continue; } +#ifdef CONFIG_XEN + if (vma && (vma->vm_flags & VM_FOREIGN)) { + struct page **map = vma->vm_private_data; + int offset = (start - vma->vm_start) >> PAGE_SHIFT; + if (map[offset] != NULL) { + if (pages) { + struct page *page = map[offset]; + + pages[i] = page; + get_page(page); + } + if (vmas) + vmas[i] = vma; + i++; + start += PAGE_SIZE; + len--; + continue; + } + } +#endif if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) || !(vm_flags & vma->vm_flags)) return i ? : -EFAULT; @@ -1071,9 +1111,12 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, default: BUG(); } + cond_resched(); } if (pages) { pages[i] = page; + + flush_anon_page(vma, page, start); flush_dcache_page(page); } if (vmas) @@ -1092,21 +1135,29 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, { pte_t *pte; spinlock_t *ptl; + int err = 0; pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) - return -ENOMEM; + return -EAGAIN; + arch_enter_lazy_mmu_mode(); do { struct page *page = ZERO_PAGE(addr); pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); + + if (unlikely(!pte_none(*pte))) { + err = -EEXIST; + pte++; + break; + } page_cache_get(page); page_add_file_rmap(page); inc_mm_counter(mm, file_rss); - BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, zero_pte); } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); - return 0; + return err; } static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, @@ -1114,16 +1165,18 @@ static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, { pmd_t *pmd; unsigned long next; + int err; pmd = pmd_alloc(mm, pud, addr); if (!pmd) - return -ENOMEM; + return -EAGAIN; do { next = pmd_addr_end(addr, end); - if (zeromap_pte_range(mm, pmd, addr, next, prot)) - return -ENOMEM; + err = zeromap_pte_range(mm, pmd, addr, next, prot); + if (err) + break; } while (pmd++, addr = next, addr != end); - return 0; + return err; } static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, @@ -1131,16 +1184,18 @@ static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, { pud_t *pud; unsigned long next; + int err; pud = pud_alloc(mm, pgd, addr); if (!pud) - return -ENOMEM; + return -EAGAIN; do { next = pud_addr_end(addr, end); - if (zeromap_pmd_range(mm, pud, addr, next, prot)) - return -ENOMEM; + err = zeromap_pmd_range(mm, pud, addr, next, prot); + if (err) + break; } while (pud++, addr = next, addr != end); - return 0; + return err; } int zeromap_page_range(struct vm_area_struct *vma, @@ -1214,16 +1269,19 @@ out: return retval; } -/* +/** + * vm_insert_page - insert single page into user vma + * @vma: user vma to map to + * @addr: target user address of this page + * @page: source kernel page + * * This allows drivers to insert individual pages they've allocated * into a user vma. * * The page has to be a nice clean _individual_ kernel allocation. * If you allocate a compound page, you need to have marked it as * such (__GFP_COMP), or manually just split the page up yourself - * (which is mainly an issue of doing "set_page_count(page, 1)" for - * each sub-page, and then freeing them one by one when you free - * them rather than freeing it as a compound page). + * (see split_page()). * * NOTE! Traditionally this was done with "remap_pfn_range()" which * took an arbitrary page protection parameter. This doesn't allow @@ -1259,11 +1317,13 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; + arch_enter_lazy_mmu_mode(); do { BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); return 0; } @@ -1308,7 +1368,16 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, return 0; } -/* Note: this is only safe if the mm semaphore is held when called. */ +/** + * remap_pfn_range - remap kernel memory to userspace + * @vma: user vma to map to + * @addr: target user address to start at + * @pfn: physical address of kernel memory + * @size: size of map area + * @prot: page protection flags for this mapping + * + * Note: this is only safe if the mm semaphore is held when called. + */ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { @@ -1359,6 +1428,102 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(remap_pfn_range); +#ifdef CONFIG_XEN +static inline int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pte_t *pte; + int err; + struct page *pmd_page; + spinlock_t *ptl; + + pte = (mm == &init_mm) ? + pte_alloc_kernel(pmd, addr) : + pte_alloc_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return -ENOMEM; + + BUG_ON(pmd_huge(*pmd)); + + pmd_page = pmd_page(*pmd); + + do { + err = fn(pte, pmd_page, addr, data); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + if (mm != &init_mm) + pte_unmap_unlock(pte-1, ptl); + return err; +} + +static inline int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pmd_t *pmd; + unsigned long next; + int err; + + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + err = apply_to_pte_range(mm, pmd, addr, next, fn, data); + if (err) + break; + } while (pmd++, addr = next, addr != end); + return err; +} + +static inline int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pud_t *pud; + unsigned long next; + int err; + + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + err = apply_to_pmd_range(mm, pud, addr, next, fn, data); + if (err) + break; + } while (pud++, addr = next, addr != end); + return err; +} + +/* + * Scan a region of virtual memory, filling in page tables as necessary + * and calling a provided function on each leaf page table. + */ +int apply_to_page_range(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, void *data) +{ + pgd_t *pgd; + unsigned long next; + unsigned long end = addr + size; + int err; + + BUG_ON(addr >= end); + pgd = pgd_offset(mm, addr); + do { + next = pgd_addr_end(addr, end); + err = apply_to_pud_range(mm, pgd, addr, next, fn, data); + if (err) + break; + } while (pgd++, addr = next, addr != end); + return err; +} +EXPORT_SYMBOL_GPL(apply_to_page_range); +#endif + /* * handle_pte_fault chooses page fault handler according to an entry * which was read non-atomically. Before making any commitment, on @@ -1397,7 +1562,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va) +static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) { /* * If the source page was a PFN mapping, we don't have @@ -1420,9 +1585,9 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo kunmap_atomic(kaddr, KM_USER0); flush_dcache_page(dst); return; - + } - copy_user_highpage(dst, src, va); + copy_user_highpage(dst, src, va, vma); } /* @@ -1449,25 +1614,71 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *old_page, *new_page; pte_t entry; - int ret = VM_FAULT_MINOR; + int reuse = 0, ret = VM_FAULT_MINOR; + struct page *dirty_page = NULL; old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) goto gotten; - if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { - int reuse = can_share_swap_page(old_page); - unlock_page(old_page); - if (reuse) { - flush_cache_page(vma, address, pte_pfn(orig_pte)); - entry = pte_mkyoung(orig_pte); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - ptep_set_access_flags(vma, address, page_table, entry, 1); - update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); - ret |= VM_FAULT_WRITE; - goto unlock; + /* + * Take out anonymous pages first, anonymous shared vmas are + * not dirty accountable. + */ + if (PageAnon(old_page)) { + if (!TestSetPageLocked(old_page)) { + reuse = can_share_swap_page(old_page); + unlock_page(old_page); } + } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == + (VM_WRITE|VM_SHARED))) { + /* + * Only catch write-faults on shared writable pages, + * read-only shared pages can get COWed by + * get_user_pages(.write=1, .force=1). + */ + if (vma->vm_ops && vma->vm_ops->page_mkwrite) { + /* + * Notify the address space that the page is about to + * become writable so that it can prohibit this or wait + * for the page to get into an appropriate state. + * + * We do this without the lock held, so that it can + * sleep if it needs to. + */ + page_cache_get(old_page); + pte_unmap_unlock(page_table, ptl); + + if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) + goto unwritable_page; + + page_cache_release(old_page); + + /* + * Since we dropped the lock we need to revalidate + * the PTE as someone else may have changed it. If + * they did, we just return, as we can count on the + * MMU to tell us if they didn't also make it writable. + */ + page_table = pte_offset_map_lock(mm, pmd, address, + &ptl); + if (!pte_same(*page_table, orig_pte)) + goto unlock; + } + dirty_page = old_page; + get_page(dirty_page); + reuse = 1; + } + + if (reuse) { + flush_cache_page(vma, address, pte_pfn(orig_pte)); + entry = pte_mkyoung(orig_pte); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + ptep_set_access_flags(vma, address, page_table, entry, 1); + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); + ret |= VM_FAULT_WRITE; + goto unlock; } /* @@ -1487,7 +1698,7 @@ gotten: new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!new_page) goto oom; - cow_user_page(new_page, old_page, address); + cow_user_page(new_page, old_page, address, vma); } /* @@ -1496,7 +1707,7 @@ gotten: page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { - page_remove_rmap(old_page); + page_remove_rmap(old_page, vma); if (!PageAnon(old_page)) { dec_mm_counter(mm, file_rss); inc_mm_counter(mm, anon_rss); @@ -1506,9 +1717,16 @@ gotten: flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); - ptep_establish(vma, address, page_table, entry); - update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); + /* + * Clear the pte entry and flush it first, before updating the + * pte with the new entry. This will avoid a race condition + * seen in the presence of one thread doing SMC and another + * thread doing COW. + */ + ptep_clear_flush(vma, address, page_table); + set_pte_at(mm, address, page_table, entry); + update_mmu_cache(vma, address, entry); lru_cache_add_active(new_page); page_add_new_anon_rmap(new_page, vma, address); @@ -1522,11 +1740,19 @@ gotten: page_cache_release(old_page); unlock: pte_unmap_unlock(page_table, ptl); + if (dirty_page) { + set_page_dirty_balance(dirty_page); + put_page(dirty_page); + } return ret; oom: if (old_page) page_cache_release(old_page); return VM_FAULT_OOM; + +unwritable_page: + page_cache_release(old_page); + return VM_FAULT_SIGBUS; } /* @@ -1737,9 +1963,10 @@ void unmap_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL(unmap_mapping_range); -/* - * Handle all mappings that got truncated by a "truncate()" - * system call. +/** + * vmtruncate - unmap mappings "freed" by truncate() syscall + * @inode: inode of the file used + * @offset: file offset to start truncating * * NOTE! We have to be ready to update the memory sharing * between the file and the memory map for a potential last @@ -1806,13 +2033,17 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) return 0; } -EXPORT_SYMBOL(vmtruncate_range); -/* +/** + * swapin_readahead - swap in pages in hope we need them soon + * @entry: swap entry of this memory + * @addr: address to start + * @vma: user vma this addresses belong to + * * Primitive swap readahead code. We simply read an aligned block of * (1 << page_cluster) entries in the swap area. This method is chosen * because it doesn't cost us any seek time. We also make sure to queue - * the 'original' request together with the readahead ones... + * the 'original' request together with the readahead ones... * * This has been extended to use the NUMA policies from the mm triggering * the readahead. @@ -1883,9 +2114,14 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; entry = pte_to_swp_entry(orig_pte); -again: + if (is_migration_entry(entry)) { + migration_entry_wait(mm, pmd, address); + goto out; + } + delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { + grab_swap_token(); /* Contend for token _before_ read-in */ swapin_readahead(entry, address, vma); page = read_swap_cache_async(entry, vma, address); if (!page) { @@ -1896,27 +2132,23 @@ again: page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) ret = VM_FAULT_OOM; + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto unlock; } /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; - inc_page_state(pgmajfault); - grab_swap_token(); + count_vm_event(PGMAJFAULT); } - if (!vx_rsspages_avail(mm, 1)) { + if (!vx_rss_avail(mm, 1)) { ret = VM_FAULT_OOM; goto out; } + + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); mark_page_accessed(page); lock_page(page); - if (!PageSwapCache(page)) { - /* Page migration has occured */ - unlock_page(page); - page_cache_release(page); - goto again; - } /* * Back out if somebody else already faulted in this pte. @@ -1986,7 +2218,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Allocate our own private page. */ pte_unmap(page_table); - if (!vx_rsspages_avail(mm, 1)) + if (!vx_rss_avail(mm, 1)) goto oom; if (unlikely(anon_vma_prepare(vma))) goto oom; @@ -2056,19 +2288,20 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned int sequence = 0; int ret = VM_FAULT_MINOR; int anon = 0; + struct page *dirty_page = NULL; pte_unmap(page_table); BUG_ON(vma->vm_flags & VM_PFNMAP); + if (!vx_rss_avail(mm, 1)) + return VM_FAULT_OOM; + if (vma->vm_file) { mapping = vma->vm_file->f_mapping; sequence = mapping->truncate_count; smp_rmb(); /* serializes i_size against truncate_count */ } retry: - /* FIXME: is that check useful here? */ - if (!vx_rsspages_avail(mm, 1)) - return VM_FAULT_OOM; new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); /* * No smp_rmb is needed here as long as there's a full @@ -2078,27 +2311,42 @@ retry: * after the next truncate_count read. */ - /* no page was available -- either SIGBUS or OOM */ - if (new_page == NOPAGE_SIGBUS) + /* no page was available -- either SIGBUS, OOM or REFAULT */ + if (unlikely(new_page == NOPAGE_SIGBUS)) return VM_FAULT_SIGBUS; - if (new_page == NOPAGE_OOM) + else if (unlikely(new_page == NOPAGE_OOM)) return VM_FAULT_OOM; + else if (unlikely(new_page == NOPAGE_REFAULT)) + return VM_FAULT_MINOR; /* * Should we do an early C-O-W break? */ - if (write_access && !(vma->vm_flags & VM_SHARED)) { - struct page *page; + if (write_access) { + if (!(vma->vm_flags & VM_SHARED)) { + struct page *page; - if (unlikely(anon_vma_prepare(vma))) - goto oom; - page = alloc_page_vma(GFP_HIGHUSER, vma, address); - if (!page) - goto oom; - copy_user_highpage(page, new_page, address); - page_cache_release(new_page); - new_page = page; - anon = 1; + if (unlikely(anon_vma_prepare(vma))) + goto oom; + page = alloc_page_vma(GFP_HIGHUSER, vma, address); + if (!page) + goto oom; + copy_user_highpage(page, new_page, address, vma); + page_cache_release(new_page); + new_page = page; + anon = 1; + + } else { + /* if the page will be shareable, see if the backing + * address space wants to know that the page is about + * to become writable */ + if (vma->vm_ops->page_mkwrite && + vma->vm_ops->page_mkwrite(vma, new_page) < 0 + ) { + page_cache_release(new_page); + return VM_FAULT_SIGBUS; + } + } } page_table = pte_offset_map_lock(mm, pmd, address, &ptl); @@ -2140,6 +2388,10 @@ retry: } else { inc_mm_counter(mm, file_rss); page_add_file_rmap(new_page); + if (write_access) { + dirty_page = new_page; + get_page(dirty_page); + } } } else { /* One of our sibling threads was faster, back out. */ @@ -2152,12 +2404,64 @@ retry: lazy_mmu_prot_update(entry); unlock: pte_unmap_unlock(page_table, ptl); + if (dirty_page) { + set_page_dirty_balance(dirty_page); + put_page(dirty_page); + } return ret; oom: page_cache_release(new_page); return VM_FAULT_OOM; } +/* + * do_no_pfn() tries to create a new page mapping for a page without + * a struct_page backing it + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + * + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. + * + * It is expected that the ->nopfn handler always returns the same pfn + * for a given virtual mapping. + * + * Mark this `noinline' to prevent it from bloating the main pagefault code. + */ +static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access) +{ + spinlock_t *ptl; + pte_t entry; + unsigned long pfn; + int ret = VM_FAULT_MINOR; + + pte_unmap(page_table); + BUG_ON(!(vma->vm_flags & VM_PFNMAP)); + BUG_ON(is_cow_mapping(vma->vm_flags)); + + pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); + if (pfn == NOPFN_OOM) + return VM_FAULT_OOM; + if (pfn == NOPFN_SIGBUS) + return VM_FAULT_SIGBUS; + + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + + /* Only go through if we didn't race with anybody else... */ + if (pte_none(*page_table)) { + entry = pfn_pte(pfn, vma->vm_page_prot); + if (write_access) + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + set_pte_at(mm, address, page_table, entry); + } + pte_unmap_unlock(page_table, ptl); + return ret; +} + /* * Fault of a previously existing named mapping. Repopulate the pte * from the encoded file_pte if possible. This enables swappable @@ -2216,15 +2520,22 @@ static inline int handle_pte_fault(struct mm_struct *mm, pte_t entry; pte_t old_entry; spinlock_t *ptl; + int ret, type = VXPT_UNKNOWN; old_entry = entry = *pte; if (!pte_present(entry)) { if (pte_none(entry)) { - if (!vma->vm_ops || !vma->vm_ops->nopage) - return do_anonymous_page(mm, vma, address, - pte, pmd, write_access); - return do_no_page(mm, vma, address, - pte, pmd, write_access); + if (vma->vm_ops) { + if (vma->vm_ops->nopage) + return do_no_page(mm, vma, address, + pte, pmd, + write_access); + if (unlikely(vma->vm_ops->nopfn)) + return do_no_pfn(mm, vma, address, pte, + pmd, write_access); + } + return do_anonymous_page(mm, vma, address, + pte, pmd, write_access); } if (pte_file(entry)) return do_file_page(mm, vma, address, @@ -2238,9 +2549,12 @@ static inline int handle_pte_fault(struct mm_struct *mm, if (unlikely(!pte_same(*pte, entry))) goto unlock; if (write_access) { - if (!pte_write(entry)) - return do_wp_page(mm, vma, address, + if (!pte_write(entry)) { + ret = do_wp_page(mm, vma, address, pte, pmd, ptl, entry); + type = VXPT_WRITE; + goto out; + } entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); @@ -2260,7 +2574,10 @@ static inline int handle_pte_fault(struct mm_struct *mm, } unlock: pte_unmap_unlock(pte, ptl); - return VM_FAULT_MINOR; + ret = VM_FAULT_MINOR; +out: + vx_page_fault(mm, vma, type, ret); + return ret; } /* @@ -2276,7 +2593,7 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, __set_current_state(TASK_RUNNING); - inc_page_state(pgfault); + count_vm_event(PGFAULT); if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, write_access); @@ -2367,10 +2684,8 @@ int make_pages_present(unsigned long addr, unsigned long end) if (!vma) return -1; write = (vma->vm_flags & VM_WRITE) != 0; - if (addr >= end) - BUG(); - if (end > vma->vm_end) - BUG(); + BUG_ON(addr >= end); + BUG_ON(end > vma->vm_end); len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; ret = get_user_pages(current, current->mm, addr, len, write, 0, NULL, NULL); @@ -2429,8 +2744,15 @@ static int __init gate_vma_init(void) gate_vma.vm_mm = NULL; gate_vma.vm_start = FIXADDR_USER_START; gate_vma.vm_end = FIXADDR_USER_END; - gate_vma.vm_page_prot = PAGE_READONLY; - gate_vma.vm_flags = 0; + gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; + gate_vma.vm_page_prot = __P101; + /* + * Make sure the vDSO gets into every core dump. + * Dumping its contents makes post-mortem fully interpretable later + * without matching up the same kernel and hardware config to see + * what PC values meant. + */ + gate_vma.vm_flags |= VM_ALWAYSDUMP; return 0; } __initcall(gate_vma_init); @@ -2455,3 +2777,56 @@ int in_gate_area_no_task(unsigned long addr) } #endif /* __HAVE_ARCH_GATE_AREA */ + +/* + * Access another process' address space. + * Source/target buffer must be kernel space, + * Do not walk the page table directly, use get_user_pages + */ +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +{ + struct mm_struct *mm; + struct vm_area_struct *vma; + struct page *page; + void *old_buf = buf; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + down_read(&mm->mmap_sem); + /* ignore errors, just check how much was sucessfully transfered */ + while (len) { + int bytes, ret, offset; + void *maddr; + + ret = get_user_pages(tsk, mm, addr, 1, + write, 1, &page, &vma); + if (ret <= 0) + break; + + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap(page); + if (write) { + copy_to_user_page(vma, page, addr, + maddr + offset, buf, bytes); + set_page_dirty_lock(page); + } else { + copy_from_user_page(vma, page, addr, + buf, maddr + offset, bytes); + } + kunmap(page); + page_cache_release(page); + len -= bytes; + buf += bytes; + addr += bytes; + } + up_read(&mm->mmap_sem); + mmput(mm); + + return buf - old_buf; +}