From adf2645e29bf573584be6105dddeef86a1b452fc Mon Sep 17 00:00:00 2001 From: Mark Huang Date: Thu, 29 Jul 2004 19:02:13 +0000 Subject: [PATCH] Initial revision --- include/asm-um/pgtable.h.orig | 415 +++++++ include/asm-um/unistd.h.orig | 144 +++ include/linux/gfp.h.orig | 128 +++ include/linux/mm.h.orig | 729 ++++++++++++ mm/Makefile.orig | 17 + mm/mmap.c.orig | 1805 +++++++++++++++++++++++++++++ mm/mprotect.c.orig | 282 +++++ mm/page_alloc.c.orig | 2013 +++++++++++++++++++++++++++++++++ 8 files changed, 5533 insertions(+) create mode 100644 include/asm-um/pgtable.h.orig create mode 100644 include/asm-um/unistd.h.orig create mode 100644 include/linux/gfp.h.orig create mode 100644 include/linux/mm.h.orig create mode 100644 mm/Makefile.orig create mode 100644 mm/mmap.c.orig create mode 100644 mm/mprotect.c.orig create mode 100644 mm/page_alloc.c.orig diff --git a/include/asm-um/pgtable.h.orig b/include/asm-um/pgtable.h.orig new file mode 100644 index 000000000..148dd8e42 --- /dev/null +++ b/include/asm-um/pgtable.h.orig @@ -0,0 +1,415 @@ +/* + * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) + * Derived from include/asm-i386/pgtable.h + * Licensed under the GPL + */ + +#ifndef __UM_PGTABLE_H +#define __UM_PGTABLE_H + +#include "linux/sched.h" +#include "asm/processor.h" +#include "asm/page.h" +#include "asm/fixmap.h" + +extern pgd_t swapper_pg_dir[1024]; + +extern void *um_virt_to_phys(struct task_struct *task, unsigned long virt, + pte_t *pte_out); + +/* zero page used for uninitialized stuff */ +extern unsigned long *empty_zero_page; + +#define pgtable_cache_init() do ; while (0) + +/* PMD_SHIFT determines the size of the area a second-level page table can map */ +#define PMD_SHIFT 22 +#define PMD_SIZE (1UL << PMD_SHIFT) +#define PMD_MASK (~(PMD_SIZE-1)) + +/* PGDIR_SHIFT determines what a third-level page table entry can map */ +#define PGDIR_SHIFT 22 +#define PGDIR_SIZE (1UL << PGDIR_SHIFT) +#define PGDIR_MASK (~(PGDIR_SIZE-1)) + +/* + * entries per page directory level: the i386 is two-level, so + * we don't really have any PMD directory physically. + */ +#define PTRS_PER_PTE 1024 +#define PTRS_PER_PMD 1 +#define PTRS_PER_PGD 1024 +#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) +#define FIRST_USER_PGD_NR 0 + +#define pte_ERROR(e) \ + printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e)) +#define pmd_ERROR(e) \ + printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e)) +#define pgd_ERROR(e) \ + printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) + +/* + * pgd entries used up by user/kernel: + */ + +#define USER_PGD_PTRS (TASK_SIZE >> PGDIR_SHIFT) +#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS) + +#ifndef __ASSEMBLY__ +/* Just any arbitrary offset to the start of the vmalloc VM area: the + * current 8MB value just means that there will be a 8MB "hole" after the + * physical memory until the kernel virtual memory starts. That means that + * any out-of-bounds memory accesses will hopefully be caught. + * The vmalloc() routines leaves a hole of 4kB between each vmalloced + * area for the same reason. ;) + */ + +extern unsigned long high_physmem; + +#define VMALLOC_OFFSET (__va_space) +#define VMALLOC_START (((unsigned long) high_physmem + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)) + +#ifdef CONFIG_HIGHMEM +# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) +#else +# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) +#endif + +#define _PAGE_PRESENT 0x001 +#define _PAGE_NEWPAGE 0x002 +#define _PAGE_PROTNONE 0x004 /* If not present */ +#define _PAGE_RW 0x008 +#define _PAGE_USER 0x010 +#define _PAGE_ACCESSED 0x020 +#define _PAGE_DIRTY 0x040 +#define _PAGE_NEWPROT 0x080 + +#define REGION_MASK 0xf0000000 +#define REGION_SHIFT 28 + +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +#define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) + +#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) +#define PAGE_COPY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) +#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) +#define PAGE_KERNEL __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) +#define PAGE_KERNEL_RO __pgprot(_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED) + +/* + * The i386 can't do page protection for execute, and considers that the same are read. + * Also, write permissions imply read permissions. This is the closest we can get.. + */ +#define __P000 PAGE_NONE +#define __P001 PAGE_READONLY +#define __P010 PAGE_COPY +#define __P011 PAGE_COPY +#define __P100 PAGE_READONLY +#define __P101 PAGE_READONLY +#define __P110 PAGE_COPY +#define __P111 PAGE_COPY + +#define __S000 PAGE_NONE +#define __S001 PAGE_READONLY +#define __S010 PAGE_SHARED +#define __S011 PAGE_SHARED +#define __S100 PAGE_READONLY +#define __S101 PAGE_READONLY +#define __S110 PAGE_SHARED +#define __S111 PAGE_SHARED + +/* + * Define this if things work differently on an i386 and an i486: + * it will (on an i486) warn about kernel memory accesses that are + * done without a 'verify_area(VERIFY_WRITE,..)' + */ +#undef TEST_VERIFY_AREA + +/* page table for 0-4MB for everybody */ +extern unsigned long pg0[1024]; + +/* + * BAD_PAGETABLE is used when we need a bogus page-table, while + * BAD_PAGE is used for a bogus page. + * + * ZERO_PAGE is a global shared page that is always zero: used + * for zero-mapped memory areas etc.. + */ +extern pte_t __bad_page(void); +extern pte_t * __bad_pagetable(void); + +#define BAD_PAGETABLE __bad_pagetable() +#define BAD_PAGE __bad_page() +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) + +/* number of bits that fit into a memory pointer */ +#define BITS_PER_PTR (8*sizeof(unsigned long)) + +/* to align the pointer to a pointer address */ +#define PTR_MASK (~(sizeof(void*)-1)) + +/* sizeof(void*)==1<>(PAGE_SHIFT-SIZEOF_PTR_LOG2)&PTR_MASK&~PAGE_MASK) + +#define pte_none(x) !(pte_val(x) & ~_PAGE_NEWPAGE) +#define pte_present(x) (pte_val(x) & (_PAGE_PRESENT | _PAGE_PROTNONE)) + +#define pte_clear(xp) do { pte_val(*(xp)) = _PAGE_NEWPAGE; } while (0) + +#define phys_region_index(x) (((x) & REGION_MASK) >> REGION_SHIFT) +#define pte_region_index(x) phys_region_index(pte_val(x)) + +#define pmd_none(x) (!(pmd_val(x) & ~_PAGE_NEWPAGE)) +#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) +#define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) +#define pmd_clear(xp) do { pmd_val(*(xp)) = _PAGE_NEWPAGE; } while (0) + +#define pmd_newpage(x) (pmd_val(x) & _PAGE_NEWPAGE) +#define pmd_mkuptodate(x) (pmd_val(x) &= ~_PAGE_NEWPAGE) + +/* + * The "pgd_xxx()" functions here are trivial for a folded two-level + * setup: the pgd is never bad, and a pmd always exists (as it's folded + * into the pgd entry) + */ +static inline int pgd_none(pgd_t pgd) { return 0; } +static inline int pgd_bad(pgd_t pgd) { return 0; } +static inline int pgd_present(pgd_t pgd) { return 1; } +static inline void pgd_clear(pgd_t * pgdp) { } + + +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) + +extern struct page *pte_mem_map(pte_t pte); +extern struct page *phys_mem_map(unsigned long phys); +extern unsigned long phys_to_pfn(unsigned long p); +extern unsigned long pfn_to_phys(unsigned long pfn); + +#define pte_page(x) pfn_to_page(pte_pfn(x)) +#define pte_address(x) (__va(pte_val(x) & PAGE_MASK)) +#define mk_phys(a, r) ((a) + (r << REGION_SHIFT)) +#define phys_addr(p) ((p) & ~REGION_MASK) +#define phys_page(p) (phys_mem_map(p) + ((phys_addr(p)) >> PAGE_SHIFT)) +#define pte_pfn(x) phys_to_pfn(pte_val(x)) +#define pfn_pte(pfn, prot) __pte(pfn_to_phys(pfn) | pgprot_val(prot)) +#define pfn_pmd(pfn, prot) __pmd(pfn_to_phys(pfn) | pgprot_val(prot)) + +static inline pte_t pte_mknewprot(pte_t pte) +{ + pte_val(pte) |= _PAGE_NEWPROT; + return(pte); +} + +static inline pte_t pte_mknewpage(pte_t pte) +{ + pte_val(pte) |= _PAGE_NEWPAGE; + return(pte); +} + +static inline void set_pte(pte_t *pteptr, pte_t pteval) +{ + /* If it's a swap entry, it needs to be marked _PAGE_NEWPAGE so + * fix_range knows to unmap it. _PAGE_NEWPROT is specific to + * mapped pages. + */ + *pteptr = pte_mknewpage(pteval); + if(pte_present(*pteptr)) *pteptr = pte_mknewprot(*pteptr); +} + +/* + * (pmds are folded into pgds so this doesn't get actually called, + * but the define is needed for a generic inline function.) + */ +#define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval) +#define set_pgd(pgdptr, pgdval) (*(pgdptr) = pgdval) + +/* + * The following only work if pte_present() is true. + * Undefined behaviour if not.. + */ +static inline int pte_read(pte_t pte) +{ + return((pte_val(pte) & _PAGE_USER) && + !(pte_val(pte) & _PAGE_PROTNONE)); +} + +static inline int pte_exec(pte_t pte){ + return((pte_val(pte) & _PAGE_USER) && + !(pte_val(pte) & _PAGE_PROTNONE)); +} + +static inline int pte_write(pte_t pte) +{ + return((pte_val(pte) & _PAGE_RW) && + !(pte_val(pte) & _PAGE_PROTNONE)); +} + +static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } +static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } +static inline int pte_newpage(pte_t pte) { return pte_val(pte) & _PAGE_NEWPAGE; } +static inline int pte_newprot(pte_t pte) +{ + return(pte_present(pte) && (pte_val(pte) & _PAGE_NEWPROT)); +} + +static inline pte_t pte_rdprotect(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_USER; + return(pte_mknewprot(pte)); +} + +static inline pte_t pte_exprotect(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_USER; + return(pte_mknewprot(pte)); +} + +static inline pte_t pte_mkclean(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_DIRTY; + return(pte); +} + +static inline pte_t pte_mkold(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_ACCESSED; + return(pte); +} + +static inline pte_t pte_wrprotect(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_RW; + return(pte_mknewprot(pte)); +} + +static inline pte_t pte_mkread(pte_t pte) +{ + pte_val(pte) |= _PAGE_USER; + return(pte_mknewprot(pte)); +} + +static inline pte_t pte_mkexec(pte_t pte) +{ + pte_val(pte) |= _PAGE_USER; + return(pte_mknewprot(pte)); +} + +static inline pte_t pte_mkdirty(pte_t pte) +{ + pte_val(pte) |= _PAGE_DIRTY; + return(pte); +} + +static inline pte_t pte_mkyoung(pte_t pte) +{ + pte_val(pte) |= _PAGE_ACCESSED; + return(pte); +} + +static inline pte_t pte_mkwrite(pte_t pte) +{ + pte_val(pte) |= _PAGE_RW; + return(pte_mknewprot(pte)); +} + +static inline pte_t pte_mkuptodate(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_NEWPAGE; + if(pte_present(pte)) pte_val(pte) &= ~_PAGE_NEWPROT; + return(pte); +} + +extern unsigned long page_to_phys(struct page *page); + +/* + * Conversion functions: convert a page and protection to a page entry, + * and a page entry and page directory to the page they refer to. + */ + +#define mk_pte(page, pgprot) \ +({ \ + pte_t __pte; \ + \ + pte_val(__pte) = page_to_phys(page) + pgprot_val(pgprot);\ + if(pte_present(__pte)) pte_mknewprot(pte_mknewpage(__pte)); \ + __pte; \ +}) + +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) +{ + pte_val(pte) = (pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot); + if(pte_present(pte)) pte = pte_mknewpage(pte_mknewprot(pte)); + return pte; +} + +#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) +#define pmd_page(pmd) (phys_mem_map(pmd_val(pmd) & PAGE_MASK) + \ + ((phys_addr(pmd_val(pmd)) >> PAGE_SHIFT))) + +/* to find an entry in a page-table-directory. */ +#define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) + +/* to find an entry in a page-table-directory */ +#define pgd_offset(mm, address) \ +((mm)->pgd + ((address) >> PGDIR_SHIFT)) + +/* to find an entry in a kernel page-table-directory */ +#define pgd_offset_k(address) pgd_offset(&init_mm, address) + +#define pmd_index(address) \ + (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) + +/* Find an entry in the second-level page table.. */ +static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) +{ + return (pmd_t *) dir; +} + +/* Find an entry in the third-level page table.. */ +#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +#define pte_offset_kernel(dir, address) \ + ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address)) +#define pte_offset_map(dir, address) \ + ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) +#define pte_offset_map_nested(dir, address) \ + ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) + pte_index(address)) +#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0) +#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) + +#define update_mmu_cache(vma,address,pte) do ; while (0) + +/* Encode and de-code a swap entry */ +#define __swp_type(x) (((x).val >> 3) & 0x7f) +#define __swp_offset(x) ((x).val >> 10) + +#define __swp_entry(type, offset) \ + ((swp_entry_t) { ((type) << 3) | ((offset) << 10) }) +#define __pte_to_swp_entry(pte) \ + ((swp_entry_t) { pte_val(pte_mkuptodate(pte)) }) +#define __swp_entry_to_pte(x) ((pte_t) { (x).val }) + +#define kern_addr_valid(addr) (1) + +#include + +#endif + +#endif +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff --git a/include/asm-um/unistd.h.orig b/include/asm-um/unistd.h.orig new file mode 100644 index 000000000..5850620bb --- /dev/null +++ b/include/asm-um/unistd.h.orig @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#ifndef _UM_UNISTD_H_ +#define _UM_UNISTD_H_ + +#include +#include "linux/resource.h" +#include "asm/uaccess.h" + +extern int um_execve(const char *file, char *const argv[], char *const env[]); + +#ifdef __KERNEL__ +#define __ARCH_WANT_IPC_PARSE_VERSION +#define __ARCH_WANT_OLD_READDIR +#define __ARCH_WANT_OLD_STAT +#define __ARCH_WANT_STAT64 +#define __ARCH_WANT_SYS_ALARM +#define __ARCH_WANT_SYS_GETHOSTNAME +#define __ARCH_WANT_SYS_PAUSE +#define __ARCH_WANT_SYS_SGETMASK +#define __ARCH_WANT_SYS_SIGNAL +#define __ARCH_WANT_SYS_TIME +#define __ARCH_WANT_SYS_UTIME +#define __ARCH_WANT_SYS_WAITPID +#define __ARCH_WANT_SYS_SOCKETCALL +#define __ARCH_WANT_SYS_FADVISE64 +#define __ARCH_WANT_SYS_GETPGRP +#define __ARCH_WANT_SYS_LLSEEK +#define __ARCH_WANT_SYS_NICE +#define __ARCH_WANT_SYS_OLD_GETRLIMIT +#define __ARCH_WANT_SYS_OLDUMOUNT +#define __ARCH_WANT_SYS_SIGPENDING +#define __ARCH_WANT_SYS_SIGPROCMASK +#define __ARCH_WANT_SYS_RT_SIGACTION +#endif + +#ifdef __KERNEL_SYSCALLS__ + +#include +#include + +#define KERNEL_CALL(ret_t, sys, args...) \ + mm_segment_t fs = get_fs(); \ + ret_t ret; \ + set_fs(KERNEL_DS); \ + ret = sys(args); \ + set_fs(fs); \ + return ret; + +static inline long open(const char *pathname, int flags, int mode) +{ + KERNEL_CALL(int, sys_open, pathname, flags, mode) +} + +static inline long dup(unsigned int fd) +{ + KERNEL_CALL(int, sys_dup, fd); +} + +static inline long close(unsigned int fd) +{ + KERNEL_CALL(int, sys_close, fd); +} + +static inline int execve(const char *filename, char *const argv[], + char *const envp[]) +{ + KERNEL_CALL(int, um_execve, filename, argv, envp); +} + +static inline long waitpid(pid_t pid, unsigned int *status, int options) +{ + KERNEL_CALL(pid_t, sys_wait4, pid, status, options, NULL) +} + +static inline pid_t setsid(void) +{ + KERNEL_CALL(pid_t, sys_setsid) +} + +static inline long lseek(unsigned int fd, off_t offset, unsigned int whence) +{ + KERNEL_CALL(long, sys_lseek, fd, offset, whence) +} + +static inline int read(unsigned int fd, char * buf, int len) +{ + KERNEL_CALL(int, sys_read, fd, buf, len) +} + +static inline int write(unsigned int fd, char * buf, int len) +{ + KERNEL_CALL(int, sys_write, fd, buf, len) +} + +long sys_mmap2(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff); +int sys_execve(char *file, char **argv, char **env); +long sys_clone(unsigned long clone_flags, unsigned long newsp, + int *parent_tid, int *child_tid); +long sys_fork(void); +long sys_vfork(void); +int sys_pipe(unsigned long *fildes); +int sys_ptrace(long request, long pid, long addr, long data); +struct sigaction; +asmlinkage long sys_rt_sigaction(int sig, + const struct sigaction __user *act, + struct sigaction __user *oact, + size_t sigsetsize); + +#endif + +/* Save the value of __KERNEL_SYSCALLS__, undefine it, include the underlying + * arch's unistd.h for the system call numbers, and restore the old + * __KERNEL_SYSCALLS__. + */ + +#ifdef __KERNEL_SYSCALLS__ +#define __SAVE_KERNEL_SYSCALLS__ __KERNEL_SYSCALLS__ +#endif + +#undef __KERNEL_SYSCALLS__ +#include "asm/arch/unistd.h" + +#ifdef __KERNEL_SYSCALLS__ +#define __KERNEL_SYSCALLS__ __SAVE_KERNEL_SYSCALLS__ +#endif + +#endif + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff --git a/include/linux/gfp.h.orig b/include/linux/gfp.h.orig new file mode 100644 index 000000000..8980d1fd7 --- /dev/null +++ b/include/linux/gfp.h.orig @@ -0,0 +1,128 @@ +#ifndef __LINUX_GFP_H +#define __LINUX_GFP_H + +#include +#include +#include +#include + +struct vm_area_struct; + +/* + * GFP bitmasks.. + */ +/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low two bits) */ +#define __GFP_DMA 0x01 +#define __GFP_HIGHMEM 0x02 + +/* + * Action modifiers - doesn't change the zoning + * + * __GFP_REPEAT: Try hard to allocate the memory, but the allocation attempt + * _might_ fail. This depends upon the particular VM implementation. + * + * __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller + * cannot handle allocation failures. + * + * __GFP_NORETRY: The VM implementation must not retry indefinitely. + */ +#define __GFP_WAIT 0x10 /* Can wait and reschedule? */ +#define __GFP_HIGH 0x20 /* Should access emergency pools? */ +#define __GFP_IO 0x40 /* Can start physical IO? */ +#define __GFP_FS 0x80 /* Can call down to low-level FS? */ +#define __GFP_COLD 0x100 /* Cache-cold page required */ +#define __GFP_NOWARN 0x200 /* Suppress page allocation failure warning */ +#define __GFP_REPEAT 0x400 /* Retry the allocation. Might fail */ +#define __GFP_NOFAIL 0x800 /* Retry for ever. Cannot fail */ +#define __GFP_NORETRY 0x1000 /* Do not retry. Might fail */ +#define __GFP_NO_GROW 0x2000 /* Slab internal usage */ +#define __GFP_COMP 0x4000 /* Add compound page metadata */ + +#define __GFP_BITS_SHIFT 16 /* Room for 16 __GFP_FOO bits */ +#define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1) + +/* if you forget to add the bitmask here kernel will crash, period */ +#define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \ + __GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \ + __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP) + +#define GFP_ATOMIC (__GFP_HIGH) +#define GFP_NOIO (__GFP_WAIT) +#define GFP_NOFS (__GFP_WAIT | __GFP_IO) +#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS) +#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS) +#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM) + +/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some + platforms, used as appropriate on others */ + +#define GFP_DMA __GFP_DMA + + +/* + * There is only one page-allocator function, and two main namespaces to + * it. The alloc_page*() variants return 'struct page *' and as such + * can allocate highmem pages, the *get*page*() variants return + * virtual kernel addresses to the allocated page(s). + */ + +/* + * We get the zone list from the current node and the gfp_mask. + * This zone list contains a maximum of MAXNODES*MAX_NR_ZONES zones. + * + * For the normal case of non-DISCONTIGMEM systems the NODE_DATA() gets + * optimized to &contig_page_data at compile-time. + */ +extern struct page * +FASTCALL(__alloc_pages(unsigned int, unsigned int, struct zonelist *)); + +static inline struct page *alloc_pages_node(int nid, unsigned int gfp_mask, + unsigned int order) +{ + if (unlikely(order >= MAX_ORDER)) + return NULL; + + return __alloc_pages(gfp_mask, order, + NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK)); +} + +#ifdef CONFIG_NUMA +extern struct page *alloc_pages_current(unsigned gfp_mask, unsigned order); + +static inline struct page * +alloc_pages(unsigned int gfp_mask, unsigned int order) +{ + if (unlikely(order >= MAX_ORDER)) + return NULL; + + return alloc_pages_current(gfp_mask, order); +} +extern struct page *alloc_page_vma(unsigned gfp_mask, + struct vm_area_struct *vma, unsigned long addr); +#else +#define alloc_pages(gfp_mask, order) \ + alloc_pages_node(numa_node_id(), gfp_mask, order) +#define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0) +#endif +#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) + +extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order)); +extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask)); + +#define __get_free_page(gfp_mask) \ + __get_free_pages((gfp_mask),0) + +#define __get_dma_pages(gfp_mask, order) \ + __get_free_pages((gfp_mask) | GFP_DMA,(order)) + +extern void FASTCALL(__free_pages(struct page *page, unsigned int order)); +extern void FASTCALL(free_pages(unsigned long addr, unsigned int order)); +extern void FASTCALL(free_hot_page(struct page *page)); +extern void FASTCALL(free_cold_page(struct page *page)); + +#define __free_page(page) __free_pages((page), 0) +#define free_page(addr) free_pages((addr),0) + +void page_alloc_init(void); + +#endif /* __LINUX_GFP_H */ diff --git a/include/linux/mm.h.orig b/include/linux/mm.h.orig new file mode 100644 index 000000000..8f8a8a3a3 --- /dev/null +++ b/include/linux/mm.h.orig @@ -0,0 +1,729 @@ +#ifndef _LINUX_MM_H +#define _LINUX_MM_H + +#include +#include + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include +#include + +struct mempolicy; +struct anon_vma; + +#ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ +extern unsigned long max_mapnr; +#endif + +extern unsigned long num_physpages; +extern void * high_memory; +extern int page_cluster; + +#include +#include +#include +#include + +#ifndef MM_VM_SIZE +#define MM_VM_SIZE(mm) TASK_SIZE +#endif + +/* + * Linux kernel virtual memory manager primitives. + * The idea being to have a "virtual" mm in the same way + * we have a virtual fs - giving a cleaner interface to the + * mm details, and allowing different kinds of memory mappings + * (from shared memory to executable loading to arbitrary + * mmap() functions). + */ + +/* + * This struct defines a memory VMM memory area. There is one of these + * per VM-area/task. A VM area is any part of the process virtual memory + * space that has a special rule for the page-fault handlers (ie a shared + * library, the executable area etc). + */ +struct vm_area_struct { + struct mm_struct * vm_mm; /* The address space we belong to. */ + unsigned long vm_start; /* Our start address within vm_mm. */ + unsigned long vm_end; /* The first byte after our end address + within vm_mm. */ + + /* linked list of VM areas per task, sorted by address */ + struct vm_area_struct *vm_next; + + pgprot_t vm_page_prot; /* Access permissions of this VMA. */ + unsigned long vm_flags; /* Flags, listed below. */ + + struct rb_node vm_rb; + + /* + * For areas with an address space and backing store, + * linkage into the address_space->i_mmap prio tree, or + * linkage to the list of like vmas hanging off its node, or + * linkage of vma in the address_space->i_mmap_nonlinear list. + */ + union { + struct { + struct list_head list; + void *parent; /* aligns with prio_tree_node parent */ + struct vm_area_struct *head; + } vm_set; + + struct prio_tree_node prio_tree_node; + } shared; + + /* + * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma + * list, after a COW of one of the file pages. A MAP_SHARED vma + * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack + * or brk vma (with NULL file) can only be in an anon_vma list. + */ + struct list_head anon_vma_node; /* Serialized by anon_vma->lock */ + struct anon_vma *anon_vma; /* Serialized by page_table_lock */ + + /* Function pointers to deal with this struct. */ + struct vm_operations_struct * vm_ops; + + /* Information about our backing store: */ + unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE + units, *not* PAGE_CACHE_SIZE */ + struct file * vm_file; /* File we map to (can be NULL). */ + void * vm_private_data; /* was vm_pte (shared mem) */ + +#ifdef CONFIG_NUMA + struct mempolicy *vm_policy; /* NUMA policy for the VMA */ +#endif +}; + +/* + * vm_flags.. + */ +#define VM_READ 0x00000001 /* currently active flags */ +#define VM_WRITE 0x00000002 +#define VM_EXEC 0x00000004 +#define VM_SHARED 0x00000008 + +#define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ +#define VM_MAYWRITE 0x00000020 +#define VM_MAYEXEC 0x00000040 +#define VM_MAYSHARE 0x00000080 + +#define VM_GROWSDOWN 0x00000100 /* general info on the segment */ +#define VM_GROWSUP 0x00000200 +#define VM_SHM 0x00000400 /* shared memory area, don't swap out */ +#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ + +#define VM_EXECUTABLE 0x00001000 +#define VM_LOCKED 0x00002000 +#define VM_IO 0x00004000 /* Memory mapped I/O or similar */ + + /* Used by sys_madvise() */ +#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ +#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ + +#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ +#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ +#define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */ +#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ +#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ +#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ + +#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ +#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS +#endif + +#ifdef CONFIG_STACK_GROWSUP +#define VM_STACK_FLAGS (VM_GROWSUP | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) +#else +#define VM_STACK_FLAGS (VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) +#endif + +#define VM_READHINTMASK (VM_SEQ_READ | VM_RAND_READ) +#define VM_ClearReadHint(v) (v)->vm_flags &= ~VM_READHINTMASK +#define VM_NormalReadHint(v) (!((v)->vm_flags & VM_READHINTMASK)) +#define VM_SequentialReadHint(v) ((v)->vm_flags & VM_SEQ_READ) +#define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ) + +/* + * mapping from the currently active vm_flags protection bits (the + * low four bits) to a page protection mask.. + */ +extern pgprot_t protection_map[16]; + + +/* + * These are the virtual MM functions - opening of an area, closing and + * unmapping it (needed to keep files on disk up-to-date etc), pointer + * to the functions called when a no-page or a wp-page exception occurs. + */ +struct vm_operations_struct { + void (*open)(struct vm_area_struct * area); + void (*close)(struct vm_area_struct * area); + struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type); + int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock); +#ifdef CONFIG_NUMA + int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); + struct mempolicy *(*get_policy)(struct vm_area_struct *vma, + unsigned long addr); +#endif +}; + +struct mmu_gather; +struct inode; + +#ifdef ARCH_HAS_ATOMIC_UNSIGNED +typedef unsigned page_flags_t; +#else +typedef unsigned long page_flags_t; +#endif + +/* + * Each physical page in the system has a struct page associated with + * it to keep track of whatever it is we are using the page for at the + * moment. Note that we have no way to track which tasks are using + * a page. + */ +struct page { + page_flags_t flags; /* Atomic flags, some possibly + * updated asynchronously */ + atomic_t _count; /* Usage count, see below. */ + unsigned int mapcount; /* Count of ptes mapped in mms, + * to show when page is mapped + * & limit reverse map searches, + * protected by PG_maplock. + */ + unsigned long private; /* Mapping-private opaque data: + * usually used for buffer_heads + * if PagePrivate set; used for + * swp_entry_t if PageSwapCache + */ + struct address_space *mapping; /* If PG_anon clear, points to + * inode address_space, or NULL. + * If page mapped as anonymous + * memory, PG_anon is set, and + * it points to anon_vma object. + */ + pgoff_t index; /* Our offset within mapping. */ + struct list_head lru; /* Pageout list, eg. active_list + * protected by zone->lru_lock ! + */ + /* + * On machines where all RAM is mapped into kernel address space, + * we can simply calculate the virtual address. On machines with + * highmem some memory is mapped into kernel virtual memory + * dynamically, so we need a place to store that address. + * Note that this field could be 16 bits on x86 ... ;) + * + * Architectures with slow multiplication can define + * WANT_PAGE_VIRTUAL in asm/page.h + */ +#if defined(WANT_PAGE_VIRTUAL) + void *virtual; /* Kernel virtual address (NULL if + not kmapped, ie. highmem) */ +#endif /* WANT_PAGE_VIRTUAL */ +}; + +/* + * FIXME: take this include out, include page-flags.h in + * files which need it (119 of them) + */ +#include + +/* + * Methods to modify the page usage count. + * + * What counts for a page usage: + * - cache mapping (page->mapping) + * - private data (page->private) + * - page mapped in a task's page tables, each mapping + * is counted separately + * + * Also, many kernel routines increase the page count before a critical + * routine so they can be sure the page doesn't go away from under them. + * + * Since 2.6.6 (approx), a free page has ->_count = -1. This is so that we + * can use atomic_add_negative(-1, page->_count) to detect when the page + * becomes free and so that we can also use atomic_inc_and_test to atomically + * detect when we just tried to grab a ref on a page which some other CPU has + * already deemed to be freeable. + * + * NO code should make assumptions about this internal detail! Use the provided + * macros which retain the old rules: page_count(page) == 0 is a free page. + */ + +/* + * Drop a ref, return true if the logical refcount fell to zero (the page has + * no users) + */ +#define put_page_testzero(p) \ + ({ \ + BUG_ON(page_count(p) == 0); \ + atomic_add_negative(-1, &(p)->_count); \ + }) + +/* + * Grab a ref, return true if the page previously had a logical refcount of + * zero. ie: returns true if we just grabbed an already-deemed-to-be-free page + */ +#define get_page_testone(p) atomic_inc_and_test(&(p)->_count) + +#define set_page_count(p,v) atomic_set(&(p)->_count, v - 1) +#define __put_page(p) atomic_dec(&(p)->_count) + +extern void FASTCALL(__page_cache_release(struct page *)); + +#ifdef CONFIG_HUGETLB_PAGE + +static inline int page_count(struct page *p) +{ + if (PageCompound(p)) + p = (struct page *)p->private; + return atomic_read(&(p)->_count) + 1; +} + +static inline void get_page(struct page *page) +{ + if (unlikely(PageCompound(page))) + page = (struct page *)page->private; + atomic_inc(&page->_count); +} + +void put_page(struct page *page); + +#else /* CONFIG_HUGETLB_PAGE */ + +#define page_count(p) (atomic_read(&(p)->_count) + 1) + +static inline void get_page(struct page *page) +{ + atomic_inc(&page->_count); +} + +static inline void put_page(struct page *page) +{ + if (!PageReserved(page) && put_page_testzero(page)) + __page_cache_release(page); +} + +#endif /* CONFIG_HUGETLB_PAGE */ + +/* + * Multiple processes may "see" the same page. E.g. for untouched + * mappings of /dev/null, all processes see the same page full of + * zeroes, and text pages of executables and shared libraries have + * only one copy in memory, at most, normally. + * + * For the non-reserved pages, page_count(page) denotes a reference count. + * page_count() == 0 means the page is free. + * page_count() == 1 means the page is used for exactly one purpose + * (e.g. a private data page of one process). + * + * A page may be used for kmalloc() or anyone else who does a + * __get_free_page(). In this case the page_count() is at least 1, and + * all other fields are unused but should be 0 or NULL. The + * management of this page is the responsibility of the one who uses + * it. + * + * The other pages (we may call them "process pages") are completely + * managed by the Linux memory manager: I/O, buffers, swapping etc. + * The following discussion applies only to them. + * + * A page may belong to an inode's memory mapping. In this case, + * page->mapping is the pointer to the inode, and page->index is the + * file offset of the page, in units of PAGE_CACHE_SIZE. + * + * A page contains an opaque `private' member, which belongs to the + * page's address_space. Usually, this is the address of a circular + * list of the page's disk buffers. + * + * For pages belonging to inodes, the page_count() is the number of + * attaches, plus 1 if `private' contains something, plus one for + * the page cache itself. + * + * All pages belonging to an inode are in these doubly linked lists: + * mapping->clean_pages, mapping->dirty_pages and mapping->locked_pages; + * using the page->list list_head. These fields are also used for + * freelist managemet (when page_count()==0). + * + * There is also a per-mapping radix tree mapping index to the page + * in memory if present. The tree is rooted at mapping->root. + * + * All process pages can do I/O: + * - inode pages may need to be read from disk, + * - inode pages which have been modified and are MAP_SHARED may need + * to be written to disk, + * - private pages which have been modified may need to be swapped out + * to swap space and (later) to be read back into memory. + */ + +/* + * The zone field is never updated after free_area_init_core() + * sets it, so none of the operations on it need to be atomic. + * We'll have up to (MAX_NUMNODES * MAX_NR_ZONES) zones total, + * so we use (MAX_NODES_SHIFT + MAX_ZONES_SHIFT) here to get enough bits. + */ +#define NODEZONE_SHIFT (sizeof(page_flags_t)*8 - MAX_NODES_SHIFT - MAX_ZONES_SHIFT) +#define NODEZONE(node, zone) ((node << ZONES_SHIFT) | zone) + +static inline unsigned long page_zonenum(struct page *page) +{ + return (page->flags >> NODEZONE_SHIFT) & (~(~0UL << ZONES_SHIFT)); +} +static inline unsigned long page_to_nid(struct page *page) +{ + return (page->flags >> (NODEZONE_SHIFT + ZONES_SHIFT)); +} + +struct zone; +extern struct zone *zone_table[]; + +static inline struct zone *page_zone(struct page *page) +{ + return zone_table[page->flags >> NODEZONE_SHIFT]; +} + +static inline void set_page_zone(struct page *page, unsigned long nodezone_num) +{ + page->flags &= ~(~0UL << NODEZONE_SHIFT); + page->flags |= nodezone_num << NODEZONE_SHIFT; +} + +#ifndef CONFIG_DISCONTIGMEM +/* The array of struct pages - for discontigmem use pgdat->lmem_map */ +extern struct page *mem_map; +#endif + +static inline void *lowmem_page_address(struct page *page) +{ + return __va(page_to_pfn(page) << PAGE_SHIFT); +} + +#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) +#define HASHED_PAGE_VIRTUAL +#endif + +#if defined(WANT_PAGE_VIRTUAL) +#define page_address(page) ((page)->virtual) +#define set_page_address(page, address) \ + do { \ + (page)->virtual = (address); \ + } while(0) +#define page_address_init() do { } while(0) +#endif + +#if defined(HASHED_PAGE_VIRTUAL) +void *page_address(struct page *page); +void set_page_address(struct page *page, void *virtual); +void page_address_init(void); +#endif + +#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) +#define page_address(page) lowmem_page_address(page) +#define set_page_address(page, address) do { } while(0) +#define page_address_init() do { } while(0) +#endif + +/* + * On an anonymous page mapped into a user virtual memory area, + * page->mapping points to its anon_vma, not to a struct address_space. + * + * Please note that, confusingly, "page_mapping" refers to the inode + * address_space which maps the page from disk; whereas "page_mapped" + * refers to user virtual address space into which the page is mapped. + */ +extern struct address_space swapper_space; +static inline struct address_space *page_mapping(struct page *page) +{ + struct address_space *mapping = NULL; + + if (unlikely(PageSwapCache(page))) + mapping = &swapper_space; + else if (likely(!PageAnon(page))) + mapping = page->mapping; + return mapping; +} + +/* + * Return the pagecache index of the passed page. Regular pagecache pages + * use ->index whereas swapcache pages use ->private + */ +static inline pgoff_t page_index(struct page *page) +{ + if (unlikely(PageSwapCache(page))) + return page->private; + return page->index; +} + +/* + * Return true if this page is mapped into pagetables. + */ +static inline int page_mapped(struct page *page) +{ + return page->mapcount != 0; +} + +/* + * Error return values for the *_nopage functions + */ +#define NOPAGE_SIGBUS (NULL) +#define NOPAGE_OOM ((struct page *) (-1)) + +/* + * Different kinds of faults, as returned by handle_mm_fault(). + * Used to decide whether a process gets delivered SIGBUS or + * just gets major/minor fault counters bumped up. + */ +#define VM_FAULT_OOM (-1) +#define VM_FAULT_SIGBUS 0 +#define VM_FAULT_MINOR 1 +#define VM_FAULT_MAJOR 2 + +#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) + +extern void show_free_areas(void); + +struct page *shmem_nopage(struct vm_area_struct * vma, + unsigned long address, int *type); +int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new); +struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, + unsigned long addr); +struct file *shmem_file_setup(char * name, loff_t size, unsigned long flags); +void shmem_lock(struct file * file, int lock); +int shmem_zero_setup(struct vm_area_struct *); + +/* + * Parameter block passed down to zap_pte_range in exceptional cases. + */ +struct zap_details { + struct vm_area_struct *nonlinear_vma; /* Check page->index if set */ + struct address_space *check_mapping; /* Check page->mapping if set */ + pgoff_t first_index; /* Lowest page->index to unmap */ + pgoff_t last_index; /* Highest page->index to unmap */ + int atomic; /* May not schedule() */ +}; + +void zap_page_range(struct vm_area_struct *vma, unsigned long address, + unsigned long size, struct zap_details *); +int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, + struct vm_area_struct *start_vma, unsigned long start_addr, + unsigned long end_addr, unsigned long *nr_accounted, + struct zap_details *); +void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr); +int copy_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma); +int zeromap_page_range(struct vm_area_struct *vma, unsigned long from, + unsigned long size, pgprot_t prot); +void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows); + +static inline void unmap_shared_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen) +{ + unmap_mapping_range(mapping, holebegin, holelen, 0); +} + +extern int vmtruncate(struct inode * inode, loff_t offset); +extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); +extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); +extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); +extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot); +extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot); +extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); +extern int make_pages_present(unsigned long addr, unsigned long end); +extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); +void install_arg_page(struct vm_area_struct *, struct page *, unsigned long); + +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, + int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); + +int __set_page_dirty_buffers(struct page *page); +int __set_page_dirty_nobuffers(struct page *page); +int redirty_page_for_writepage(struct writeback_control *wbc, + struct page *page); +int FASTCALL(set_page_dirty(struct page *page)); +int set_page_dirty_lock(struct page *page); +int clear_page_dirty_for_io(struct page *page); + +/* + * Prototype to add a shrinker callback for ageable caches. + * + * These functions are passed a count `nr_to_scan' and a gfpmask. They should + * scan `nr_to_scan' objects, attempting to free them. + * + * The callback must the number of objects which remain in the cache. + * + * The callback will be passes nr_to_scan == 0 when the VM is querying the + * cache size, so a fastpath for that case is appropriate. + */ +typedef int (*shrinker_t)(int nr_to_scan, unsigned int gfp_mask); + +/* + * Add an aging callback. The int is the number of 'seeks' it takes + * to recreate one of the objects that these functions age. + */ + +#define DEFAULT_SEEKS 2 +struct shrinker; +extern struct shrinker *set_shrinker(int, shrinker_t); +extern void remove_shrinker(struct shrinker *shrinker); + +/* + * On a two-level page table, this ends up being trivial. Thus the + * inlining and the symmetry break with pte_alloc_map() that does all + * of this out-of-line. + */ +static inline pmd_t *pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + if (pgd_none(*pgd)) + return __pmd_alloc(mm, pgd, address); + return pmd_offset(pgd, address); +} + +extern void free_area_init(unsigned long * zones_size); +extern void free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, + unsigned long * zones_size, unsigned long zone_start_pfn, + unsigned long *zholes_size); +extern void memmap_init_zone(struct page *, unsigned long, int, + unsigned long, unsigned long); +extern void mem_init(void); +extern void show_mem(void); +extern void si_meminfo(struct sysinfo * val); +extern void si_meminfo_node(struct sysinfo *val, int nid); + +static inline void vma_prio_tree_init(struct vm_area_struct *vma) +{ + vma->shared.vm_set.list.next = NULL; + vma->shared.vm_set.list.prev = NULL; + vma->shared.vm_set.parent = NULL; + vma->shared.vm_set.head = NULL; +} + +/* prio_tree.c */ +void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); +void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *); +void vma_prio_tree_remove(struct vm_area_struct *, struct prio_tree_root *); +struct vm_area_struct *vma_prio_tree_next( + struct vm_area_struct *, struct prio_tree_root *, + struct prio_tree_iter *, pgoff_t begin, pgoff_t end); + +/* mmap.c */ +extern void vma_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert); +extern struct vm_area_struct *vma_merge(struct mm_struct *, + struct vm_area_struct *prev, unsigned long addr, unsigned long end, + unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, + struct mempolicy *); +extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); +extern int split_vma(struct mm_struct *, + struct vm_area_struct *, unsigned long addr, int new_below); +extern void insert_vm_struct(struct mm_struct *, struct vm_area_struct *); +extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, + struct rb_node **, struct rb_node *); +extern struct vm_area_struct *copy_vma(struct vm_area_struct **, + unsigned long addr, unsigned long len, pgoff_t pgoff); +extern void exit_mmap(struct mm_struct *); + +extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + +extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long pgoff); + +static inline unsigned long do_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long offset) +{ + unsigned long ret = -EINVAL; + if ((offset + PAGE_ALIGN(len)) < offset) + goto out; + if (!(offset & ~PAGE_MASK)) + ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); +out: + return ret; +} + +extern int do_munmap(struct mm_struct *, unsigned long, size_t); + +extern unsigned long do_brk(unsigned long, unsigned long); + +/* filemap.c */ +extern unsigned long page_unuse(struct page *); +extern void truncate_inode_pages(struct address_space *, loff_t); + +/* generic vm_area_ops exported for stackable file systems */ +struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *); + +/* mm/page-writeback.c */ +int write_one_page(struct page *page, int wait); + +/* readahead.c */ +#define VM_MAX_READAHEAD 128 /* kbytes */ +#define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ + +int do_page_cache_readahead(struct address_space *mapping, struct file *filp, + unsigned long offset, unsigned long nr_to_read); +int force_page_cache_readahead(struct address_space *mapping, struct file *filp, + unsigned long offset, unsigned long nr_to_read); +void page_cache_readahead(struct address_space *mapping, + struct file_ra_state *ra, + struct file *filp, + unsigned long offset); +void handle_ra_miss(struct address_space *mapping, + struct file_ra_state *ra, pgoff_t offset); +unsigned long max_sane_readahead(unsigned long nr); + +/* Do stack extension */ +extern int expand_stack(struct vm_area_struct * vma, unsigned long address); + +/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ +extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); +extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, + struct vm_area_struct **pprev); + +/* Look up the first VMA which intersects the interval start_addr..end_addr-1, + NULL if none. Assume start_addr < end_addr. */ +static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) +{ + struct vm_area_struct * vma = find_vma(mm,start_addr); + + if (vma && end_addr <= vma->vm_start) + vma = NULL; + return vma; +} + +static inline unsigned long vma_pages(struct vm_area_struct *vma) +{ + return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; +} + +extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr); + +extern unsigned int nr_used_zone_pages(void); + +extern struct page * vmalloc_to_page(void *addr); +extern struct page * follow_page(struct mm_struct *mm, unsigned long address, + int write); +extern int remap_page_range(struct vm_area_struct *vma, unsigned long from, + unsigned long to, unsigned long size, pgprot_t prot); + +#ifndef CONFIG_DEBUG_PAGEALLOC +static inline void +kernel_map_pages(struct page *page, int numpages, int enable) +{ +} +#endif + +#ifndef CONFIG_ARCH_GATE_AREA +extern struct vm_area_struct *get_gate_vma(struct task_struct *tsk); +int in_gate_area(struct task_struct *task, unsigned long addr); +#endif + +#endif /* __KERNEL__ */ +#endif /* _LINUX_MM_H */ diff --git a/mm/Makefile.orig b/mm/Makefile.orig new file mode 100644 index 000000000..d22feb38a --- /dev/null +++ b/mm/Makefile.orig @@ -0,0 +1,17 @@ +# +# Makefile for the linux memory manager. +# + +mmu-y := nommu.o +mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ + mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ + shmem.o vmalloc.o + +obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ + page_alloc.o page-writeback.o pdflush.o prio_tree.o \ + readahead.o slab.o swap.o truncate.o vmscan.o \ + $(mmu-y) + +obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o +obj-$(CONFIG_HUGETLBFS) += hugetlb.o +obj-$(CONFIG_NUMA) += mempolicy.o diff --git a/mm/mmap.c.orig b/mm/mmap.c.orig new file mode 100644 index 000000000..d6fd2fe13 --- /dev/null +++ b/mm/mmap.c.orig @@ -0,0 +1,1805 @@ +/* + * mm/mmap.c + * + * Written by obz. + * + * Address space accounting code + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * WARNING: the debugging will use recursive algorithms so never enable this + * unless you know what you are doing. + */ +#undef DEBUG_MM_RB + +/* description of effects of mapping type and prot in current implementation. + * this is due to the limited x86 page protection hardware. The expected + * behavior is in parens: + * + * map_type prot + * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC + * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (yes) yes w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (copy) copy w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + */ +pgprot_t protection_map[16] = { + __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, + __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 +}; + +int sysctl_overcommit_memory = 0; /* default is heuristic overcommit */ +int sysctl_overcommit_ratio = 50; /* default is 50% */ +int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; +atomic_t vm_committed_space = ATOMIC_INIT(0); + +EXPORT_SYMBOL(sysctl_overcommit_memory); +EXPORT_SYMBOL(sysctl_overcommit_ratio); +EXPORT_SYMBOL(sysctl_max_map_count); +EXPORT_SYMBOL(vm_committed_space); + +/* + * Requires inode->i_mapping->i_mmap_lock + */ +static void __remove_shared_vm_struct(struct vm_area_struct *vma, + struct file *file, struct address_space *mapping) +{ + if (vma->vm_flags & VM_DENYWRITE) + atomic_inc(&file->f_dentry->d_inode->i_writecount); + if (vma->vm_flags & VM_SHARED) + mapping->i_mmap_writable--; + + flush_dcache_mmap_lock(mapping); + if (unlikely(vma->vm_flags & VM_NONLINEAR)) + list_del_init(&vma->shared.vm_set.list); + else + vma_prio_tree_remove(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); +} + +/* + * Remove one vm structure and free it. + */ +static void remove_vm_struct(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + + if (file) { + struct address_space *mapping = file->f_mapping; + spin_lock(&mapping->i_mmap_lock); + __remove_shared_vm_struct(vma, file, mapping); + spin_unlock(&mapping->i_mmap_lock); + } + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + if (file) + fput(file); + anon_vma_unlink(vma); + mpol_free(vma_policy(vma)); + kmem_cache_free(vm_area_cachep, vma); +} + +/* + * sys_brk() for the most part doesn't need the global kernel + * lock, except when an application is doing something nasty + * like trying to un-brk an area that has already been mapped + * to a regular file. in this case, the unmapping will need + * to invoke file system routines that need the global lock. + */ +asmlinkage unsigned long sys_brk(unsigned long brk) +{ + unsigned long rlim, retval; + unsigned long newbrk, oldbrk; + struct mm_struct *mm = current->mm; + + down_write(&mm->mmap_sem); + + if (brk < mm->end_code) + goto out; + newbrk = PAGE_ALIGN(brk); + oldbrk = PAGE_ALIGN(mm->brk); + if (oldbrk == newbrk) + goto set_brk; + + /* Always allow shrinking brk. */ + if (brk <= mm->brk) { + if (!do_munmap(mm, newbrk, oldbrk-newbrk)) + goto set_brk; + goto out; + } + + /* Check against rlimit.. */ + rlim = current->rlim[RLIMIT_DATA].rlim_cur; + if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) + goto out; + + /* Check against existing mmap mappings. */ + if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) + goto out; + + /* Ok, looks good - let it rip. */ + if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) + goto out; +set_brk: + mm->brk = brk; +out: + retval = mm->brk; + up_write(&mm->mmap_sem); + return retval; +} + +#ifdef DEBUG_MM_RB +static int browse_rb(struct rb_root *root) +{ + int i = 0, j; + struct rb_node *nd, *pn = NULL; + unsigned long prev = 0, pend = 0; + + for (nd = rb_first(root); nd; nd = rb_next(nd)) { + struct vm_area_struct *vma; + vma = rb_entry(nd, struct vm_area_struct, vm_rb); + if (vma->vm_start < prev) + printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1; + if (vma->vm_start < pend) + printk("vm_start %lx pend %lx\n", vma->vm_start, pend); + if (vma->vm_start > vma->vm_end) + printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); + i++; + pn = nd; + } + j = 0; + for (nd = pn; nd; nd = rb_prev(nd)) { + j++; + } + if (i != j) + printk("backwards %d, forwards %d\n", j, i), i = 0; + return i; +} + +void validate_mm(struct mm_struct *mm) +{ + int bug = 0; + int i = 0; + struct vm_area_struct *tmp = mm->mmap; + while (tmp) { + tmp = tmp->vm_next; + i++; + } + if (i != mm->map_count) + printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; + i = browse_rb(&mm->mm_rb); + if (i != mm->map_count) + printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; + if (bug) + BUG(); +} +#else +#define validate_mm(mm) do { } while (0) +#endif + +static struct vm_area_struct * +find_vma_prepare(struct mm_struct *mm, unsigned long addr, + struct vm_area_struct **pprev, struct rb_node ***rb_link, + struct rb_node ** rb_parent) +{ + struct vm_area_struct * vma; + struct rb_node ** __rb_link, * __rb_parent, * rb_prev; + + __rb_link = &mm->mm_rb.rb_node; + rb_prev = __rb_parent = NULL; + vma = NULL; + + while (*__rb_link) { + struct vm_area_struct *vma_tmp; + + __rb_parent = *__rb_link; + vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); + + if (vma_tmp->vm_end > addr) { + vma = vma_tmp; + if (vma_tmp->vm_start <= addr) + return vma; + __rb_link = &__rb_parent->rb_left; + } else { + rb_prev = __rb_parent; + __rb_link = &__rb_parent->rb_right; + } + } + + *pprev = NULL; + if (rb_prev) + *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); + *rb_link = __rb_link; + *rb_parent = __rb_parent; + return vma; +} + +static inline void +__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev, struct rb_node *rb_parent) +{ + if (prev) { + vma->vm_next = prev->vm_next; + prev->vm_next = vma; + } else { + mm->mmap = vma; + if (rb_parent) + vma->vm_next = rb_entry(rb_parent, + struct vm_area_struct, vm_rb); + else + vma->vm_next = NULL; + } +} + +void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, + struct rb_node **rb_link, struct rb_node *rb_parent) +{ + rb_link_node(&vma->vm_rb, rb_parent, rb_link); + rb_insert_color(&vma->vm_rb, &mm->mm_rb); +} + +static inline void __vma_link_file(struct vm_area_struct *vma) +{ + struct file * file; + + file = vma->vm_file; + if (file) { + struct address_space *mapping = file->f_mapping; + + if (vma->vm_flags & VM_DENYWRITE) + atomic_dec(&file->f_dentry->d_inode->i_writecount); + if (vma->vm_flags & VM_SHARED) + mapping->i_mmap_writable++; + + flush_dcache_mmap_lock(mapping); + if (unlikely(vma->vm_flags & VM_NONLINEAR)) + list_add_tail(&vma->shared.vm_set.list, + &mapping->i_mmap_nonlinear); + else + vma_prio_tree_insert(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); + } +} + +static void +__vma_link(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev, struct rb_node **rb_link, + struct rb_node *rb_parent) +{ + __vma_link_list(mm, vma, prev, rb_parent); + __vma_link_rb(mm, vma, rb_link, rb_parent); + __anon_vma_link(vma); +} + +static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev, struct rb_node **rb_link, + struct rb_node *rb_parent) +{ + struct address_space *mapping = NULL; + + if (vma->vm_file) + mapping = vma->vm_file->f_mapping; + + if (mapping) + spin_lock(&mapping->i_mmap_lock); + anon_vma_lock(vma); + + __vma_link(mm, vma, prev, rb_link, rb_parent); + __vma_link_file(vma); + + anon_vma_unlock(vma); + if (mapping) + spin_unlock(&mapping->i_mmap_lock); + + mark_mm_hugetlb(mm, vma); + mm->map_count++; + validate_mm(mm); +} + +/* + * Helper for vma_adjust in the split_vma insert case: + * insert vm structure into list and rbtree and anon_vma, + * but it has already been inserted into prio_tree earlier. + */ +static void +__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +{ + struct vm_area_struct * __vma, * prev; + struct rb_node ** rb_link, * rb_parent; + + __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); + if (__vma && __vma->vm_start < vma->vm_end) + BUG(); + __vma_link(mm, vma, prev, rb_link, rb_parent); + mm->map_count++; +} + +static inline void +__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev) +{ + prev->vm_next = vma->vm_next; + rb_erase(&vma->vm_rb, &mm->mm_rb); + if (mm->mmap_cache == vma) + mm->mmap_cache = prev; +} + +/* + * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that + * is already present in an i_mmap tree without adjusting the tree. + * The following helper function should be used when such adjustments + * are necessary. The "insert" vma (if any) is to be inserted + * before we drop the necessary locks. + */ +void vma_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) +{ + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *next = vma->vm_next; + struct address_space *mapping = NULL; + struct prio_tree_root *root = NULL; + struct file *file = vma->vm_file; + struct anon_vma *anon_vma = NULL; + long adjust_next = 0; + int remove_next = 0; + + if (next && !insert) { + if (end >= next->vm_end) { + /* + * vma expands, overlapping all the next, and + * perhaps the one after too (mprotect case 6). + */ +again: remove_next = 1 + (end > next->vm_end); + end = next->vm_end; + anon_vma = next->anon_vma; + } else if (end > next->vm_start) { + /* + * vma expands, overlapping part of the next: + * mprotect case 5 shifting the boundary up. + */ + adjust_next = (end - next->vm_start) >> PAGE_SHIFT; + anon_vma = next->anon_vma; + } else if (end < vma->vm_end) { + /* + * vma shrinks, and !insert tells it's not + * split_vma inserting another: so it must be + * mprotect case 4 shifting the boundary down. + */ + adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); + anon_vma = next->anon_vma; + } + } + + if (file) { + mapping = file->f_mapping; + if (!(vma->vm_flags & VM_NONLINEAR)) + root = &mapping->i_mmap; + spin_lock(&mapping->i_mmap_lock); + if (insert) { + /* + * Put into prio_tree now, so instantiated pages + * are visible to arm/parisc __flush_dcache_page + * throughout; but we cannot insert into address + * space until vma start or end is updated. + */ + __vma_link_file(insert); + } + } + + /* + * When changing only vma->vm_end, we don't really need + * anon_vma lock: but is that case worth optimizing out? + */ + if (vma->anon_vma) + anon_vma = vma->anon_vma; + if (anon_vma) + spin_lock(&anon_vma->lock); + + if (root) { + flush_dcache_mmap_lock(mapping); + vma_prio_tree_remove(vma, root); + if (adjust_next) + vma_prio_tree_remove(next, root); + } + + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; + if (adjust_next) { + next->vm_start += adjust_next << PAGE_SHIFT; + next->vm_pgoff += adjust_next; + } + + if (root) { + if (adjust_next) { + vma_prio_tree_init(next); + vma_prio_tree_insert(next, root); + } + vma_prio_tree_init(vma); + vma_prio_tree_insert(vma, root); + flush_dcache_mmap_unlock(mapping); + } + + if (remove_next) { + /* + * vma_merge has merged next into vma, and needs + * us to remove next before dropping the locks. + */ + __vma_unlink(mm, next, vma); + if (file) + __remove_shared_vm_struct(next, file, mapping); + if (next->anon_vma) + __anon_vma_merge(vma, next); + } else if (insert) { + /* + * split_vma has split insert from vma, and needs + * us to insert it before dropping the locks + * (it may either follow vma or precede it). + */ + __insert_vm_struct(mm, insert); + } + + if (anon_vma) + spin_unlock(&anon_vma->lock); + if (mapping) + spin_unlock(&mapping->i_mmap_lock); + + if (remove_next) { + if (file) + fput(file); + mm->map_count--; + mpol_free(vma_policy(next)); + kmem_cache_free(vm_area_cachep, next); + /* + * In mprotect's case 6 (see comments on vma_merge), + * we must remove another next too. It would clutter + * up the code too much to do both in one go. + */ + if (remove_next == 2) { + next = vma->vm_next; + goto again; + } + } + + validate_mm(mm); +} + +/* + * If the vma has a ->close operation then the driver probably needs to release + * per-vma resources, so we don't attempt to merge those. + */ +#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED) + +static inline int is_mergeable_vma(struct vm_area_struct *vma, + struct file *file, unsigned long vm_flags) +{ + if (vma->vm_flags != vm_flags) + return 0; + if (vma->vm_file != file) + return 0; + if (vma->vm_ops && vma->vm_ops->close) + return 0; + return 1; +} + +static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, + struct anon_vma *anon_vma2) +{ + return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2); +} + +/* + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) + * in front of (at a lower virtual address and file offset than) the vma. + * + * We cannot merge two vmas if they have differently assigned (non-NULL) + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. + * + * We don't check here for the merged mmap wrapping around the end of pagecache + * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which + * wrap, nor mmaps which cover the final page at index -1UL. + */ +static int +can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, + struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) +{ + if (is_mergeable_vma(vma, file, vm_flags) && + is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { + if (vma->vm_pgoff == vm_pgoff) + return 1; + } + return 0; +} + +/* + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) + * beyond (at a higher virtual address and file offset than) the vma. + * + * We cannot merge two vmas if they have differently assigned (non-NULL) + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. + */ +static int +can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, + struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) +{ + if (is_mergeable_vma(vma, file, vm_flags) && + is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { + pgoff_t vm_pglen; + vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + if (vma->vm_pgoff + vm_pglen == vm_pgoff) + return 1; + } + return 0; +} + +/* + * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out + * whether that can be merged with its predecessor or its successor. + * Or both (it neatly fills a hole). + * + * In most cases - when called for mmap, brk or mremap - [addr,end) is + * certain not to be mapped by the time vma_merge is called; but when + * called for mprotect, it is certain to be already mapped (either at + * an offset within prev, or at the start of next), and the flags of + * this area are about to be changed to vm_flags - and the no-change + * case has already been eliminated. + * + * The following mprotect cases have to be considered, where AAAA is + * the area passed down from mprotect_fixup, never extending beyond one + * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after: + * + * AAAA AAAA AAAA AAAA + * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX + * cannot merge might become might become might become + * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or + * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or + * mremap move: PPPPNNNNNNNN 8 + * AAAA + * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN + * might become case 1 below case 2 below case 3 below + * + * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX: + * mprotect_fixup updates vm_flags & vm_page_prot on successful return. + */ +struct vm_area_struct *vma_merge(struct mm_struct *mm, + struct vm_area_struct *prev, unsigned long addr, + unsigned long end, unsigned long vm_flags, + struct anon_vma *anon_vma, struct file *file, + pgoff_t pgoff, struct mempolicy *policy) +{ + pgoff_t pglen = (end - addr) >> PAGE_SHIFT; + struct vm_area_struct *area, *next; + + /* + * We later require that vma->vm_flags == vm_flags, + * so this tests vma->vm_flags & VM_SPECIAL, too. + */ + if (vm_flags & VM_SPECIAL) + return NULL; + + if (prev) + next = prev->vm_next; + else + next = mm->mmap; + area = next; + if (next && next->vm_end == end) /* cases 6, 7, 8 */ + next = next->vm_next; + + /* + * Can it merge with the predecessor? + */ + if (prev && prev->vm_end == addr && + mpol_equal(vma_policy(prev), policy) && + can_vma_merge_after(prev, vm_flags, + anon_vma, file, pgoff)) { + /* + * OK, it can. Can we now merge in the successor as well? + */ + if (next && end == next->vm_start && + mpol_equal(policy, vma_policy(next)) && + can_vma_merge_before(next, vm_flags, + anon_vma, file, pgoff+pglen) && + is_mergeable_anon_vma(prev->anon_vma, + next->anon_vma)) { + /* cases 1, 6 */ + vma_adjust(prev, prev->vm_start, + next->vm_end, prev->vm_pgoff, NULL); + } else /* cases 2, 5, 7 */ + vma_adjust(prev, prev->vm_start, + end, prev->vm_pgoff, NULL); + return prev; + } + + /* + * Can this new request be merged in front of next? + */ + if (next && end == next->vm_start && + mpol_equal(policy, vma_policy(next)) && + can_vma_merge_before(next, vm_flags, + anon_vma, file, pgoff+pglen)) { + if (prev && addr < prev->vm_end) /* case 4 */ + vma_adjust(prev, prev->vm_start, + addr, prev->vm_pgoff, NULL); + else /* cases 3, 8 */ + vma_adjust(area, addr, next->vm_end, + next->vm_pgoff - pglen, NULL); + return area; + } + + return NULL; +} + +/* + * find_mergeable_anon_vma is used by anon_vma_prepare, to check + * neighbouring vmas for a suitable anon_vma, before it goes off + * to allocate a new anon_vma. It checks because a repetitive + * sequence of mprotects and faults may otherwise lead to distinct + * anon_vmas being allocated, preventing vma merge in subsequent + * mprotect. + */ +struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) +{ + struct vm_area_struct *near; + unsigned long vm_flags; + + near = vma->vm_next; + if (!near) + goto try_prev; + + /* + * Since only mprotect tries to remerge vmas, match flags + * which might be mprotected into each other later on. + * Neither mlock nor madvise tries to remerge at present, + * so leave their flags as obstructing a merge. + */ + vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); + vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); + + if (near->anon_vma && vma->vm_end == near->vm_start && + mpol_equal(vma_policy(vma), vma_policy(near)) && + can_vma_merge_before(near, vm_flags, + NULL, vma->vm_file, vma->vm_pgoff + + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT))) + return near->anon_vma; +try_prev: + /* + * It is potentially slow to have to call find_vma_prev here. + * But it's only on the first write fault on the vma, not + * every time, and we could devise a way to avoid it later + * (e.g. stash info in next's anon_vma_node when assigning + * an anon_vma, or when trying vma_merge). Another time. + */ + if (find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma) + BUG(); + if (!near) + goto none; + + vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); + vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); + + if (near->anon_vma && near->vm_end == vma->vm_start && + mpol_equal(vma_policy(near), vma_policy(vma)) && + can_vma_merge_after(near, vm_flags, + NULL, vma->vm_file, vma->vm_pgoff)) + return near->anon_vma; +none: + /* + * There's no absolute need to look only at touching neighbours: + * we could search further afield for "compatible" anon_vmas. + * But it would probably just be a waste of time searching, + * or lead to too many vmas hanging off the same anon_vma. + * We're trying to allow mprotect remerging later on, + * not trying to minimize memory used for anon_vmas. + */ + return NULL; +} + +/* + * The caller must hold down_write(current->mm->mmap_sem). + */ + +unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flags, unsigned long pgoff) +{ + struct mm_struct * mm = current->mm; + struct vm_area_struct * vma, * prev; + struct inode *inode; + unsigned int vm_flags; + int correct_wcount = 0; + int error; + struct rb_node ** rb_link, * rb_parent; + int accountable = 1; + unsigned long charged = 0; + + if (file) { + if (is_file_hugepages(file)) + accountable = 0; + + if (!file->f_op || !file->f_op->mmap) + return -ENODEV; + + if ((prot & PROT_EXEC) && + (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)) + return -EPERM; + } + + if (!len) + return addr; + + /* Careful about overflows.. */ + len = PAGE_ALIGN(len); + if (!len || len > TASK_SIZE) + return -EINVAL; + + /* offset overflow? */ + if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) + return -EINVAL; + + /* Too many mappings? */ + if (mm->map_count > sysctl_max_map_count) + return -ENOMEM; + + /* Obtain the address to map to. we verify (or select) it and ensure + * that it represents a valid section of the address space. + */ + addr = get_unmapped_area(file, addr, len, pgoff, flags); + if (addr & ~PAGE_MASK) + return addr; + + /* Do simple checking here so the lower-level routines won't have + * to. we assume access permissions have been handled by the open + * of the memory object, so we don't do any here. + */ + vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | + mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + + if (flags & MAP_LOCKED) { + if (!capable(CAP_IPC_LOCK)) + return -EPERM; + vm_flags |= VM_LOCKED; + } + /* mlock MCL_FUTURE? */ + if (vm_flags & VM_LOCKED) { + unsigned long locked = mm->locked_vm << PAGE_SHIFT; + locked += len; + if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) + return -EAGAIN; + } + + inode = file ? file->f_dentry->d_inode : NULL; + + if (file) { + switch (flags & MAP_TYPE) { + case MAP_SHARED: + if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) + return -EACCES; + + /* + * Make sure we don't allow writing to an append-only + * file.. + */ + if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) + return -EACCES; + + /* + * Make sure there are no mandatory locks on the file. + */ + if (locks_verify_locked(inode)) + return -EAGAIN; + + vm_flags |= VM_SHARED | VM_MAYSHARE; + if (!(file->f_mode & FMODE_WRITE)) + vm_flags &= ~(VM_MAYWRITE | VM_SHARED); + + /* fall through */ + case MAP_PRIVATE: + if (!(file->f_mode & FMODE_READ)) + return -EACCES; + break; + + default: + return -EINVAL; + } + } else { + switch (flags & MAP_TYPE) { + case MAP_SHARED: + vm_flags |= VM_SHARED | VM_MAYSHARE; + break; + case MAP_PRIVATE: + /* + * Set pgoff according to addr for anon_vma. + */ + pgoff = addr >> PAGE_SHIFT; + break; + default: + return -EINVAL; + } + } + + error = security_file_mmap(file, prot, flags); + if (error) + return error; + + /* Clear old maps */ + error = -ENOMEM; +munmap_back: + vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + if (vma && vma->vm_start < addr + len) { + if (do_munmap(mm, addr, len)) + return -ENOMEM; + goto munmap_back; + } + + /* Check against address space limit. */ + if ((mm->total_vm << PAGE_SHIFT) + len + > current->rlim[RLIMIT_AS].rlim_cur) + return -ENOMEM; + + if (accountable && (!(flags & MAP_NORESERVE) || + sysctl_overcommit_memory > 1)) { + if (vm_flags & VM_SHARED) { + /* Check memory availability in shmem_file_setup? */ + vm_flags |= VM_ACCOUNT; + } else if (vm_flags & VM_WRITE) { + /* + * Private writable mapping: check memory availability + */ + charged = len >> PAGE_SHIFT; + if (security_vm_enough_memory(charged)) + return -ENOMEM; + vm_flags |= VM_ACCOUNT; + } + } + + /* + * Can we just expand an old private anonymous mapping? + * The VM_SHARED test is necessary because shmem_zero_setup + * will create the file object for a shared anonymous map below. + */ + if (!file && !(vm_flags & VM_SHARED) && + vma_merge(mm, prev, addr, addr + len, vm_flags, + NULL, NULL, pgoff, NULL)) + goto out; + + /* + * Determine the object being mapped and call the appropriate + * specific mapper. the address has already been validated, but + * not unmapped, but the maps are removed from the list. + */ + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!vma) { + error = -ENOMEM; + goto unacct_error; + } + memset(vma, 0, sizeof(*vma)); + + vma->vm_mm = mm; + vma->vm_start = addr; + vma->vm_end = addr + len; + vma->vm_flags = vm_flags; + vma->vm_page_prot = protection_map[vm_flags & 0x0f]; + vma->vm_pgoff = pgoff; + + if (file) { + error = -EINVAL; + if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) + goto free_vma; + if (vm_flags & VM_DENYWRITE) { + error = deny_write_access(file); + if (error) + goto free_vma; + correct_wcount = 1; + } + vma->vm_file = file; + get_file(file); + error = file->f_op->mmap(file, vma); + if (error) + goto unmap_and_free_vma; + } else if (vm_flags & VM_SHARED) { + error = shmem_zero_setup(vma); + if (error) + goto free_vma; + } + + /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform + * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) + * that memory reservation must be checked; but that reservation + * belongs to shared memory object, not to vma: so now clear it. + */ + if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT)) + vma->vm_flags &= ~VM_ACCOUNT; + + /* Can addr have changed?? + * + * Answer: Yes, several device drivers can do it in their + * f_op->mmap method. -DaveM + */ + addr = vma->vm_start; + + if (!file || !vma_merge(mm, prev, addr, vma->vm_end, + vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { + vma_link(mm, vma, prev, rb_link, rb_parent); + if (correct_wcount) + atomic_inc(&inode->i_writecount); + } else { + if (file) { + if (correct_wcount) + atomic_inc(&inode->i_writecount); + fput(file); + } + mpol_free(vma_policy(vma)); + kmem_cache_free(vm_area_cachep, vma); + } +out: + mm->total_vm += len >> PAGE_SHIFT; + if (vm_flags & VM_LOCKED) { + mm->locked_vm += len >> PAGE_SHIFT; + make_pages_present(addr, addr + len); + } + if (flags & MAP_POPULATE) { + up_write(&mm->mmap_sem); + sys_remap_file_pages(addr, len, 0, + pgoff, flags & MAP_NONBLOCK); + down_write(&mm->mmap_sem); + } + return addr; + +unmap_and_free_vma: + if (correct_wcount) + atomic_inc(&inode->i_writecount); + vma->vm_file = NULL; + fput(file); + + /* Undo any partial mapping done by a device driver. */ + zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL); +free_vma: + kmem_cache_free(vm_area_cachep, vma); +unacct_error: + if (charged) + vm_unacct_memory(charged); + return error; +} + +EXPORT_SYMBOL(do_mmap_pgoff); + +/* Get an address range which is currently unmapped. + * For shmat() with addr=0. + * + * Ugly calling convention alert: + * Return value with the low bits set means error value, + * ie + * if (ret & ~PAGE_MASK) + * error = ret; + * + * This function "knows" that -ENOMEM has the bits set. + */ +#ifndef HAVE_ARCH_UNMAPPED_AREA +static inline unsigned long +arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long start_addr; + + if (len > TASK_SIZE) + return -ENOMEM; + + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + start_addr = addr = mm->free_area_cache; + +full_search: + for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { + /* At this point: (!vma || addr < vma->vm_end). */ + if (TASK_SIZE - len < addr) { + /* + * Start a new search - just in case we missed + * some holes. + */ + if (start_addr != TASK_UNMAPPED_BASE) { + start_addr = addr = TASK_UNMAPPED_BASE; + goto full_search; + } + return -ENOMEM; + } + if (!vma || addr + len <= vma->vm_start) { + /* + * Remember the place where we stopped the search: + */ + mm->free_area_cache = addr + len; + return addr; + } + addr = vma->vm_end; + } +} +#else +extern unsigned long +arch_get_unmapped_area(struct file *, unsigned long, unsigned long, + unsigned long, unsigned long); +#endif + +unsigned long +get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + if (flags & MAP_FIXED) { + unsigned long ret; + + if (addr > TASK_SIZE - len) + return -ENOMEM; + if (addr & ~PAGE_MASK) + return -EINVAL; + if (file && is_file_hugepages(file)) { + /* + * Check if the given range is hugepage aligned, and + * can be made suitable for hugepages. + */ + ret = prepare_hugepage_range(addr, len); + } else { + /* + * Ensure that a normal request is not falling in a + * reserved hugepage range. For some archs like IA-64, + * there is a separate region for hugepages. + */ + ret = is_hugepage_only_range(addr, len); + } + if (ret) + return -EINVAL; + return addr; + } + + if (file && file->f_op && file->f_op->get_unmapped_area) + return file->f_op->get_unmapped_area(file, addr, len, + pgoff, flags); + + return arch_get_unmapped_area(file, addr, len, pgoff, flags); +} + +EXPORT_SYMBOL(get_unmapped_area); + +/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ +struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) +{ + struct vm_area_struct *vma = NULL; + + if (mm) { + /* Check the cache first. */ + /* (Cache hit rate is typically around 35%.) */ + vma = mm->mmap_cache; + if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { + struct rb_node * rb_node; + + rb_node = mm->mm_rb.rb_node; + vma = NULL; + + while (rb_node) { + struct vm_area_struct * vma_tmp; + + vma_tmp = rb_entry(rb_node, + struct vm_area_struct, vm_rb); + + if (vma_tmp->vm_end > addr) { + vma = vma_tmp; + if (vma_tmp->vm_start <= addr) + break; + rb_node = rb_node->rb_left; + } else + rb_node = rb_node->rb_right; + } + if (vma) + mm->mmap_cache = vma; + } + } + return vma; +} + +EXPORT_SYMBOL(find_vma); + +/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */ +struct vm_area_struct * +find_vma_prev(struct mm_struct *mm, unsigned long addr, + struct vm_area_struct **pprev) +{ + struct vm_area_struct *vma = NULL, *prev = NULL; + struct rb_node * rb_node; + if (!mm) + goto out; + + /* Guard against addr being lower than the first VMA */ + vma = mm->mmap; + + /* Go through the RB tree quickly. */ + rb_node = mm->mm_rb.rb_node; + + while (rb_node) { + struct vm_area_struct *vma_tmp; + vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); + + if (addr < vma_tmp->vm_end) { + rb_node = rb_node->rb_left; + } else { + prev = vma_tmp; + if (!prev->vm_next || (addr < prev->vm_next->vm_end)) + break; + rb_node = rb_node->rb_right; + } + } + +out: + *pprev = prev; + return prev ? prev->vm_next : vma; +} + +#ifdef CONFIG_STACK_GROWSUP +/* + * vma is the first one with address > vma->vm_end. Have to extend vma. + */ +int expand_stack(struct vm_area_struct * vma, unsigned long address) +{ + unsigned long grow; + + if (!(vma->vm_flags & VM_GROWSUP)) + return -EFAULT; + + /* + * We must make sure the anon_vma is allocated + * so that the anon_vma locking is not a noop. + */ + if (unlikely(anon_vma_prepare(vma))) + return -ENOMEM; + anon_vma_lock(vma); + + /* + * vma->vm_start/vm_end cannot change under us because the caller + * is required to hold the mmap_sem in read mode. We need the + * anon_vma lock to serialize against concurrent expand_stacks. + */ + address += 4 + PAGE_SIZE - 1; + address &= PAGE_MASK; + grow = (address - vma->vm_end) >> PAGE_SHIFT; + + /* Overcommit.. */ + if (security_vm_enough_memory(grow)) { + anon_vma_unlock(vma); + return -ENOMEM; + } + + if (address - vma->vm_start > current->rlim[RLIMIT_STACK].rlim_cur || + ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > + current->rlim[RLIMIT_AS].rlim_cur) { + anon_vma_unlock(vma); + vm_unacct_memory(grow); + return -ENOMEM; + } + vma->vm_end = address; + vma->vm_mm->total_vm += grow; + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm += grow; + anon_vma_unlock(vma); + return 0; +} + +struct vm_area_struct * +find_extend_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma, *prev; + + addr &= PAGE_MASK; + vma = find_vma_prev(mm, addr, &prev); + if (vma && (vma->vm_start <= addr)) + return vma; + if (!prev || expand_stack(prev, addr)) + return NULL; + if (prev->vm_flags & VM_LOCKED) { + make_pages_present(addr, prev->vm_end); + } + return prev; +} +#else +/* + * vma is the first one with address < vma->vm_start. Have to extend vma. + */ +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + unsigned long grow; + + /* + * We must make sure the anon_vma is allocated + * so that the anon_vma locking is not a noop. + */ + if (unlikely(anon_vma_prepare(vma))) + return -ENOMEM; + anon_vma_lock(vma); + + /* + * vma->vm_start/vm_end cannot change under us because the caller + * is required to hold the mmap_sem in read mode. We need the + * anon_vma lock to serialize against concurrent expand_stacks. + */ + address &= PAGE_MASK; + grow = (vma->vm_start - address) >> PAGE_SHIFT; + + /* Overcommit.. */ + if (security_vm_enough_memory(grow)) { + anon_vma_unlock(vma); + return -ENOMEM; + } + + if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur || + ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > + current->rlim[RLIMIT_AS].rlim_cur) { + anon_vma_unlock(vma); + vm_unacct_memory(grow); + return -ENOMEM; + } + vma->vm_start = address; + vma->vm_pgoff -= grow; + vma->vm_mm->total_vm += grow; + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm += grow; + anon_vma_unlock(vma); + return 0; +} + +struct vm_area_struct * +find_extend_vma(struct mm_struct * mm, unsigned long addr) +{ + struct vm_area_struct * vma; + unsigned long start; + + addr &= PAGE_MASK; + vma = find_vma(mm,addr); + if (!vma) + return NULL; + if (vma->vm_start <= addr) + return vma; + if (!(vma->vm_flags & VM_GROWSDOWN)) + return NULL; + start = vma->vm_start; + if (expand_stack(vma, addr)) + return NULL; + if (vma->vm_flags & VM_LOCKED) { + make_pages_present(addr, start); + } + return vma; +} +#endif + +/* + * Try to free as many page directory entries as we can, + * without having to work very hard at actually scanning + * the page tables themselves. + * + * Right now we try to free page tables if we have a nice + * PGDIR-aligned area that got free'd up. We could be more + * granular if we want to, but this is fast and simple, + * and covers the bad cases. + * + * "prev", if it exists, points to a vma before the one + * we just free'd - but there's no telling how much before. + */ +static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev, + unsigned long start, unsigned long end) +{ + unsigned long first = start & PGDIR_MASK; + unsigned long last = end + PGDIR_SIZE - 1; + unsigned long start_index, end_index; + struct mm_struct *mm = tlb->mm; + + if (!prev) { + prev = mm->mmap; + if (!prev) + goto no_mmaps; + if (prev->vm_end > start) { + if (last > prev->vm_start) + last = prev->vm_start; + goto no_mmaps; + } + } + for (;;) { + struct vm_area_struct *next = prev->vm_next; + + if (next) { + if (next->vm_start < start) { + prev = next; + continue; + } + if (last > next->vm_start) + last = next->vm_start; + } + if (prev->vm_end > first) + first = prev->vm_end + PGDIR_SIZE - 1; + break; + } +no_mmaps: + if (last < first) /* for arches with discontiguous pgd indices */ + return; + /* + * If the PGD bits are not consecutive in the virtual address, the + * old method of shifting the VA >> by PGDIR_SHIFT doesn't work. + */ + start_index = pgd_index(first); + if (start_index < FIRST_USER_PGD_NR) + start_index = FIRST_USER_PGD_NR; + end_index = pgd_index(last); + if (end_index > start_index) { + clear_page_tables(tlb, start_index, end_index - start_index); + flush_tlb_pgtables(mm, first & PGDIR_MASK, last & PGDIR_MASK); + } +} + +/* Normal function to fix up a mapping + * This function is the default for when an area has no specific + * function. This may be used as part of a more specific routine. + * + * By the time this function is called, the area struct has been + * removed from the process mapping list. + */ +static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area) +{ + size_t len = area->vm_end - area->vm_start; + + area->vm_mm->total_vm -= len >> PAGE_SHIFT; + if (area->vm_flags & VM_LOCKED) + area->vm_mm->locked_vm -= len >> PAGE_SHIFT; + /* + * Is this a new hole at the lowest possible address? + */ + if (area->vm_start >= TASK_UNMAPPED_BASE && + area->vm_start < area->vm_mm->free_area_cache) + area->vm_mm->free_area_cache = area->vm_start; + + remove_vm_struct(area); +} + +/* + * Update the VMA and inode share lists. + * + * Ok - we have the memory areas we should free on the 'free' list, + * so release them, and do the vma updates. + */ +static void unmap_vma_list(struct mm_struct *mm, + struct vm_area_struct *mpnt) +{ + do { + struct vm_area_struct *next = mpnt->vm_next; + unmap_vma(mm, mpnt); + mpnt = next; + } while (mpnt != NULL); + validate_mm(mm); +} + +/* + * Get rid of page table information in the indicated region. + * + * Called with the page table lock held. + */ +static void unmap_region(struct mm_struct *mm, + struct vm_area_struct *vma, + struct vm_area_struct *prev, + unsigned long start, + unsigned long end) +{ + struct mmu_gather *tlb; + unsigned long nr_accounted = 0; + + lru_add_drain(); + tlb = tlb_gather_mmu(mm, 0); + unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); + vm_unacct_memory(nr_accounted); + + if (is_hugepage_only_range(start, end - start)) + hugetlb_free_pgtables(tlb, prev, start, end); + else + free_pgtables(tlb, prev, start, end); + tlb_finish_mmu(tlb, start, end); +} + +/* + * Create a list of vma's touched by the unmap, removing them from the mm's + * vma list as we go.. + */ +static void +detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev, unsigned long end) +{ + struct vm_area_struct **insertion_point; + struct vm_area_struct *tail_vma = NULL; + + insertion_point = (prev ? &prev->vm_next : &mm->mmap); + do { + rb_erase(&vma->vm_rb, &mm->mm_rb); + mm->map_count--; + tail_vma = vma; + vma = vma->vm_next; + } while (vma && vma->vm_start < end); + *insertion_point = vma; + tail_vma->vm_next = NULL; + mm->mmap_cache = NULL; /* Kill the cache. */ +} + +/* + * Split a vma into two pieces at address 'addr', a new vma is allocated + * either for the first part or the the tail. + */ +int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, + unsigned long addr, int new_below) +{ + struct mempolicy *pol; + struct vm_area_struct *new; + + if (mm->map_count >= sysctl_max_map_count) + return -ENOMEM; + + new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!new) + return -ENOMEM; + + /* most fields are the same, copy all, and then fixup */ + *new = *vma; + vma_prio_tree_init(new); + + if (new_below) + new->vm_end = addr; + else { + new->vm_start = addr; + new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); + } + + pol = mpol_copy(vma_policy(vma)); + if (IS_ERR(pol)) { + kmem_cache_free(vm_area_cachep, new); + return PTR_ERR(pol); + } + vma_set_policy(new, pol); + + if (new->vm_file) + get_file(new->vm_file); + + if (new->vm_ops && new->vm_ops->open) + new->vm_ops->open(new); + + if (new_below) + vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + + ((addr - new->vm_start) >> PAGE_SHIFT), new); + else + vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + + return 0; +} + +/* Munmap is split into 2 main parts -- this part which finds + * what needs doing, and the areas themselves, which do the + * work. This now handles partial unmappings. + * Jeremy Fitzhardinge + */ +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) +{ + unsigned long end; + struct vm_area_struct *mpnt, *prev, *last; + + if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) + return -EINVAL; + + if ((len = PAGE_ALIGN(len)) == 0) + return -EINVAL; + + /* Find the first overlapping VMA */ + mpnt = find_vma_prev(mm, start, &prev); + if (!mpnt) + return 0; + /* we have start < mpnt->vm_end */ + + if (is_vm_hugetlb_page(mpnt)) { + int ret = is_aligned_hugepage_range(start, len); + + if (ret) + return ret; + } + + /* if it doesn't overlap, we have nothing.. */ + end = start + len; + if (mpnt->vm_start >= end) + return 0; + + /* Something will probably happen, so notify. */ + if (mpnt->vm_file && (mpnt->vm_flags & VM_EXEC)) + profile_exec_unmap(mm); + + /* + * If we need to split any vma, do it now to save pain later. + * + * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially + * unmapped vm_area_struct will remain in use: so lower split_vma + * places tmp vma above, and higher split_vma places tmp vma below. + */ + if (start > mpnt->vm_start) { + if (split_vma(mm, mpnt, start, 0)) + return -ENOMEM; + prev = mpnt; + } + + /* Does it split the last one? */ + last = find_vma(mm, end); + if (last && end > last->vm_start) { + if (split_vma(mm, last, end, 1)) + return -ENOMEM; + } + mpnt = prev? prev->vm_next: mm->mmap; + + /* + * Remove the vma's, and unmap the actual pages + */ + detach_vmas_to_be_unmapped(mm, mpnt, prev, end); + spin_lock(&mm->page_table_lock); + unmap_region(mm, mpnt, prev, start, end); + spin_unlock(&mm->page_table_lock); + + /* Fix up all other VM information */ + unmap_vma_list(mm, mpnt); + + return 0; +} + +EXPORT_SYMBOL(do_munmap); + +asmlinkage long sys_munmap(unsigned long addr, size_t len) +{ + int ret; + struct mm_struct *mm = current->mm; + + down_write(&mm->mmap_sem); + ret = do_munmap(mm, addr, len); + up_write(&mm->mmap_sem); + return ret; +} + +/* + * this is really a simplified "do_mmap". it only handles + * anonymous maps. eventually we may be able to do some + * brk-specific accounting here. + */ +unsigned long do_brk(unsigned long addr, unsigned long len) +{ + struct mm_struct * mm = current->mm; + struct vm_area_struct * vma, * prev; + unsigned long flags; + struct rb_node ** rb_link, * rb_parent; + pgoff_t pgoff = addr >> PAGE_SHIFT; + + len = PAGE_ALIGN(len); + if (!len) + return addr; + + if ((addr + len) > TASK_SIZE || (addr + len) < addr) + return -EINVAL; + + /* + * mlock MCL_FUTURE? + */ + if (mm->def_flags & VM_LOCKED) { + unsigned long locked = mm->locked_vm << PAGE_SHIFT; + locked += len; + if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) + return -EAGAIN; + } + + /* + * Clear old maps. this also does some error checking for us + */ + munmap_back: + vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + if (vma && vma->vm_start < addr + len) { + if (do_munmap(mm, addr, len)) + return -ENOMEM; + goto munmap_back; + } + + /* Check against address space limits *after* clearing old maps... */ + if ((mm->total_vm << PAGE_SHIFT) + len + > current->rlim[RLIMIT_AS].rlim_cur) + return -ENOMEM; + + if (mm->map_count > sysctl_max_map_count) + return -ENOMEM; + + if (security_vm_enough_memory(len >> PAGE_SHIFT)) + return -ENOMEM; + + flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + + /* Can we just expand an old private anonymous mapping? */ + if (vma_merge(mm, prev, addr, addr + len, flags, + NULL, NULL, pgoff, NULL)) + goto out; + + /* + * create a vma struct for an anonymous mapping + */ + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!vma) { + vm_unacct_memory(len >> PAGE_SHIFT); + return -ENOMEM; + } + memset(vma, 0, sizeof(*vma)); + + vma->vm_mm = mm; + vma->vm_start = addr; + vma->vm_end = addr + len; + vma->vm_pgoff = pgoff; + vma->vm_flags = flags; + vma->vm_page_prot = protection_map[flags & 0x0f]; + vma_link(mm, vma, prev, rb_link, rb_parent); +out: + mm->total_vm += len >> PAGE_SHIFT; + if (flags & VM_LOCKED) { + mm->locked_vm += len >> PAGE_SHIFT; + make_pages_present(addr, addr + len); + } + return addr; +} + +EXPORT_SYMBOL(do_brk); + +/* Release all mmaps. */ +void exit_mmap(struct mm_struct *mm) +{ + struct mmu_gather *tlb; + struct vm_area_struct *vma; + unsigned long nr_accounted = 0; + + profile_exit_mmap(mm); + + lru_add_drain(); + + spin_lock(&mm->page_table_lock); + + tlb = tlb_gather_mmu(mm, 1); + flush_cache_mm(mm); + /* Use ~0UL here to ensure all VMAs in the mm are unmapped */ + mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0, + ~0UL, &nr_accounted, NULL); + vm_unacct_memory(nr_accounted); + BUG_ON(mm->map_count); /* This is just debugging */ + clear_page_tables(tlb, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD); + tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm)); + + vma = mm->mmap; + mm->mmap = mm->mmap_cache = NULL; + mm->mm_rb = RB_ROOT; + mm->rss = 0; + mm->total_vm = 0; + mm->locked_vm = 0; + + spin_unlock(&mm->page_table_lock); + + /* + * Walk the list again, actually closing and freeing it + * without holding any MM locks. + */ + while (vma) { + struct vm_area_struct *next = vma->vm_next; + remove_vm_struct(vma); + vma = next; + } +} + +/* Insert vm structure into process list sorted by address + * and into the inode's i_mmap tree. If vm_file is non-NULL + * then i_mmap_lock is taken here. + */ +void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +{ + struct vm_area_struct * __vma, * prev; + struct rb_node ** rb_link, * rb_parent; + + /* + * The vm_pgoff of a purely anonymous vma should be irrelevant + * until its first write fault, when page's anon_vma and index + * are set. But now set the vm_pgoff it will almost certainly + * end up with (unless mremap moves it elsewhere before that + * first wfault), so /proc/pid/maps tells a consistent story. + * + * By setting it to reflect the virtual start address of the + * vma, merges and splits can happen in a seamless way, just + * using the existing file pgoff checks and manipulations. + * Similarly in do_mmap_pgoff and in do_brk. + */ + if (!vma->vm_file) { + BUG_ON(vma->anon_vma); + vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; + } + __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); + if (__vma && __vma->vm_start < vma->vm_end) + BUG(); + vma_link(mm, vma, prev, rb_link, rb_parent); +} + +/* + * Copy the vma structure to a new location in the same mm, + * prior to moving page table entries, to effect an mremap move. + */ +struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, + unsigned long addr, unsigned long len, pgoff_t pgoff) +{ + struct vm_area_struct *vma = *vmap; + unsigned long vma_start = vma->vm_start; + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *new_vma, *prev; + struct rb_node **rb_link, *rb_parent; + struct mempolicy *pol; + + /* + * If anonymous vma has not yet been faulted, update new pgoff + * to match new location, to increase its chance of merging. + */ + if (!vma->vm_file && !vma->anon_vma) + pgoff = addr >> PAGE_SHIFT; + + find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); + if (new_vma) { + /* + * Source vma may have been merged into new_vma + */ + if (vma_start >= new_vma->vm_start && + vma_start < new_vma->vm_end) + *vmap = new_vma; + } else { + new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (new_vma) { + *new_vma = *vma; + vma_prio_tree_init(new_vma); + pol = mpol_copy(vma_policy(vma)); + if (IS_ERR(pol)) { + kmem_cache_free(vm_area_cachep, new_vma); + return NULL; + } + vma_set_policy(new_vma, pol); + new_vma->vm_start = addr; + new_vma->vm_end = addr + len; + new_vma->vm_pgoff = pgoff; + if (new_vma->vm_file) + get_file(new_vma->vm_file); + if (new_vma->vm_ops && new_vma->vm_ops->open) + new_vma->vm_ops->open(new_vma); + vma_link(mm, new_vma, prev, rb_link, rb_parent); + } + } + return new_vma; +} diff --git a/mm/mprotect.c.orig b/mm/mprotect.c.orig new file mode 100644 index 000000000..5b438e1a0 --- /dev/null +++ b/mm/mprotect.c.orig @@ -0,0 +1,282 @@ +/* + * mm/mprotect.c + * + * (C) Copyright 1994 Linus Torvalds + * (C) Copyright 2002 Christoph Hellwig + * + * Address space accounting code + * (C) Copyright 2002 Red Hat Inc, All Rights Reserved + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static inline void +change_pte_range(pmd_t *pmd, unsigned long address, + unsigned long size, pgprot_t newprot) +{ + pte_t * pte; + unsigned long end; + + if (pmd_none(*pmd)) + return; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + return; + } + pte = pte_offset_map(pmd, address); + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + if (pte_present(*pte)) { + pte_t entry; + + /* Avoid an SMP race with hardware updated dirty/clean + * bits by wiping the pte and then setting the new pte + * into place. + */ + entry = ptep_get_and_clear(pte); + set_pte(pte, pte_modify(entry, newprot)); + } + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); + pte_unmap(pte - 1); +} + +static inline void +change_pmd_range(pgd_t *pgd, unsigned long address, + unsigned long size, pgprot_t newprot) +{ + pmd_t * pmd; + unsigned long end; + + if (pgd_none(*pgd)) + return; + if (pgd_bad(*pgd)) { + pgd_ERROR(*pgd); + pgd_clear(pgd); + return; + } + pmd = pmd_offset(pgd, address); + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + do { + change_pte_range(pmd, address, end - address, newprot); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); +} + +static void +change_protection(struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgprot_t newprot) +{ + pgd_t *dir; + unsigned long beg = start; + + dir = pgd_offset(current->mm, start); + flush_cache_range(vma, beg, end); + if (start >= end) + BUG(); + spin_lock(¤t->mm->page_table_lock); + do { + change_pmd_range(dir, start, end - start, newprot); + start = (start + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (start && (start < end)); + flush_tlb_range(vma, beg, end); + spin_unlock(¤t->mm->page_table_lock); + return; +} + +static int +mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, + unsigned long start, unsigned long end, unsigned int newflags) +{ + struct mm_struct * mm = vma->vm_mm; + unsigned long charged = 0; + pgprot_t newprot; + pgoff_t pgoff; + int error; + + if (newflags == vma->vm_flags) { + *pprev = vma; + return 0; + } + + /* + * If we make a private mapping writable we increase our commit; + * but (without finer accounting) cannot reduce our commit if we + * make it unwritable again. + * + * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting + * a MAP_NORESERVE private mapping to writable will now reserve. + */ + if (newflags & VM_WRITE) { + if (!(vma->vm_flags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { + charged = (end - start) >> PAGE_SHIFT; + if (security_vm_enough_memory(charged)) + return -ENOMEM; + newflags |= VM_ACCOUNT; + } + } + + newprot = protection_map[newflags & 0xf]; + + /* + * First try to merge with previous and/or next vma. + */ + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + *pprev = vma_merge(mm, *pprev, start, end, newflags, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); + if (*pprev) { + vma = *pprev; + goto success; + } + + if (start != vma->vm_start) { + error = split_vma(mm, vma, start, 1); + if (error) + goto fail; + } + /* + * Unless it returns an error, this function always sets *pprev to + * the first vma for which vma->vm_end >= end. + */ + *pprev = vma; + + if (end != vma->vm_end) { + error = split_vma(mm, vma, end, 0); + if (error) + goto fail; + } + +success: + /* + * vm_flags and vm_page_prot are protected by the mmap_sem + * held in write mode. + */ + vma->vm_flags = newflags; + vma->vm_page_prot = newprot; + change_protection(vma, start, end, newprot); + return 0; + +fail: + vm_unacct_memory(charged); + return error; +} + +asmlinkage long +sys_mprotect(unsigned long start, size_t len, unsigned long prot) +{ + unsigned long vm_flags, nstart, end, tmp; + struct vm_area_struct *vma, *prev; + int error = -EINVAL; + const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); + prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); + if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */ + return -EINVAL; + + if (start & ~PAGE_MASK) + return -EINVAL; + len = PAGE_ALIGN(len); + end = start + len; + if (end < start) + return -ENOMEM; + if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM)) + return -EINVAL; + if (end == start) + return 0; + + vm_flags = calc_vm_prot_bits(prot); + + down_write(¤t->mm->mmap_sem); + + vma = find_vma_prev(current->mm, start, &prev); + error = -ENOMEM; + if (!vma) + goto out; + if (unlikely(grows & PROT_GROWSDOWN)) { + if (vma->vm_start >= end) + goto out; + start = vma->vm_start; + error = -EINVAL; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out; + } + else { + if (vma->vm_start > start) + goto out; + if (unlikely(grows & PROT_GROWSUP)) { + end = vma->vm_end; + error = -EINVAL; + if (!(vma->vm_flags & VM_GROWSUP)) + goto out; + } + } + if (start > vma->vm_start) + prev = vma; + + for (nstart = start ; ; ) { + unsigned int newflags; + + /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + + if (is_vm_hugetlb_page(vma)) { + error = -EACCES; + goto out; + } + + newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); + + if ((newflags & ~(newflags >> 4)) & 0xf) { + error = -EACCES; + goto out; + } + + error = security_file_mprotect(vma, prot); + if (error) + goto out; + + tmp = vma->vm_end; + if (tmp > end) + tmp = end; + error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); + if (error) + goto out; + nstart = tmp; + + if (nstart < prev->vm_end) + nstart = prev->vm_end; + if (nstart >= end) + goto out; + + vma = prev->vm_next; + if (!vma || vma->vm_start != nstart) { + error = -ENOMEM; + goto out; + } + } +out: + up_write(¤t->mm->mmap_sem); + return error; +} diff --git a/mm/page_alloc.c.orig b/mm/page_alloc.c.orig new file mode 100644 index 000000000..3dbbeb2f3 --- /dev/null +++ b/mm/page_alloc.c.orig @@ -0,0 +1,2013 @@ +/* + * linux/mm/page_alloc.c + * + * Manages the free list, the system allocates free pages here. + * Note that kmalloc() lives in slab.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 + * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 + * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 + * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 + * (lots of bits borrowed from Ingo Molnar & Andrew Morton) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +DECLARE_BITMAP(node_online_map, MAX_NUMNODES); +struct pglist_data *pgdat_list; +unsigned long totalram_pages; +unsigned long totalhigh_pages; +int nr_swap_pages; +int numnodes = 1; +int sysctl_lower_zone_protection = 0; + +EXPORT_SYMBOL(totalram_pages); +EXPORT_SYMBOL(nr_swap_pages); + +/* + * Used by page_zone() to look up the address of the struct zone whose + * id is encoded in the upper bits of page->flags + */ +struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)]; +EXPORT_SYMBOL(zone_table); + +static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; +int min_free_kbytes = 1024; + +/* + * Temporary debugging check for pages not lying within a given zone. + */ +static int bad_range(struct zone *zone, struct page *page) +{ + if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) + return 1; + if (page_to_pfn(page) < zone->zone_start_pfn) + return 1; + if (zone != page_zone(page)) + return 1; + return 0; +} + +static void bad_page(const char *function, struct page *page) +{ + printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", + function, current->comm, page); + printk(KERN_EMERG "flags:0x%08lx mapping:%p mapcount:%d count:%d\n", + (unsigned long)page->flags, page->mapping, + (int)page->mapcount, page_count(page)); + printk(KERN_EMERG "Backtrace:\n"); + dump_stack(); + printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); + page->flags &= ~(1 << PG_private | + 1 << PG_locked | + 1 << PG_lru | + 1 << PG_active | + 1 << PG_dirty | + 1 << PG_maplock | + 1 << PG_anon | + 1 << PG_swapcache | + 1 << PG_writeback); + set_page_count(page, 0); + page->mapping = NULL; + page->mapcount = 0; +} + +#ifndef CONFIG_HUGETLB_PAGE +#define prep_compound_page(page, order) do { } while (0) +#define destroy_compound_page(page, order) do { } while (0) +#else +/* + * Higher-order pages are called "compound pages". They are structured thusly: + * + * The first PAGE_SIZE page is called the "head page". + * + * The remaining PAGE_SIZE pages are called "tail pages". + * + * All pages have PG_compound set. All pages have their ->private pointing at + * the head page (even the head page has this). + * + * The first tail page's ->mapping, if non-zero, holds the address of the + * compound page's put_page() function. + * + * The order of the allocation is stored in the first tail page's ->index + * This is only for debug at present. This usage means that zero-order pages + * may not be compound. + */ +static void prep_compound_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + + page[1].mapping = 0; + page[1].index = order; + for (i = 0; i < nr_pages; i++) { + struct page *p = page + i; + + SetPageCompound(p); + p->private = (unsigned long)page; + } +} + +static void destroy_compound_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + + if (!PageCompound(page)) + return; + + if (page[1].index != order) + bad_page(__FUNCTION__, page); + + for (i = 0; i < nr_pages; i++) { + struct page *p = page + i; + + if (!PageCompound(p)) + bad_page(__FUNCTION__, page); + if (p->private != (unsigned long)page) + bad_page(__FUNCTION__, page); + ClearPageCompound(p); + } +} +#endif /* CONFIG_HUGETLB_PAGE */ + +/* + * Freeing function for a buddy system allocator. + * + * The concept of a buddy system is to maintain direct-mapped table + * (containing bit values) for memory blocks of various "orders". + * The bottom level table contains the map for the smallest allocatable + * units of memory (here, pages), and each level above it describes + * pairs of units from the levels below, hence, "buddies". + * At a high level, all that happens here is marking the table entry + * at the bottom level available, and propagating the changes upward + * as necessary, plus some accounting needed to play nicely with other + * parts of the VM system. + * At each level, we keep one bit for each pair of blocks, which + * is set to 1 iff only one of the pair is allocated. So when we + * are allocating or freeing one, we can derive the state of the + * other. That is, if we allocate a small block, and both were + * free, the remainder of the region must be split into blocks. + * If a block is freed, and its buddy is also free, then this + * triggers coalescing into a block of larger size. + * + * -- wli + */ + +static inline void __free_pages_bulk (struct page *page, struct page *base, + struct zone *zone, struct free_area *area, unsigned long mask, + unsigned int order) +{ + unsigned long page_idx, index; + + if (order) + destroy_compound_page(page, order); + page_idx = page - base; + if (page_idx & ~mask) + BUG(); + index = page_idx >> (1 + order); + + zone->free_pages -= mask; + while (mask + (1 << (MAX_ORDER-1))) { + struct page *buddy1, *buddy2; + + BUG_ON(area >= zone->free_area + MAX_ORDER); + if (!__test_and_change_bit(index, area->map)) + /* + * the buddy page is still allocated. + */ + break; + /* + * Move the buddy up one level. + * This code is taking advantage of the identity: + * -mask = 1+~mask + */ + buddy1 = base + (page_idx ^ -mask); + buddy2 = base + page_idx; + BUG_ON(bad_range(zone, buddy1)); + BUG_ON(bad_range(zone, buddy2)); + list_del(&buddy1->lru); + mask <<= 1; + area++; + index >>= 1; + page_idx &= mask; + } + list_add(&(base + page_idx)->lru, &area->free_list); +} + +static inline void free_pages_check(const char *function, struct page *page) +{ + if ( page_mapped(page) || + page->mapping != NULL || + page_count(page) != 0 || + (page->flags & ( + 1 << PG_lru | + 1 << PG_private | + 1 << PG_locked | + 1 << PG_active | + 1 << PG_reclaim | + 1 << PG_slab | + 1 << PG_maplock | + 1 << PG_anon | + 1 << PG_swapcache | + 1 << PG_writeback ))) + bad_page(function, page); + if (PageDirty(page)) + ClearPageDirty(page); +} + +/* + * Frees a list of pages. + * Assumes all pages on list are in same zone, and of same order. + * count is the number of pages to free, or 0 for all on the list. + * + * If the zone was previously in an "all pages pinned" state then look to + * see if this freeing clears that state. + * + * And clear the zone's pages_scanned counter, to hold off the "all pages are + * pinned" detection logic. + */ +static int +free_pages_bulk(struct zone *zone, int count, + struct list_head *list, unsigned int order) +{ + unsigned long mask, flags; + struct free_area *area; + struct page *base, *page = NULL; + int ret = 0; + + mask = (~0UL) << order; + base = zone->zone_mem_map; + area = zone->free_area + order; + spin_lock_irqsave(&zone->lock, flags); + zone->all_unreclaimable = 0; + zone->pages_scanned = 0; + while (!list_empty(list) && count--) { + page = list_entry(list->prev, struct page, lru); + /* have to delete it as __free_pages_bulk list manipulates */ + list_del(&page->lru); + __free_pages_bulk(page, base, zone, area, mask, order); + ret++; + } + spin_unlock_irqrestore(&zone->lock, flags); + return ret; +} + +void __free_pages_ok(struct page *page, unsigned int order) +{ + LIST_HEAD(list); + int i; + + mod_page_state(pgfree, 1 << order); + for (i = 0 ; i < (1 << order) ; ++i) + free_pages_check(__FUNCTION__, page + i); + list_add(&page->lru, &list); + kernel_map_pages(page, 1<> (1+(order)), (area)->map) + +static inline struct page * +expand(struct zone *zone, struct page *page, + unsigned long index, int low, int high, struct free_area *area) +{ + unsigned long size = 1 << high; + + while (high > low) { + BUG_ON(bad_range(zone, page)); + area--; + high--; + size >>= 1; + list_add(&page->lru, &area->free_list); + MARK_USED(index, high, area); + index += size; + page += size; + } + return page; +} + +static inline void set_page_refs(struct page *page, int order) +{ +#ifdef CONFIG_MMU + set_page_count(page, 1); +#else + int i; + + /* + * We need to reference all the pages for this order, otherwise if + * anyone accesses one of the pages with (get/put) it will be freed. + */ + for (i = 0; i < (1 << order); i++) + set_page_count(page+i, 1); +#endif /* CONFIG_MMU */ +} + +/* + * This page is about to be returned from the page allocator + */ +static void prep_new_page(struct page *page, int order) +{ + if (page->mapping || page_mapped(page) || + (page->flags & ( + 1 << PG_private | + 1 << PG_locked | + 1 << PG_lru | + 1 << PG_active | + 1 << PG_dirty | + 1 << PG_reclaim | + 1 << PG_maplock | + 1 << PG_anon | + 1 << PG_swapcache | + 1 << PG_writeback ))) + bad_page(__FUNCTION__, page); + + page->flags &= ~(1 << PG_uptodate | 1 << PG_error | + 1 << PG_referenced | 1 << PG_arch_1 | + 1 << PG_checked | 1 << PG_mappedtodisk); + page->private = 0; + set_page_refs(page, order); +} + +/* + * Do the hard work of removing an element from the buddy allocator. + * Call me with the zone->lock already held. + */ +static struct page *__rmqueue(struct zone *zone, unsigned int order) +{ + struct free_area * area; + unsigned int current_order; + struct page *page; + unsigned int index; + + for (current_order = order; current_order < MAX_ORDER; ++current_order) { + area = zone->free_area + current_order; + if (list_empty(&area->free_list)) + continue; + + page = list_entry(area->free_list.next, struct page, lru); + list_del(&page->lru); + index = page - zone->zone_mem_map; + if (current_order != MAX_ORDER-1) + MARK_USED(index, current_order, area); + zone->free_pages -= 1UL << order; + return expand(zone, page, index, order, current_order, area); + } + + return NULL; +} + +/* + * Obtain a specified number of elements from the buddy allocator, all under + * a single hold of the lock, for efficiency. Add them to the supplied list. + * Returns the number of new pages which were placed at *list. + */ +static int rmqueue_bulk(struct zone *zone, unsigned int order, + unsigned long count, struct list_head *list) +{ + unsigned long flags; + int i; + int allocated = 0; + struct page *page; + + spin_lock_irqsave(&zone->lock, flags); + for (i = 0; i < count; ++i) { + page = __rmqueue(zone, order); + if (page == NULL) + break; + allocated++; + list_add_tail(&page->lru, list); + } + spin_unlock_irqrestore(&zone->lock, flags); + return allocated; +} + +#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) +static void __drain_pages(unsigned int cpu) +{ + struct zone *zone; + int i; + + for_each_zone(zone) { + struct per_cpu_pageset *pset; + + pset = &zone->pageset[cpu]; + for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { + struct per_cpu_pages *pcp; + + pcp = &pset->pcp[i]; + pcp->count -= free_pages_bulk(zone, pcp->count, + &pcp->list, 0); + } + } +} +#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ + +#ifdef CONFIG_PM +int is_head_of_free_region(struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long flags; + int order; + struct list_head *curr; + + /* + * Should not matter as we need quiescent system for + * suspend anyway, but... + */ + spin_lock_irqsave(&zone->lock, flags); + for (order = MAX_ORDER - 1; order >= 0; --order) + list_for_each(curr, &zone->free_area[order].free_list) + if (page == list_entry(curr, struct page, lru)) { + spin_unlock_irqrestore(&zone->lock, flags); + return 1 << order; + } + spin_unlock_irqrestore(&zone->lock, flags); + return 0; +} + +/* + * Spill all of this CPU's per-cpu pages back into the buddy allocator. + */ +void drain_local_pages(void) +{ + unsigned long flags; + + local_irq_save(flags); + __drain_pages(smp_processor_id()); + local_irq_restore(flags); +} +#endif /* CONFIG_PM */ + +static void zone_statistics(struct zonelist *zonelist, struct zone *z) +{ +#ifdef CONFIG_NUMA + unsigned long flags; + int cpu; + pg_data_t *pg = z->zone_pgdat; + pg_data_t *orig = zonelist->zones[0]->zone_pgdat; + struct per_cpu_pageset *p; + + local_irq_save(flags); + cpu = smp_processor_id(); + p = &z->pageset[cpu]; + if (pg == orig) { + z->pageset[cpu].numa_hit++; + } else { + p->numa_miss++; + zonelist->zones[0]->pageset[cpu].numa_foreign++; + } + if (pg == NODE_DATA(numa_node_id())) + p->local_node++; + else + p->other_node++; + local_irq_restore(flags); +#endif +} + +/* + * Free a 0-order page + */ +static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); +static void fastcall free_hot_cold_page(struct page *page, int cold) +{ + struct zone *zone = page_zone(page); + struct per_cpu_pages *pcp; + unsigned long flags; + + kernel_map_pages(page, 1, 0); + inc_page_state(pgfree); + free_pages_check(__FUNCTION__, page); + pcp = &zone->pageset[get_cpu()].pcp[cold]; + local_irq_save(flags); + if (pcp->count >= pcp->high) + pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + list_add(&page->lru, &pcp->list); + pcp->count++; + local_irq_restore(flags); + put_cpu(); +} + +void fastcall free_hot_page(struct page *page) +{ + free_hot_cold_page(page, 0); +} + +void fastcall free_cold_page(struct page *page) +{ + free_hot_cold_page(page, 1); +} + +/* + * Really, prep_compound_page() should be called from __rmqueue_bulk(). But + * we cheat by calling it from here, in the order > 0 path. Saves a branch + * or two. + */ + +static struct page * +buffered_rmqueue(struct zone *zone, int order, int gfp_flags) +{ + unsigned long flags; + struct page *page = NULL; + int cold = !!(gfp_flags & __GFP_COLD); + + if (order == 0) { + struct per_cpu_pages *pcp; + + pcp = &zone->pageset[get_cpu()].pcp[cold]; + local_irq_save(flags); + if (pcp->count <= pcp->low) + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, &pcp->list); + if (pcp->count) { + page = list_entry(pcp->list.next, struct page, lru); + list_del(&page->lru); + pcp->count--; + } + local_irq_restore(flags); + put_cpu(); + } + + if (page == NULL) { + spin_lock_irqsave(&zone->lock, flags); + page = __rmqueue(zone, order); + spin_unlock_irqrestore(&zone->lock, flags); + } + + if (page != NULL) { + BUG_ON(bad_range(zone, page)); + mod_page_state_zone(zone, pgalloc, 1 << order); + prep_new_page(page, order); + if (order && (gfp_flags & __GFP_COMP)) + prep_compound_page(page, order); + } + return page; +} + +/* + * This is the 'heart' of the zoned buddy allocator. + * + * Herein lies the mysterious "incremental min". That's the + * + * local_low = z->pages_low; + * min += local_low; + * + * thing. The intent here is to provide additional protection to low zones for + * allocation requests which _could_ use higher zones. So a GFP_HIGHMEM + * request is not allowed to dip as deeply into the normal zone as a GFP_KERNEL + * request. This preserves additional space in those lower zones for requests + * which really do need memory from those zones. It means that on a decent + * sized machine, GFP_HIGHMEM and GFP_KERNEL requests basically leave the DMA + * zone untouched. + */ +struct page * fastcall +__alloc_pages(unsigned int gfp_mask, unsigned int order, + struct zonelist *zonelist) +{ + const int wait = gfp_mask & __GFP_WAIT; + unsigned long min; + struct zone **zones; + struct page *page; + struct reclaim_state reclaim_state; + struct task_struct *p = current; + int i; + int alloc_type; + int do_retry; + + might_sleep_if(wait); + + zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ + if (zones[0] == NULL) /* no zones in the zonelist */ + return NULL; + + alloc_type = zone_idx(zones[0]); + + /* Go through the zonelist once, looking for a zone with enough free */ + for (i = 0; zones[i] != NULL; i++) { + struct zone *z = zones[i]; + + min = (1<protection[alloc_type]; + + /* + * We let real-time tasks dip their real-time paws a little + * deeper into reserves. + */ + if (rt_task(p)) + min -= z->pages_low >> 1; + + if (z->free_pages >= min || + (!wait && z->free_pages >= z->pages_high)) { + page = buffered_rmqueue(z, order, gfp_mask); + if (page) { + zone_statistics(zonelist, z); + goto got_pg; + } + } + } + + /* we're somewhat low on memory, failed to find what we needed */ + for (i = 0; zones[i] != NULL; i++) + wakeup_kswapd(zones[i]); + + /* Go through the zonelist again, taking __GFP_HIGH into account */ + for (i = 0; zones[i] != NULL; i++) { + struct zone *z = zones[i]; + + min = (1<protection[alloc_type]; + + if (gfp_mask & __GFP_HIGH) + min -= z->pages_low >> 2; + if (rt_task(p)) + min -= z->pages_low >> 1; + + if (z->free_pages >= min || + (!wait && z->free_pages >= z->pages_high)) { + page = buffered_rmqueue(z, order, gfp_mask); + if (page) { + zone_statistics(zonelist, z); + goto got_pg; + } + } + } + + /* here we're in the low on memory slow path */ + +rebalance: + if ((p->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) { + /* go through the zonelist yet again, ignoring mins */ + for (i = 0; zones[i] != NULL; i++) { + struct zone *z = zones[i]; + + page = buffered_rmqueue(z, order, gfp_mask); + if (page) { + zone_statistics(zonelist, z); + goto got_pg; + } + } + goto nopage; + } + + /* Atomic allocations - we can't balance anything */ + if (!wait) + goto nopage; + + p->flags |= PF_MEMALLOC; + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + + try_to_free_pages(zones, gfp_mask, order); + + p->reclaim_state = NULL; + p->flags &= ~PF_MEMALLOC; + + /* go through the zonelist yet one more time */ + for (i = 0; zones[i] != NULL; i++) { + struct zone *z = zones[i]; + + min = (1UL << order) + z->protection[alloc_type]; + + if (z->free_pages >= min || + (!wait && z->free_pages >= z->pages_high)) { + page = buffered_rmqueue(z, order, gfp_mask); + if (page) { + zone_statistics(zonelist, z); + goto got_pg; + } + } + } + + /* + * Don't let big-order allocations loop unless the caller explicitly + * requests that. Wait for some write requests to complete then retry. + * + * In this implementation, __GFP_REPEAT means __GFP_NOFAIL, but that + * may not be true in other implementations. + */ + do_retry = 0; + if (!(gfp_mask & __GFP_NORETRY)) { + if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) + do_retry = 1; + if (gfp_mask & __GFP_NOFAIL) + do_retry = 1; + } + if (do_retry) { + blk_congestion_wait(WRITE, HZ/50); + goto rebalance; + } + +nopage: + if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { + printk(KERN_WARNING "%s: page allocation failure." + " order:%d, mode:0x%x\n", + p->comm, order, gfp_mask); + dump_stack(); + } + return NULL; +got_pg: + kernel_map_pages(page, 1 << order, 1); + return page; +} + +EXPORT_SYMBOL(__alloc_pages); + +#ifdef CONFIG_NUMA +/* Early boot: Everything is done by one cpu, but the data structures will be + * used by all cpus - spread them on all nodes. + */ +static __init unsigned long get_boot_pages(unsigned int gfp_mask, unsigned int order) +{ +static int nodenr; + int i = nodenr; + struct page *page; + + for (;;) { + if (i > nodenr + numnodes) + return 0; + if (node_present_pages(i%numnodes)) { + struct zone **z; + /* The node contains memory. Check that there is + * memory in the intended zonelist. + */ + z = NODE_DATA(i%numnodes)->node_zonelists[gfp_mask & GFP_ZONEMASK].zones; + while (*z) { + if ( (*z)->free_pages > (1UL<= 0) + free_hot_cold_page(pvec->pages[i], pvec->cold); +} + +fastcall void __free_pages(struct page *page, unsigned int order) +{ + if (!PageReserved(page) && put_page_testzero(page)) { + if (order == 0) + free_hot_page(page); + else + __free_pages_ok(page, order); + } +} + +EXPORT_SYMBOL(__free_pages); + +fastcall void free_pages(unsigned long addr, unsigned int order) +{ + if (addr != 0) { + BUG_ON(!virt_addr_valid(addr)); + __free_pages(virt_to_page(addr), order); + } +} + +EXPORT_SYMBOL(free_pages); + +/* + * Total amount of free (allocatable) RAM: + */ +unsigned int nr_free_pages(void) +{ + unsigned int sum = 0; + struct zone *zone; + + for_each_zone(zone) + sum += zone->free_pages; + + return sum; +} + +EXPORT_SYMBOL(nr_free_pages); + +unsigned int nr_used_zone_pages(void) +{ + unsigned int pages = 0; + struct zone *zone; + + for_each_zone(zone) + pages += zone->nr_active + zone->nr_inactive; + + return pages; +} + +#ifdef CONFIG_NUMA +unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) +{ + unsigned int i, sum = 0; + + for (i = 0; i < MAX_NR_ZONES; i++) + sum += pgdat->node_zones[i].free_pages; + + return sum; +} +#endif + +static unsigned int nr_free_zone_pages(int offset) +{ + pg_data_t *pgdat; + unsigned int sum = 0; + + for_each_pgdat(pgdat) { + struct zonelist *zonelist = pgdat->node_zonelists + offset; + struct zone **zonep = zonelist->zones; + struct zone *zone; + + for (zone = *zonep++; zone; zone = *zonep++) { + unsigned long size = zone->present_pages; + unsigned long high = zone->pages_high; + if (size > high) + sum += size - high; + } + } + + return sum; +} + +/* + * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL + */ +unsigned int nr_free_buffer_pages(void) +{ + return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK); +} + +/* + * Amount of free RAM allocatable within all zones + */ +unsigned int nr_free_pagecache_pages(void) +{ + return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK); +} + +#ifdef CONFIG_HIGHMEM +unsigned int nr_free_highpages (void) +{ + pg_data_t *pgdat; + unsigned int pages = 0; + + for_each_pgdat(pgdat) + pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; + + return pages; +} +#endif + +#ifdef CONFIG_NUMA +static void show_node(struct zone *zone) +{ + printk("Node %d ", zone->zone_pgdat->node_id); +} +#else +#define show_node(zone) do { } while (0) +#endif + +/* + * Accumulate the page_state information across all CPUs. + * The result is unavoidably approximate - it can change + * during and after execution of this function. + */ +DEFINE_PER_CPU(struct page_state, page_states) = {0}; +EXPORT_PER_CPU_SYMBOL(page_states); + +atomic_t nr_pagecache = ATOMIC_INIT(0); +EXPORT_SYMBOL(nr_pagecache); +#ifdef CONFIG_SMP +DEFINE_PER_CPU(long, nr_pagecache_local) = 0; +#endif + +void __get_page_state(struct page_state *ret, int nr) +{ + int cpu = 0; + + memset(ret, 0, sizeof(*ret)); + while (cpu < NR_CPUS) { + unsigned long *in, *out, off; + + if (!cpu_possible(cpu)) { + cpu++; + continue; + } + + in = (unsigned long *)&per_cpu(page_states, cpu); + cpu++; + if (cpu < NR_CPUS && cpu_possible(cpu)) + prefetch(&per_cpu(page_states, cpu)); + out = (unsigned long *)ret; + for (off = 0; off < nr; off++) + *out++ += *in++; + } +} + +void get_page_state(struct page_state *ret) +{ + int nr; + + nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); + nr /= sizeof(unsigned long); + + __get_page_state(ret, nr + 1); +} + +void get_full_page_state(struct page_state *ret) +{ + __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long)); +} + +unsigned long __read_page_state(unsigned offset) +{ + unsigned long ret = 0; + int cpu; + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + unsigned long in; + + if (!cpu_possible(cpu)) + continue; + + in = (unsigned long)&per_cpu(page_states, cpu) + offset; + ret += *((unsigned long *)in); + } + return ret; +} + +void get_zone_counts(unsigned long *active, + unsigned long *inactive, unsigned long *free) +{ + struct zone *zone; + + *active = 0; + *inactive = 0; + *free = 0; + for_each_zone(zone) { + *active += zone->nr_active; + *inactive += zone->nr_inactive; + *free += zone->free_pages; + } +} + +void si_meminfo(struct sysinfo *val) +{ + val->totalram = totalram_pages; + val->sharedram = 0; + val->freeram = nr_free_pages(); + val->bufferram = nr_blockdev_pages(); +#ifdef CONFIG_HIGHMEM + val->totalhigh = totalhigh_pages; + val->freehigh = nr_free_highpages(); +#else + val->totalhigh = 0; + val->freehigh = 0; +#endif + val->mem_unit = PAGE_SIZE; +} + +EXPORT_SYMBOL(si_meminfo); + +#ifdef CONFIG_NUMA +void si_meminfo_node(struct sysinfo *val, int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + + val->totalram = pgdat->node_present_pages; + val->freeram = nr_free_pages_pgdat(pgdat); + val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; + val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; + val->mem_unit = PAGE_SIZE; +} +#endif + +#define K(x) ((x) << (PAGE_SHIFT-10)) + +/* + * Show free area list (used inside shift_scroll-lock stuff) + * We also calculate the percentage fragmentation. We do this by counting the + * memory on each free list with the exception of the first item on the list. + */ +void show_free_areas(void) +{ + struct page_state ps; + int cpu, temperature; + unsigned long active; + unsigned long inactive; + unsigned long free; + struct zone *zone; + + for_each_zone(zone) { + show_node(zone); + printk("%s per-cpu:", zone->name); + + if (!zone->present_pages) { + printk(" empty\n"); + continue; + } else + printk("\n"); + + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + struct per_cpu_pageset *pageset; + + if (!cpu_possible(cpu)) + continue; + + pageset = zone->pageset + cpu; + + for (temperature = 0; temperature < 2; temperature++) + printk("cpu %d %s: low %d, high %d, batch %d\n", + cpu, + temperature ? "cold" : "hot", + pageset->pcp[temperature].low, + pageset->pcp[temperature].high, + pageset->pcp[temperature].batch); + } + } + + get_page_state(&ps); + get_zone_counts(&active, &inactive, &free); + + printk("\nFree pages: %11ukB (%ukB HighMem)\n", + K(nr_free_pages()), + K(nr_free_highpages())); + + printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " + "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", + active, + inactive, + ps.nr_dirty, + ps.nr_writeback, + ps.nr_unstable, + nr_free_pages(), + ps.nr_slab, + ps.nr_mapped, + ps.nr_page_table_pages); + + for_each_zone(zone) { + int i; + + show_node(zone); + printk("%s" + " free:%lukB" + " min:%lukB" + " low:%lukB" + " high:%lukB" + " active:%lukB" + " inactive:%lukB" + " present:%lukB" + "\n", + zone->name, + K(zone->free_pages), + K(zone->pages_min), + K(zone->pages_low), + K(zone->pages_high), + K(zone->nr_active), + K(zone->nr_inactive), + K(zone->present_pages) + ); + printk("protections[]:"); + for (i = 0; i < MAX_NR_ZONES; i++) + printk(" %lu", zone->protection[i]); + printk("\n"); + } + + for_each_zone(zone) { + struct list_head *elem; + unsigned long nr, flags, order, total = 0; + + show_node(zone); + printk("%s: ", zone->name); + if (!zone->present_pages) { + printk("empty\n"); + continue; + } + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) { + nr = 0; + list_for_each(elem, &zone->free_area[order].free_list) + ++nr; + total += nr << order; + printk("%lu*%lukB ", nr, K(1UL) << order); + } + spin_unlock_irqrestore(&zone->lock, flags); + printk("= %lukB\n", K(total)); + } + + show_swap_cache_info(); +} + +/* + * Builds allocation fallback zone lists. + */ +static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) +{ + switch (k) { + struct zone *zone; + default: + BUG(); + case ZONE_HIGHMEM: + zone = pgdat->node_zones + ZONE_HIGHMEM; + if (zone->present_pages) { +#ifndef CONFIG_HIGHMEM + BUG(); +#endif + zonelist->zones[j++] = zone; + } + case ZONE_NORMAL: + zone = pgdat->node_zones + ZONE_NORMAL; + if (zone->present_pages) + zonelist->zones[j++] = zone; + case ZONE_DMA: + zone = pgdat->node_zones + ZONE_DMA; + if (zone->present_pages) + zonelist->zones[j++] = zone; + } + + return j; +} + +#ifdef CONFIG_NUMA +#define MAX_NODE_LOAD (numnodes) +static int __initdata node_load[MAX_NUMNODES]; +/** + * find_next_best_node - find the next node that should appear in a given + * node's fallback list + * @node: node whose fallback list we're appending + * @used_node_mask: pointer to the bitmap of already used nodes + * + * We use a number of factors to determine which is the next node that should + * appear on a given node's fallback list. The node should not have appeared + * already in @node's fallback list, and it should be the next closest node + * according to the distance array (which contains arbitrary distance values + * from each node to each node in the system), and should also prefer nodes + * with no CPUs, since presumably they'll have very little allocation pressure + * on them otherwise. + * It returns -1 if no node is found. + */ +static int __init find_next_best_node(int node, void *used_node_mask) +{ + int i, n, val; + int min_val = INT_MAX; + int best_node = -1; + + for (i = 0; i < numnodes; i++) { + cpumask_t tmp; + + /* Start from local node */ + n = (node+i)%numnodes; + + /* Don't want a node to appear more than once */ + if (test_bit(n, used_node_mask)) + continue; + + /* Use the distance array to find the distance */ + val = node_distance(node, n); + + /* Give preference to headless and unused nodes */ + tmp = node_to_cpumask(n); + if (!cpus_empty(tmp)) + val += PENALTY_FOR_NODE_WITH_CPUS; + + /* Slight preference for less loaded node */ + val *= (MAX_NODE_LOAD*MAX_NUMNODES); + val += node_load[n]; + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + if (best_node >= 0) + set_bit(best_node, used_node_mask); + + return best_node; +} + +static void __init build_zonelists(pg_data_t *pgdat) +{ + int i, j, k, node, local_node; + int prev_node, load; + struct zonelist *zonelist; + DECLARE_BITMAP(used_mask, MAX_NUMNODES); + + /* initialize zonelists */ + for (i = 0; i < MAX_NR_ZONES; i++) { + zonelist = pgdat->node_zonelists + i; + memset(zonelist, 0, sizeof(*zonelist)); + zonelist->zones[0] = NULL; + } + + /* NUMA-aware ordering of nodes */ + local_node = pgdat->node_id; + load = numnodes; + prev_node = local_node; + bitmap_zero(used_mask, MAX_NUMNODES); + while ((node = find_next_best_node(local_node, used_mask)) >= 0) { + /* + * We don't want to pressure a particular node. + * So adding penalty to the first node in same + * distance group to make it round-robin. + */ + if (node_distance(local_node, node) != + node_distance(local_node, prev_node)) + node_load[node] += load; + prev_node = node; + load--; + for (i = 0; i < MAX_NR_ZONES; i++) { + zonelist = pgdat->node_zonelists + i; + for (j = 0; zonelist->zones[j] != NULL; j++); + + k = ZONE_NORMAL; + if (i & __GFP_HIGHMEM) + k = ZONE_HIGHMEM; + if (i & __GFP_DMA) + k = ZONE_DMA; + + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + zonelist->zones[j] = NULL; + } + } +} + +#else /* CONFIG_NUMA */ + +static void __init build_zonelists(pg_data_t *pgdat) +{ + int i, j, k, node, local_node; + + local_node = pgdat->node_id; + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zonelist *zonelist; + + zonelist = pgdat->node_zonelists + i; + memset(zonelist, 0, sizeof(*zonelist)); + + j = 0; + k = ZONE_NORMAL; + if (i & __GFP_HIGHMEM) + k = ZONE_HIGHMEM; + if (i & __GFP_DMA) + k = ZONE_DMA; + + j = build_zonelists_node(pgdat, zonelist, j, k); + /* + * Now we build the zonelist so that it contains the zones + * of all the other nodes. + * We don't want to pressure a particular node, so when + * building the zones for node N, we make sure that the + * zones coming right after the local ones are those from + * node N+1 (modulo N) + */ + for (node = local_node + 1; node < numnodes; node++) + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + for (node = 0; node < local_node; node++) + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + + zonelist->zones[j] = NULL; + } +} + +#endif /* CONFIG_NUMA */ + +void __init build_all_zonelists(void) +{ + int i; + + for(i = 0 ; i < numnodes ; i++) + build_zonelists(NODE_DATA(i)); + printk("Built %i zonelists\n", numnodes); +} + +/* + * Helper functions to size the waitqueue hash table. + * Essentially these want to choose hash table sizes sufficiently + * large so that collisions trying to wait on pages are rare. + * But in fact, the number of active page waitqueues on typical + * systems is ridiculously low, less than 200. So this is even + * conservative, even though it seems large. + * + * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to + * waitqueues, i.e. the size of the waitq table given the number of pages. + */ +#define PAGES_PER_WAITQUEUE 256 + +static inline unsigned long wait_table_size(unsigned long pages) +{ + unsigned long size = 1; + + pages /= PAGES_PER_WAITQUEUE; + + while (size < pages) + size <<= 1; + + /* + * Once we have dozens or even hundreds of threads sleeping + * on IO we've got bigger problems than wait queue collision. + * Limit the size of the wait table to a reasonable size. + */ + size = min(size, 4096UL); + + return max(size, 4UL); +} + +/* + * This is an integer logarithm so that shifts can be used later + * to extract the more random high bits from the multiplicative + * hash function before the remainder is taken. + */ +static inline unsigned long wait_table_bits(unsigned long size) +{ + return ffz(~size); +} + +#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) + +static void __init calculate_zone_totalpages(struct pglist_data *pgdat, + unsigned long *zones_size, unsigned long *zholes_size) +{ + unsigned long realtotalpages, totalpages = 0; + int i; + + for (i = 0; i < MAX_NR_ZONES; i++) + totalpages += zones_size[i]; + pgdat->node_spanned_pages = totalpages; + + realtotalpages = totalpages; + if (zholes_size) + for (i = 0; i < MAX_NR_ZONES; i++) + realtotalpages -= zholes_size[i]; + pgdat->node_present_pages = realtotalpages; + printk("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); +} + + +/* + * Initially all pages are reserved - free ones are freed + * up by free_all_bootmem() once the early boot process is + * done. Non-atomic initialization, single-pass. + */ +void __init memmap_init_zone(struct page *start, unsigned long size, int nid, + unsigned long zone, unsigned long start_pfn) +{ + struct page *page; + + for (page = start; page < (start + size); page++) { + set_page_zone(page, NODEZONE(nid, zone)); + set_page_count(page, 0); + SetPageReserved(page); + INIT_LIST_HEAD(&page->lru); +#ifdef WANT_PAGE_VIRTUAL + /* The shift won't overflow because ZONE_NORMAL is below 4G. */ + if (zone != ZONE_HIGHMEM) + set_page_address(page, __va(start_pfn << PAGE_SHIFT)); +#endif + start_pfn++; + } +} + +#ifndef __HAVE_ARCH_MEMMAP_INIT +#define memmap_init(start, size, nid, zone, start_pfn) \ + memmap_init_zone((start), (size), (nid), (zone), (start_pfn)) +#endif + +/* + * Set up the zone data structures: + * - mark all pages reserved + * - mark all memory queues empty + * - clear the memory bitmaps + */ +static void __init free_area_init_core(struct pglist_data *pgdat, + unsigned long *zones_size, unsigned long *zholes_size) +{ + unsigned long i, j; + const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); + int cpu, nid = pgdat->node_id; + struct page *lmem_map = pgdat->node_mem_map; + unsigned long zone_start_pfn = pgdat->node_start_pfn; + + pgdat->nr_zones = 0; + init_waitqueue_head(&pgdat->kswapd_wait); + + for (j = 0; j < MAX_NR_ZONES; j++) { + struct zone *zone = pgdat->node_zones + j; + unsigned long size, realsize; + unsigned long batch; + + zone_table[NODEZONE(nid, j)] = zone; + realsize = size = zones_size[j]; + if (zholes_size) + realsize -= zholes_size[j]; + + zone->spanned_pages = size; + zone->present_pages = realsize; + zone->name = zone_names[j]; + spin_lock_init(&zone->lock); + spin_lock_init(&zone->lru_lock); + zone->zone_pgdat = pgdat; + zone->free_pages = 0; + + zone->temp_priority = zone->prev_priority = DEF_PRIORITY; + + /* + * The per-cpu-pages pools are set to around 1000th of the + * size of the zone. But no more than 1/4 of a meg - there's + * no point in going beyond the size of L2 cache. + * + * OK, so we don't know how big the cache is. So guess. + */ + batch = zone->present_pages / 1024; + if (batch * PAGE_SIZE > 256 * 1024) + batch = (256 * 1024) / PAGE_SIZE; + batch /= 4; /* We effectively *= 4 below */ + if (batch < 1) + batch = 1; + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + struct per_cpu_pages *pcp; + + pcp = &zone->pageset[cpu].pcp[0]; /* hot */ + pcp->count = 0; + pcp->low = 2 * batch; + pcp->high = 6 * batch; + pcp->batch = 1 * batch; + INIT_LIST_HEAD(&pcp->list); + + pcp = &zone->pageset[cpu].pcp[1]; /* cold */ + pcp->count = 0; + pcp->low = 0; + pcp->high = 2 * batch; + pcp->batch = 1 * batch; + INIT_LIST_HEAD(&pcp->list); + } + printk(" %s zone: %lu pages, LIFO batch:%lu\n", + zone_names[j], realsize, batch); + INIT_LIST_HEAD(&zone->active_list); + INIT_LIST_HEAD(&zone->inactive_list); + atomic_set(&zone->nr_scan_active, 0); + atomic_set(&zone->nr_scan_inactive, 0); + zone->nr_active = 0; + zone->nr_inactive = 0; + if (!size) + continue; + + /* + * The per-page waitqueue mechanism uses hashed waitqueues + * per zone. + */ + zone->wait_table_size = wait_table_size(size); + zone->wait_table_bits = + wait_table_bits(zone->wait_table_size); + zone->wait_table = (wait_queue_head_t *) + alloc_bootmem_node(pgdat, zone->wait_table_size + * sizeof(wait_queue_head_t)); + + for(i = 0; i < zone->wait_table_size; ++i) + init_waitqueue_head(zone->wait_table + i); + + pgdat->nr_zones = j+1; + + zone->zone_mem_map = lmem_map; + zone->zone_start_pfn = zone_start_pfn; + + if ((zone_start_pfn) & (zone_required_alignment-1)) + printk("BUG: wrong zone alignment, it will crash\n"); + + memmap_init(lmem_map, size, nid, j, zone_start_pfn); + + zone_start_pfn += size; + lmem_map += size; + + for (i = 0; ; i++) { + unsigned long bitmap_size; + + INIT_LIST_HEAD(&zone->free_area[i].free_list); + if (i == MAX_ORDER-1) { + zone->free_area[i].map = NULL; + break; + } + + /* + * Page buddy system uses "index >> (i+1)", + * where "index" is at most "size-1". + * + * The extra "+3" is to round down to byte + * size (8 bits per byte assumption). Thus + * we get "(size-1) >> (i+4)" as the last byte + * we can access. + * + * The "+1" is because we want to round the + * byte allocation up rather than down. So + * we should have had a "+7" before we shifted + * down by three. Also, we have to add one as + * we actually _use_ the last bit (it's [0,n] + * inclusive, not [0,n[). + * + * So we actually had +7+1 before we shift + * down by 3. But (n+8) >> 3 == (n >> 3) + 1 + * (modulo overflows, which we do not have). + * + * Finally, we LONG_ALIGN because all bitmap + * operations are on longs. + */ + bitmap_size = (size-1) >> (i+4); + bitmap_size = LONG_ALIGN(bitmap_size+1); + zone->free_area[i].map = + (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size); + } + } +} + +void __init free_area_init_node(int nid, struct pglist_data *pgdat, + struct page *node_mem_map, unsigned long *zones_size, + unsigned long node_start_pfn, unsigned long *zholes_size) +{ + unsigned long size; + + pgdat->node_id = nid; + pgdat->node_start_pfn = node_start_pfn; + calculate_zone_totalpages(pgdat, zones_size, zholes_size); + if (!node_mem_map) { + size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); + node_mem_map = alloc_bootmem_node(pgdat, size); + } + pgdat->node_mem_map = node_mem_map; + + free_area_init_core(pgdat, zones_size, zholes_size); +} + +#ifndef CONFIG_DISCONTIGMEM +static bootmem_data_t contig_bootmem_data; +struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; + +EXPORT_SYMBOL(contig_page_data); + +void __init free_area_init(unsigned long *zones_size) +{ + free_area_init_node(0, &contig_page_data, NULL, zones_size, + __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); + mem_map = contig_page_data.node_mem_map; +} +#endif + +#ifdef CONFIG_PROC_FS + +#include + +static void *frag_start(struct seq_file *m, loff_t *pos) +{ + pg_data_t *pgdat; + loff_t node = *pos; + + for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) + --node; + + return pgdat; +} + +static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + (*pos)++; + return pgdat->pgdat_next; +} + +static void frag_stop(struct seq_file *m, void *arg) +{ +} + +/* + * This walks the freelist for each zone. Whilst this is slow, I'd rather + * be slow here than slow down the fast path by keeping stats - mjbligh + */ +static int frag_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + struct zone *zone; + struct zone *node_zones = pgdat->node_zones; + unsigned long flags; + int order; + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { + if (!zone->present_pages) + continue; + + spin_lock_irqsave(&zone->lock, flags); + seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); + for (order = 0; order < MAX_ORDER; ++order) { + unsigned long nr_bufs = 0; + struct list_head *elem; + + list_for_each(elem, &(zone->free_area[order].free_list)) + ++nr_bufs; + seq_printf(m, "%6lu ", nr_bufs); + } + spin_unlock_irqrestore(&zone->lock, flags); + seq_putc(m, '\n'); + } + return 0; +} + +struct seq_operations fragmentation_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = frag_show, +}; + +static char *vmstat_text[] = { + "nr_dirty", + "nr_writeback", + "nr_unstable", + "nr_page_table_pages", + "nr_mapped", + "nr_slab", + + "pgpgin", + "pgpgout", + "pswpin", + "pswpout", + "pgalloc_high", + + "pgalloc_normal", + "pgalloc_dma", + "pgfree", + "pgactivate", + "pgdeactivate", + + "pgfault", + "pgmajfault", + "pgrefill_high", + "pgrefill_normal", + "pgrefill_dma", + + "pgsteal_high", + "pgsteal_normal", + "pgsteal_dma", + "pgscan_kswapd_high", + "pgscan_kswapd_normal", + + "pgscan_kswapd_dma", + "pgscan_direct_high", + "pgscan_direct_normal", + "pgscan_direct_dma", + "pginodesteal", + + "slabs_scanned", + "kswapd_steal", + "kswapd_inodesteal", + "pageoutrun", + "allocstall", + + "pgrotated", +}; + +static void *vmstat_start(struct seq_file *m, loff_t *pos) +{ + struct page_state *ps; + + if (*pos >= ARRAY_SIZE(vmstat_text)) + return NULL; + + ps = kmalloc(sizeof(*ps), GFP_KERNEL); + m->private = ps; + if (!ps) + return ERR_PTR(-ENOMEM); + get_full_page_state(ps); + ps->pgpgin /= 2; /* sectors -> kbytes */ + ps->pgpgout /= 2; + return (unsigned long *)ps + *pos; +} + +static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) +{ + (*pos)++; + if (*pos >= ARRAY_SIZE(vmstat_text)) + return NULL; + return (unsigned long *)m->private + *pos; +} + +static int vmstat_show(struct seq_file *m, void *arg) +{ + unsigned long *l = arg; + unsigned long off = l - (unsigned long *)m->private; + + seq_printf(m, "%s %lu\n", vmstat_text[off], *l); + return 0; +} + +static void vmstat_stop(struct seq_file *m, void *arg) +{ + kfree(m->private); + m->private = NULL; +} + +struct seq_operations vmstat_op = { + .start = vmstat_start, + .next = vmstat_next, + .stop = vmstat_stop, + .show = vmstat_show, +}; + +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_HOTPLUG_CPU +static int page_alloc_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (unsigned long)hcpu; + long *count; + + if (action == CPU_DEAD) { + /* Drain local pagecache count. */ + count = &per_cpu(nr_pagecache_local, cpu); + atomic_add(*count, &nr_pagecache); + *count = 0; + local_irq_disable(); + __drain_pages(cpu); + local_irq_enable(); + } + return NOTIFY_OK; +} +#endif /* CONFIG_HOTPLUG_CPU */ + +void __init page_alloc_init(void) +{ + hotcpu_notifier(page_alloc_cpu_notify, 0); +} + +static unsigned long higherzone_val(struct zone *z, int max_zone, + int alloc_type) +{ + int z_idx = zone_idx(z); + struct zone *higherzone; + unsigned long pages; + + /* there is no higher zone to get a contribution from */ + if (z_idx == MAX_NR_ZONES-1) + return 0; + + higherzone = &z->zone_pgdat->node_zones[z_idx+1]; + + /* We always start with the higher zone's protection value */ + pages = higherzone->protection[alloc_type]; + + /* + * We get a lower-zone-protection contribution only if there are + * pages in the higher zone and if we're not the highest zone + * in the current zonelist. e.g., never happens for GFP_DMA. Happens + * only for ZONE_DMA in a GFP_KERNEL allocation and happens for ZONE_DMA + * and ZONE_NORMAL for a GFP_HIGHMEM allocation. + */ + if (higherzone->present_pages && z_idx < alloc_type) + pages += higherzone->pages_low * sysctl_lower_zone_protection; + + return pages; +} + +/* + * setup_per_zone_protection - called whenver min_free_kbytes or + * sysctl_lower_zone_protection changes. Ensures that each zone + * has a correct pages_protected value, so an adequate number of + * pages are left in the zone after a successful __alloc_pages(). + * + * This algorithm is way confusing. I tries to keep the same behavior + * as we had with the incremental min iterative algorithm. + */ +static void setup_per_zone_protection(void) +{ + struct pglist_data *pgdat; + struct zone *zones, *zone; + int max_zone; + int i, j; + + for_each_pgdat(pgdat) { + zones = pgdat->node_zones; + + for (i = 0, max_zone = 0; i < MAX_NR_ZONES; i++) + if (zones[i].present_pages) + max_zone = i; + + /* + * For each of the different allocation types: + * GFP_DMA -> GFP_KERNEL -> GFP_HIGHMEM + */ + for (i = 0; i < MAX_NR_ZONES; i++) { + /* + * For each of the zones: + * ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA + */ + for (j = MAX_NR_ZONES-1; j >= 0; j--) { + zone = &zones[j]; + + /* + * We never protect zones that don't have memory + * in them (j>max_zone) or zones that aren't in + * the zonelists for a certain type of + * allocation (j>i). We have to assign these to + * zero because the lower zones take + * contributions from the higher zones. + */ + if (j > max_zone || j > i) { + zone->protection[i] = 0; + continue; + } + /* + * The contribution of the next higher zone + */ + zone->protection[i] = higherzone_val(zone, + max_zone, i); + zone->protection[i] += zone->pages_low; + } + } + } +} + +/* + * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures + * that the pages_{min,low,high} values for each zone are set correctly + * with respect to min_free_kbytes. + */ +static void setup_per_zone_pages_min(void) +{ + unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); + unsigned long lowmem_pages = 0; + struct zone *zone; + unsigned long flags; + + /* Calculate total number of !ZONE_HIGHMEM pages */ + for_each_zone(zone) { + if (!is_highmem(zone)) + lowmem_pages += zone->present_pages; + } + + for_each_zone(zone) { + spin_lock_irqsave(&zone->lru_lock, flags); + if (is_highmem(zone)) { + /* + * Often, highmem doesn't need to reserve any pages. + * But the pages_min/low/high values are also used for + * batching up page reclaim activity so we need a + * decent value here. + */ + int min_pages; + + min_pages = zone->present_pages / 1024; + if (min_pages < SWAP_CLUSTER_MAX) + min_pages = SWAP_CLUSTER_MAX; + if (min_pages > 128) + min_pages = 128; + zone->pages_min = min_pages; + } else { + /* if it's a lowmem zone, reserve a number of pages + * proportionate to the zone's size. + */ + zone->pages_min = (pages_min * zone->present_pages) / + lowmem_pages; + } + + zone->pages_low = zone->pages_min * 2; + zone->pages_high = zone->pages_min * 3; + spin_unlock_irqrestore(&zone->lru_lock, flags); + } +} + +/* + * Initialise min_free_kbytes. + * + * For small machines we want it small (128k min). For large machines + * we want it large (16MB max). But it is not linear, because network + * bandwidth does not increase linearly with machine size. We use + * + * min_free_kbytes = sqrt(lowmem_kbytes) + * + * which yields + * + * 16MB: 128k + * 32MB: 181k + * 64MB: 256k + * 128MB: 362k + * 256MB: 512k + * 512MB: 724k + * 1024MB: 1024k + * 2048MB: 1448k + * 4096MB: 2048k + * 8192MB: 2896k + * 16384MB: 4096k + */ +static int __init init_per_zone_pages_min(void) +{ + unsigned long lowmem_kbytes; + + lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); + + min_free_kbytes = int_sqrt(lowmem_kbytes); + if (min_free_kbytes < 128) + min_free_kbytes = 128; + if (min_free_kbytes > 16384) + min_free_kbytes = 16384; + setup_per_zone_pages_min(); + setup_per_zone_protection(); + return 0; +} +module_init(init_per_zone_pages_min) + +/* + * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so + * that we can call two helper functions whenever min_free_kbytes + * changes. + */ +int min_free_kbytes_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length) +{ + proc_dointvec(table, write, file, buffer, length); + setup_per_zone_pages_min(); + setup_per_zone_protection(); + return 0; +} + +/* + * lower_zone_protection_sysctl_handler - just a wrapper around + * proc_dointvec() so that we can call setup_per_zone_protection() + * whenever sysctl_lower_zone_protection changes. + */ +int lower_zone_protection_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length) +{ + proc_dointvec_minmax(table, write, file, buffer, length); + setup_per_zone_protection(); + return 0; +} -- 2.47.0