fedora core 6 1.2949 + vserver 2.2.0
[linux-2.6.git] mm/nommu.c
index ef55210..87e14d6 100644
@@ -44,10 +44,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
 int heap_stack_gap = 0;
 
 EXPORT_SYMBOL(mem_map);
-EXPORT_SYMBOL(sysctl_max_map_count);
-EXPORT_SYMBOL(sysctl_overcommit_memory);
-EXPORT_SYMBOL(sysctl_overcommit_ratio);
-EXPORT_SYMBOL(vm_committed_space);
 EXPORT_SYMBOL(__vm_enough_memory);
 
 /* list of shareable VMAs */
@@ -57,6 +53,12 @@ DECLARE_RWSEM(nommu_vma_sem);
 struct vm_operations_struct generic_file_vm_ops = {
 };
 
+EXPORT_SYMBOL(vfree);
+EXPORT_SYMBOL(vmalloc_to_page);
+EXPORT_SYMBOL(vmalloc_32);
+EXPORT_SYMBOL(vmap);
+EXPORT_SYMBOL(vunmap);
+
 /*
  * Handle all mappings that got truncated by a "truncate()"
  * system call.
@@ -120,28 +122,54 @@ unsigned int kobjsize(const void *objp)
 }
 
 /*
- * The nommu dodgy version :-)
+ * get a list of pages in an address range belonging to the specified process
+ * and indicate the VMA that covers each page
+ * - this is potentially dodgy as we may end up incrementing the page count of a
+ *   slab page or a secondary page from a compound page
+ * - don't permit access to VMAs that don't support it, such as I/O mappings
  */
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
        unsigned long start, int len, int write, int force,
        struct page **pages, struct vm_area_struct **vmas)
 {
+       struct vm_area_struct *vma;
+       unsigned long vm_flags;
        int i;
-       static struct vm_area_struct dummy_vma;
+
+       /* calculate required read or write permissions.
+        * - if 'force' is set, we only require the "MAY" flags.
+        */
+       vm_flags  = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+       vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
 
        for (i = 0; i < len; i++) {
+               vma = find_vma(mm, start);
+               if (!vma)
+                       goto finish_or_fault;
+
+               /* protect what we can, including chardevs */
+               if (vma->vm_flags & (VM_IO | VM_PFNMAP) ||
+                   !(vm_flags & vma->vm_flags))
+                       goto finish_or_fault;
+
                if (pages) {
                        pages[i] = virt_to_page(start);
                        if (pages[i])
                                page_cache_get(pages[i]);
                }
                if (vmas)
-                       vmas[i] = &dummy_vma;
+                       vmas[i] = vma;
                start += PAGE_SIZE;
        }
-       return(i);
+
+       return i;
+
+finish_or_fault:
+       return i ? : -EFAULT;
 }
 
+EXPORT_SYMBOL(get_user_pages);
+
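
The mask computed at the top of get_user_pages() collapses the write/force arguments into a set of VM_* bits that every VMA in the range must advertise; if the intersection with vma->vm_flags is empty, the walk stops and returns however many pages were already handled (or -EFAULT if none). A standalone sketch of that bit logic, using stand-in flag values rather than the kernel's own definitions:

#include <stdio.h>

/* stand-in values for illustration only; the kernel has its own VM_* bits */
#define VM_READ     0x01
#define VM_WRITE    0x02
#define VM_MAYREAD  0x10
#define VM_MAYWRITE 0x20

/* mirror the mask computation above: normally require the READ/WRITE bit,
 * but when 'force' is set accept a VMA that merely *may* allow the access */
static unsigned long required_flags(int write, int force)
{
	unsigned long f = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
	f &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
	return f;
}

int main(void)
{
	/* a read-only mapping that could become writable (e.g. private COW) */
	unsigned long vma_flags = VM_READ | VM_MAYREAD | VM_MAYWRITE;

	printf("forced write permitted: %d\n", !!(required_flags(1, 1) & vma_flags));
	printf("plain write permitted:  %d\n", !!(required_flags(1, 0) & vma_flags));
	return 0;
}
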
 DEFINE_RWLOCK(vmlist_lock);
 struct vm_struct *vmlist;
 
@@ -150,12 +178,12 @@ void vfree(void *addr)
        kfree(addr);
 }
 
-void *__vmalloc(unsigned long size, int gfp_mask, pgprot_t prot)
+void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 {
        /*
         * kmalloc doesn't like __GFP_HIGHMEM for some reason
         */
-       return kmalloc(size, gfp_mask & ~__GFP_HIGHMEM);
+       return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
 }
 
 struct page * vmalloc_to_page(void *addr)
@@ -193,13 +221,20 @@ long vwrite(char *buf, char *addr, unsigned long count)
  *     Allocate enough pages to cover @size from the page level
 *     allocator and map them into contiguous kernel virtual space.
  *
- *     For tight cotrol over page level allocator and protection flags
+ *     For tight control over page level allocator and protection flags
  *     use __vmalloc() instead.
  */
 void *vmalloc(unsigned long size)
 {
        return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
 }
+EXPORT_SYMBOL(vmalloc);
+
+void *vmalloc_node(unsigned long size, int node)
+{
+       return vmalloc(size);
+}
+EXPORT_SYMBOL(vmalloc_node);
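
On !MMU there is no separate vmalloc address space: __vmalloc() above simply forwards to kmalloc() (adding __GFP_COMP, presumably so multi-page allocations behave as a single compound page), so vmalloc() memory is physically contiguous and vmalloc_node() ignores the node hint. A minimal module-style sketch (not part of this patch) of a caller using these exports:

#include <linux/init.h>
#include <linux/module.h>
#include <linux/vmalloc.h>

static void *buf;

static int __init vmalloc_demo_init(void)
{
	/* on a nommu kernel this comes straight from kmalloc(), so the
	 * buffer is physically contiguous as well as virtually contiguous */
	buf = vmalloc(8192);
	if (!buf)
		return -ENOMEM;
	return 0;
}

static void __exit vmalloc_demo_exit(void)
{
	vfree(buf);
}

module_init(vmalloc_demo_init);
module_exit(vmalloc_demo_exit);
MODULE_LICENSE("GPL");
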
 
 /*
 *     vmalloc_32  -  allocate virtually contiguous memory (32bit addressable)
@@ -256,29 +291,6 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
        return mm->brk = brk;
 }
 
-/*
- * Combine the mmap "prot" and "flags" argument into one "vm_flags" used
- * internally. Essentially, translate the "PROT_xxx" and "MAP_xxx" bits
- * into "VM_xxx".
- */
-static inline unsigned long calc_vm_flags(unsigned long prot, unsigned long flags)
-{
-#define _trans(x,bit1,bit2) \
-((bit1==bit2)?(x&bit1):(x&bit1)?bit2:0)
-
-       unsigned long prot_bits, flag_bits;
-       prot_bits =
-               _trans(prot, PROT_READ, VM_READ) |
-               _trans(prot, PROT_WRITE, VM_WRITE) |
-               _trans(prot, PROT_EXEC, VM_EXEC);
-       flag_bits =
-               _trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN) |
-               _trans(flags, MAP_DENYWRITE, VM_DENYWRITE) |
-               _trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE);
-       return prot_bits | flag_bits;
-#undef _trans
-}
-
 #ifdef DEBUG
 static void show_process_blocks(void)
 {
@@ -298,6 +310,77 @@ static void show_process_blocks(void)
 }
 #endif /* DEBUG */
 
+/*
+ * add a VMA into a process's mm_struct in the appropriate place in the list
+ * - should be called with mm->mmap_sem held writelocked
+ */
+static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml)
+{
+       struct vm_list_struct **ppv;
+
+       for (ppv = &current->mm->context.vmlist; *ppv; ppv = &(*ppv)->next)
+               if ((*ppv)->vma->vm_start > vml->vma->vm_start)
+                       break;
+
+       vml->next = *ppv;
+       *ppv = vml;
+}
+
+/*
+ * look up the first VMA in which addr resides, NULL if none
+ * - should be called with mm->mmap_sem at least held readlocked
+ */
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+{
+       struct vm_list_struct *loop, *vml;
+
+       /* search the vm_start ordered list */
+       vml = NULL;
+       for (loop = mm->context.vmlist; loop; loop = loop->next) {
+               if (loop->vma->vm_start > addr)
+                       break;
+               vml = loop;
+       }
+
+       if (vml && vml->vma->vm_end > addr)
+               return vml->vma;
+
+       return NULL;
+}
+EXPORT_SYMBOL(find_vma);
+
+/*
+ * find a VMA
+ * - we don't extend stack VMAs under NOMMU conditions
+ */
+struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
+{
+       return find_vma(mm, addr);
+}
+
+/*
+ * look up the first VMA that exactly matches addr
+ * - should be called with mm->mmap_sem at least held readlocked
+ */
+static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
+                                                   unsigned long addr)
+{
+       struct vm_list_struct *vml;
+
+       /* search the vm_start ordered list */
+       for (vml = mm->context.vmlist; vml; vml = vml->next) {
+               if (vml->vma->vm_start == addr)
+                       return vml->vma;
+               if (vml->vma->vm_start > addr)
+                       break;
+       }
+
+       return NULL;
+}
+
+/*
+ * find a VMA in the global tree
+ */
 static inline struct vm_area_struct *find_nommu_vma(unsigned long start)
 {
        struct vm_area_struct *vma;
@@ -317,6 +400,9 @@ static inline struct vm_area_struct *find_nommu_vma(unsigned long start)
        return NULL;
 }
 
+/*
+ * add a VMA in the global tree
+ */
 static void add_nommu_vma(struct vm_area_struct *vma)
 {
        struct vm_area_struct *pvma;
@@ -363,6 +449,9 @@ static void add_nommu_vma(struct vm_area_struct *vma)
        rb_insert_color(&vma->vm_rb, &nommu_vma_tree);
 }
 
+/*
+ * delete a VMA from the global list
+ */
 static void delete_nommu_vma(struct vm_area_struct *vma)
 {
        struct address_space *mapping;
@@ -381,145 +470,352 @@ static void delete_nommu_vma(struct vm_area_struct *vma)
 }
 
 /*
- * handle mapping creation for uClinux
+ * determine whether a mapping should be permitted and, if so, what sort of
+ * mapping we're capable of supporting
  */
-unsigned long do_mmap_pgoff(struct file *file,
-                           unsigned long addr,
-                           unsigned long len,
-                           unsigned long prot,
-                           unsigned long flags,
-                           unsigned long pgoff)
+static int validate_mmap_request(struct file *file,
+                                unsigned long addr,
+                                unsigned long len,
+                                unsigned long prot,
+                                unsigned long flags,
+                                unsigned long pgoff,
+                                unsigned long *_capabilities)
 {
-       struct vm_list_struct *vml = NULL;
-       struct vm_area_struct *vma = NULL;
-       struct rb_node *rb;
-       unsigned int vm_flags;
-       void *result;
-       int ret, membacked;
+       unsigned long capabilities;
+       unsigned long reqprot = prot;
+       int ret;
 
        /* do the simple checks first */
        if (flags & MAP_FIXED || addr) {
-               printk(KERN_DEBUG "%d: Can't do fixed-address/overlay mmap of RAM\n",
+               printk(KERN_DEBUG
+                      "%d: Can't do fixed-address/overlay mmap of RAM\n",
                       current->pid);
                return -EINVAL;
        }
 
-       if (PAGE_ALIGN(len) == 0)
-               return addr;
+       if ((flags & MAP_TYPE) != MAP_PRIVATE &&
+           (flags & MAP_TYPE) != MAP_SHARED)
+               return -EINVAL;
 
-       if (len > TASK_SIZE)
+       if (!len)
                return -EINVAL;
 
+       /* Careful about overflows.. */
+       len = PAGE_ALIGN(len);
+       if (!len || len > TASK_SIZE)
+               return -ENOMEM;
+
        /* offset overflow? */
        if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
-               return -EINVAL;
+               return -EOVERFLOW;
 
-       /* validate file mapping requests */
-       membacked = 0;
        if (file) {
+               /* validate file mapping requests */
+               struct address_space *mapping;
+
                /* files must support mmap */
                if (!file->f_op || !file->f_op->mmap)
                        return -ENODEV;
 
-               if ((prot & PROT_EXEC) &&
-                   (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))
-                       return -EPERM;
-
                /* work out if what we've got could possibly be shared
                 * - we support chardevs that provide their own "memory"
                 * - we support files/blockdevs that are memory backed
                 */
-               if (S_ISCHR(file->f_dentry->d_inode->i_mode)) {
-                       membacked = 1;
-               }
-               else {
-                       struct address_space *mapping = file->f_mapping;
-                       if (!mapping)
-                               mapping = file->f_dentry->d_inode->i_mapping;
-                       if (mapping && mapping->backing_dev_info)
-                               membacked = mapping->backing_dev_info->memory_backed;
+               mapping = file->f_mapping;
+               if (!mapping)
+                       mapping = file->f_path.dentry->d_inode->i_mapping;
+
+               capabilities = 0;
+               if (mapping && mapping->backing_dev_info)
+                       capabilities = mapping->backing_dev_info->capabilities;
+
+               if (!capabilities) {
+                       /* no explicit capabilities set, so assume some
+                        * defaults */
+                       switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) {
+                       case S_IFREG:
+                       case S_IFBLK:
+                               capabilities = BDI_CAP_MAP_COPY;
+                               break;
+
+                       case S_IFCHR:
+                               capabilities =
+                                       BDI_CAP_MAP_DIRECT |
+                                       BDI_CAP_READ_MAP |
+                                       BDI_CAP_WRITE_MAP;
+                               break;
+
+                       default:
+                               return -EINVAL;
+                       }
                }
 
+               /* eliminate any capabilities that we can't support on this
+                * device */
+               if (!file->f_op->get_unmapped_area)
+                       capabilities &= ~BDI_CAP_MAP_DIRECT;
+               if (!file->f_op->read)
+                       capabilities &= ~BDI_CAP_MAP_COPY;
+
                if (flags & MAP_SHARED) {
                        /* do checks for writing, appending and locking */
-                       if ((prot & PROT_WRITE) && !(file->f_mode & FMODE_WRITE))
+                       if ((prot & PROT_WRITE) &&
+                           !(file->f_mode & FMODE_WRITE))
                                return -EACCES;
 
-                       if (IS_APPEND(file->f_dentry->d_inode) &&
+                       if (IS_APPEND(file->f_path.dentry->d_inode) &&
                            (file->f_mode & FMODE_WRITE))
                                return -EACCES;
 
-                       if (locks_verify_locked(file->f_dentry->d_inode))
+                       if (locks_verify_locked(file->f_path.dentry->d_inode))
                                return -EAGAIN;
 
-                       if (!membacked) {
+                       if (!(capabilities & BDI_CAP_MAP_DIRECT))
+                               return -ENODEV;
+
+                       if (((prot & PROT_READ)  && !(capabilities & BDI_CAP_READ_MAP))  ||
+                           ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
+                           ((prot & PROT_EXEC)  && !(capabilities & BDI_CAP_EXEC_MAP))
+                           ) {
                                printk("MAP_SHARED not completely supported on !MMU\n");
                                return -EINVAL;
                        }
 
-                       /* we require greater support from the driver or
-                        * filesystem - we ask it to tell us what memory to
-                        * use */
-                       if (!file->f_op->get_unmapped_area)
-                               return -ENODEV;
+                       /* we mustn't privatise shared mappings */
+                       capabilities &= ~BDI_CAP_MAP_COPY;
                }
                else {
-                       /* we read private files into memory we allocate */
-                       if (!file->f_op->read)
+                       /* we're going to read the file into private memory we
+                        * allocate */
+                       if (!(capabilities & BDI_CAP_MAP_COPY))
                                return -ENODEV;
+
+                       /* we don't permit a private writable mapping to be
+                        * shared with the backing device */
+                       if (prot & PROT_WRITE)
+                               capabilities &= ~BDI_CAP_MAP_DIRECT;
+               }
+
+               /* handle executable mappings and implied executable
+                * mappings */
+               if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
+                       if (prot & PROT_EXEC)
+                               return -EPERM;
+               }
+               else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
+                       /* handle implication of PROT_EXEC by PROT_READ */
+                       if (current->personality & READ_IMPLIES_EXEC) {
+                               if (capabilities & BDI_CAP_EXEC_MAP)
+                                       prot |= PROT_EXEC;
+                       }
+               }
+               else if ((prot & PROT_READ) &&
+                        (prot & PROT_EXEC) &&
+                        !(capabilities & BDI_CAP_EXEC_MAP)
+                        ) {
+                       /* backing file is not executable, try to copy */
+                       capabilities &= ~BDI_CAP_MAP_DIRECT;
                }
        }
+       else {
+               /* anonymous mappings are always memory backed and can be
+                * privately mapped
+                */
+               capabilities = BDI_CAP_MAP_COPY;
 
-       /* handle PROT_EXEC implication by PROT_READ */
-       if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
-               if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)))
+               /* handle PROT_EXEC implication by PROT_READ */
+               if ((prot & PROT_READ) &&
+                   (current->personality & READ_IMPLIES_EXEC))
                        prot |= PROT_EXEC;
+       }
 
-       /* do simple checking here so the lower-level routines won't have
-        * to. we assume access permissions have been handled by the open
-        * of the memory object, so we don't do any here.
-        */
-       vm_flags = calc_vm_flags(prot,flags) /* | mm->def_flags */
-               | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+       /* allow the security API to have its say */
+       ret = security_file_mmap(file, reqprot, prot, flags);
+       if (ret < 0)
+               return ret;
 
-       if (!membacked) {
-               /* share any file segment that's mapped read-only */
-               if (((flags & MAP_PRIVATE) && !(prot & PROT_WRITE) && file) ||
-                   ((flags & MAP_SHARED) && !(prot & PROT_WRITE) && file))
-                       vm_flags |= VM_MAYSHARE;
+       /* looks okay */
+       *_capabilities = capabilities;
+       return 0;
+}
 
-               /* refuse to let anyone share files with this process if it's being traced -
-                * otherwise breakpoints set in it may interfere with another untraced process
-                */
-               if (current->ptrace & PT_PTRACED)
-                       vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
+/*
+ * we've determined that we can make the mapping, now translate what we
+ * now know into VMA flags
+ */
+static unsigned long determine_vm_flags(struct file *file,
+                                       unsigned long prot,
+                                       unsigned long flags,
+                                       unsigned long capabilities)
+{
+       unsigned long vm_flags;
+
+       vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
+       vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+       /* vm_flags |= mm->def_flags; */
+
+       if (!(capabilities & BDI_CAP_MAP_DIRECT)) {
+               /* attempt to share read-only copies of mapped file chunks */
+               if (file && !(prot & PROT_WRITE))
+                       vm_flags |= VM_MAYSHARE;
        }
        else {
-               /* permit sharing of character devices and ramfs files at any time for
-                * anything other than a privately writable mapping
-                */
-               if (!(flags & MAP_PRIVATE) || !(prot & PROT_WRITE)) {
+               /* overlay a shareable mapping on the backing device or inode
+                * if possible - used for chardevs, ramfs/tmpfs/shmfs and
+                * romfs/cramfs */
+               if (flags & MAP_SHARED)
+                       vm_flags |= VM_MAYSHARE | VM_SHARED;
+               else if ((((vm_flags & capabilities) ^ vm_flags) & BDI_CAP_VMFLAGS) == 0)
                        vm_flags |= VM_MAYSHARE;
-                       if (flags & MAP_SHARED)
-                               vm_flags |= VM_SHARED;
+       }
+
+       /* refuse to let anyone share private mappings with this process if
+        * it's being traced - otherwise breakpoints set in it may interfere
+        * with another untraced process
+        */
+       if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED))
+               vm_flags &= ~VM_MAYSHARE;
+
+       return vm_flags;
+}
+
+/*
+ * set up a shared mapping on a file
+ */
+static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len)
+{
+       int ret;
+
+       ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
+       if (ret != -ENOSYS)
+               return ret;
+
+       /* getting an ENOSYS error indicates that direct mmap isn't
+        * possible (as opposed to tried but failed) so we'll fall
+        * through to making a private copy of the data and mapping
+        * that if we can */
+       return -ENODEV;
+}
+
+/*
+ * set up a private mapping or an anonymous shared mapping
+ */
+static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
+{
+       void *base;
+       int ret;
+
+       /* invoke the file's mapping function so that it can keep track of
+        * shared mappings on devices or memory
+        * - VM_MAYSHARE will be set if it may attempt to share
+        */
+       if (vma->vm_file) {
+               ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
+               if (ret != -ENOSYS) {
+                       /* shouldn't return success if we're not sharing */
+                       BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE));
+                       return ret; /* success or a real error */
                }
+
+               /* getting an ENOSYS error indicates that direct mmap isn't
+                * possible (as opposed to tried but failed) so we'll try to
+                * make a private copy of the data and map that instead */
        }
 
-       /* allow the security API to have its say */
-       ret = security_file_mmap(file, prot, flags);
-       if (ret)
+       /* allocate some memory to hold the mapping
+        * - note that this may not return a page-aligned address if the object
+        *   we're allocating is smaller than a page
+        */
+       base = kmalloc(len, GFP_KERNEL|__GFP_COMP);
+       if (!base)
+               goto enomem;
+
+       vma->vm_start = (unsigned long) base;
+       vma->vm_end = vma->vm_start + len;
+       vma->vm_flags |= VM_MAPPED_COPY;
+
+#ifdef WARN_ON_SLACK
+       if (len + WARN_ON_SLACK <= kobjsize(base))
+               printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n",
+                      len, current->pid, kobjsize(base) - len);
+#endif
+
+       if (vma->vm_file) {
+               /* read the contents of a file into the copy */
+               mm_segment_t old_fs;
+               loff_t fpos;
+
+               fpos = vma->vm_pgoff;
+               fpos <<= PAGE_SHIFT;
+
+               old_fs = get_fs();
+               set_fs(KERNEL_DS);
+               ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos);
+               set_fs(old_fs);
+
+               if (ret < 0)
+                       goto error_free;
+
+               /* clear the last little bit */
+               if (ret < len)
+                       memset(base + ret, 0, len - ret);
+
+       } else {
+               /* if it's an anonymous mapping, then just clear it */
+               memset(base, 0, len);
+       }
+
+       return 0;
+
+error_free:
+       kfree(base);
+       vma->vm_start = 0;
+       return ret;
+
+enomem:
+       printk("Allocation of length %lu from process %d failed\n",
+              len, current->pid);
+       show_free_areas();
+       return -ENOMEM;
+}
+
+/*
+ * handle mapping creation for uClinux
+ */
+unsigned long do_mmap_pgoff(struct file *file,
+                           unsigned long addr,
+                           unsigned long len,
+                           unsigned long prot,
+                           unsigned long flags,
+                           unsigned long pgoff)
+{
+       struct vm_list_struct *vml = NULL;
+       struct vm_area_struct *vma = NULL;
+       struct rb_node *rb;
+       unsigned long capabilities, vm_flags;
+       void *result;
+       int ret;
+
+       /* decide whether we should attempt the mapping, and if so what sort of
+        * mapping */
+       ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
+                                   &capabilities);
+       if (ret < 0)
                return ret;
 
+       /* we've determined that we can make the mapping, now translate what we
+        * now know into VMA flags */
+       vm_flags = determine_vm_flags(file, prot, flags, capabilities);
+
        /* we're going to need to record the mapping if it works */
-       vml = kmalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
+       vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
        if (!vml)
                goto error_getting_vml;
-       memset(vml, 0, sizeof(*vml));
 
        down_write(&nommu_vma_sem);
 
-       /* if we want to share, we need to search for VMAs created by another
-        * mmap() call that overlap with our proposed mapping
+       /* if we want to share, we need to check for VMAs created by other
+        * mmap() calls that overlap with our proposed mapping
         * - we can only share with an exact match on most regular files
         * - shared mappings on character devices and memory backed files are
         *   permitted to overlap inexactly as far as we are concerned for in
@@ -537,19 +833,20 @@ unsigned long do_mmap_pgoff(struct file *file,
                                continue;
 
                        /* search for overlapping mappings on the same file */
-                       if (vma->vm_file->f_dentry->d_inode != file->f_dentry->d_inode)
+                       if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode)
                                continue;
 
                        if (vma->vm_pgoff >= pgoff + pglen)
                                continue;
 
-                       vmpglen = (vma->vm_end - vma->vm_start + PAGE_SIZE - 1) >> PAGE_SHIFT;
+                       vmpglen = vma->vm_end - vma->vm_start + PAGE_SIZE - 1;
+                       vmpglen >>= PAGE_SHIFT;
                        if (pgoff >= vma->vm_pgoff + vmpglen)
                                continue;
 
-                       /* handle inexact matches between mappings */
-                       if (vmpglen != pglen || vma->vm_pgoff != pgoff) {
-                               if (!membacked)
+                       /* handle inexactly overlapping matches between mappings */
+                       if (vma->vm_pgoff != pgoff || vmpglen != pglen) {
+                               if (!(capabilities & BDI_CAP_MAP_DIRECT))
                                        goto sharing_violation;
                                continue;
                        }
@@ -561,30 +858,38 @@ unsigned long do_mmap_pgoff(struct file *file,
                        result = (void *) vma->vm_start;
                        goto shared;
                }
-       }
 
-       vma = NULL;
+               vma = NULL;
 
-       /* obtain the address to map to. we verify (or select) it and ensure
-        * that it represents a valid section of the address space
-        * - this is the hook for quasi-memory character devices
-        */
-       if (file && file->f_op->get_unmapped_area) {
-               addr = file->f_op->get_unmapped_area(file, addr, len, pgoff, flags);
-               if (IS_ERR((void *) addr)) {
-                       ret = addr;
-                       if (ret == (unsigned long) -ENOSYS)
+               /* obtain the address at which to make a shared mapping
+                * - this is the hook for quasi-memory character devices to
+                *   tell us the location of a shared mapping
+                */
+               if (file && file->f_op->get_unmapped_area) {
+                       addr = file->f_op->get_unmapped_area(file, addr, len,
+                                                            pgoff, flags);
+                       if (IS_ERR((void *) addr)) {
+                               ret = addr;
+                               if (ret != (unsigned long) -ENOSYS)
+                                       goto error;
+
+                               /* the driver refused to tell us where to site
+                                * the mapping so we'll have to attempt to copy
+                                * it */
                                ret = (unsigned long) -ENODEV;
-                       goto error;
+                               if (!(capabilities & BDI_CAP_MAP_COPY))
+                                       goto error;
+
+                               capabilities &= ~BDI_CAP_MAP_DIRECT;
+                       }
                }
        }
 
        /* we're going to need a VMA struct as well */
-       vma = kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
+       vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
        if (!vma)
                goto error_getting_vma;
 
-       memset(vma, 0, sizeof(*vma));
        INIT_LIST_HEAD(&vma->anon_vma_node);
        atomic_set(&vma->vm_usage, 1);
        if (file)
@@ -597,96 +902,18 @@ unsigned long do_mmap_pgoff(struct file *file,
 
        vml->vma = vma;
 
-       /* determine the object being mapped and call the appropriate specific
-        * mapper.
-        */
-       if (file) {
-#ifdef MAGIC_ROM_PTR
-               /* First, try simpler routine designed to give us a ROM pointer. */
-               if (file->f_op->romptr && !(prot & PROT_WRITE)) {
-                       ret = file->f_op->romptr(file, vma);
-#ifdef DEBUG
-                       printk("romptr mmap returned %d (st=%lx)\n",
-                              ret, vma->vm_start);
-#endif
-                       result = (void *) vma->vm_start;
-                       if (!ret)
-                               goto done;
-                       else if (ret != -ENOSYS)
-                               goto error;
-               } else
-#endif /* MAGIC_ROM_PTR */
-               /* Then try full mmap routine, which might return a RAM
-                * pointer, or do something truly complicated
-                */
-               if (file->f_op->mmap) {
-                       ret = file->f_op->mmap(file, vma);
-
-#ifdef DEBUG
-                       printk("f_op->mmap() returned %d (st=%lx)\n",
-                              ret, vma->vm_start);
-#endif
-                       result = (void *) vma->vm_start;
-                       if (!ret)
-                               goto done;
-                       else if (ret != -ENOSYS)
-                               goto error;
-               } else {
-                       ret = -ENODEV; /* No mapping operations defined */
-                       goto error;
-               }
-
-               /* An ENOSYS error indicates that mmap isn't possible (as
-                * opposed to tried but failed) so we'll fall through to the
-                * copy. */
-       }
-
-       /* allocate some memory to hold the mapping
-        * - note that this may not return a page-aligned address if the object
-        *   we're allocating is smaller than a page
-        */
-       ret = -ENOMEM;
-       result = kmalloc(len, GFP_KERNEL);
-       if (!result) {
-               printk("Allocation of length %lu from process %d failed\n",
-                      len, current->pid);
-               show_free_areas();
+       /* set up the mapping */
+       if (file && vma->vm_flags & VM_SHARED)
+               ret = do_mmap_shared_file(vma, len);
+       else
+               ret = do_mmap_private(vma, len);
+       if (ret < 0)
                goto error;
-       }
 
-       vma->vm_start = (unsigned long) result;
-       vma->vm_end = vma->vm_start + len;
-
-#ifdef WARN_ON_SLACK
-       if (len + WARN_ON_SLACK <= kobjsize(result))
-               printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n",
-                      len, current->pid, kobjsize(result) - len);
-#endif
-
-       if (file) {
-               mm_segment_t old_fs = get_fs();
-               loff_t fpos;
-
-               fpos = pgoff;
-               fpos <<= PAGE_SHIFT;
-
-               set_fs(KERNEL_DS);
-               ret = file->f_op->read(file, (char *) result, len, &fpos);
-               set_fs(old_fs);
-
-               if (ret < 0)
-                       goto error2;
-               if (ret < len)
-                       memset(result + ret, 0, len - ret);
-       } else {
-               memset(result, 0, len);
-       }
-
-       if (prot & PROT_EXEC)
-               flush_icache_range((unsigned long) result, (unsigned long) result + len);
+       /* okay... we have a mapping; now we have to register it */
+       result = (void *) vma->vm_start;
 
- done:
-       if (!(vma->vm_flags & VM_SHARED)) {
+       if (vma->vm_flags & VM_MAPPED_COPY) {
                realalloc += kobjsize(result);
                askedalloc += len;
        }
@@ -694,19 +921,22 @@ unsigned long do_mmap_pgoff(struct file *file,
        realalloc += kobjsize(vma);
        askedalloc += sizeof(*vma);
 
-       // current->mm->total_vm += len >> PAGE_SHIFT;
        vx_vmpages_add(current->mm, len >> PAGE_SHIFT);
 
        add_nommu_vma(vma);
+
  shared:
        realalloc += kobjsize(vml);
        askedalloc += sizeof(*vml);
 
-       vml->next = current->mm->context.vmlist;
-       current->mm->context.vmlist = vml;
+       add_vma_to_mm(current->mm, vml);
 
        up_write(&nommu_vma_sem);
 
+       if (prot & PROT_EXEC)
+               flush_icache_range((unsigned long) result,
+                                  (unsigned long) result + len);
+
 #ifdef DEBUG
        printk("do_mmap:\n");
        show_process_blocks();
@@ -714,13 +944,12 @@ unsigned long do_mmap_pgoff(struct file *file,
 
        return (unsigned long) result;
 
- error2:
-       kfree(result);
  error:
        up_write(&nommu_vma_sem);
        kfree(vml);
        if (vma) {
-               fput(vma->vm_file);
+               if (vma->vm_file)
+                       fput(vma->vm_file);
                kfree(vma);
        }
        return ret;
@@ -734,7 +963,7 @@ unsigned long do_mmap_pgoff(struct file *file,
  error_getting_vma:
        up_write(&nommu_vma_sem);
        kfree(vml);
-       printk("Allocation of vml for %lu byte allocation from process %d failed\n",
+       printk("Allocation of vma for %lu byte allocation from process %d failed\n",
               len, current->pid);
        show_free_areas();
        return -ENOMEM;
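
Seen from userspace, the effect of validate_mmap_request() and do_mmap_private() is that a private file mapping on a !MMU kernel is serviced by reading the file into a freshly kmalloc()ed block: MAP_FIXED and address hints are rejected, and the returned address need not be page aligned for objects smaller than a page. A hedged sketch of a mapping this path can satisfy (the file name is whatever small regular file the caller supplies):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

int main(int argc, char **argv)
{
	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}

	int fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* private read-only file mapping: on nommu the kernel copies the
	 * file contents into allocator memory (VM_MAPPED_COPY) and returns
	 * a pointer to that copy; passing MAP_FIXED here would get -EINVAL */
	void *p = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	printf("mapped at %p\n", p);
	munmap(p, 4096);
	close(fd);
	return 0;
}
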
@@ -762,7 +991,7 @@ static void put_vma(struct vm_area_struct *vma)
 
                        /* IO memory and memory shared directly out of the pagecache from
                         * ramfs/tmpfs mustn't be released here */
-                       if (!(vma->vm_flags & (VM_IO | VM_SHARED)) && vma->vm_start) {
+                       if (vma->vm_flags & VM_MAPPED_COPY) {
                                realalloc -= kobjsize((void *) vma->vm_start);
                                askedalloc -= vma->vm_end - vma->vm_start;
                                kfree((void *) vma->vm_start);
@@ -780,26 +1009,27 @@ static void put_vma(struct vm_area_struct *vma)
        }
 }
 
+/*
+ * release a mapping
+ * - under NOMMU conditions the parameters must exactly match the mapping to be
+ *   removed
+ */
 int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
 {
        struct vm_list_struct *vml, **parent;
        unsigned long end = addr + len;
 
-#ifdef MAGIC_ROM_PTR
-       /* For efficiency's sake, if the pointer is obviously in ROM,
-          don't bother walking the lists to free it */
-       if (is_in_rom(addr))
-               return 0;
-#endif
-
 #ifdef DEBUG
        printk("do_munmap:\n");
 #endif
 
-       for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next)
+       for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) {
+               if ((*parent)->vma->vm_start > addr)
+                       break;
                if ((*parent)->vma->vm_start == addr &&
-                   (*parent)->vma->vm_end == end)
+                   ((len == 0) || ((*parent)->vma->vm_end == end)))
                        goto found;
+       }
 
        printk("munmap of non-mmaped memory by process %d (%s): %p\n",
               current->pid, current->comm, (void *) addr);
@@ -814,7 +1044,8 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
        realalloc -= kobjsize(vml);
        askedalloc -= sizeof(*vml);
        kfree(vml);
-       // mm->total_vm -= len >> PAGE_SHIFT;
+
+       update_hiwater_vm(mm);
        vx_vmpages_sub(mm, len >> PAGE_SHIFT);
 
 #ifdef DEBUG
@@ -824,7 +1055,20 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
        return 0;
 }
 
-/* Release all mmaps. */
+asmlinkage long sys_munmap(unsigned long addr, size_t len)
+{
+       int ret;
+       struct mm_struct *mm = current->mm;
+
+       down_write(&mm->mmap_sem);
+       ret = do_munmap(mm, addr, len);
+       up_write(&mm->mmap_sem);
+       return ret;
+}
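
do_munmap() above only removes a mapping whose start address (and, unless len is zero, end address) exactly matches an existing VMA; there is no splitting of mappings as on an MMU kernel, and a mismatch just produces the "munmap of non-mmaped memory" complaint. So the only portable userspace pattern is to hand munmap() exactly what mmap() returned:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 8192;

	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* must pass back the exact address and length of the whole mapping;
	 * munmap(p + 4096, 4096) would be refused by a nommu kernel */
	if (munmap(p, len) < 0) {
		perror("munmap");
		return 1;
	}
	return 0;
}
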
+
+/*
+ * Release all mappings
+ */
 void exit_mmap(struct mm_struct * mm)
 {
        struct vm_list_struct *tmp;
@@ -834,7 +1078,6 @@ void exit_mmap(struct mm_struct * mm)
                printk("Exit_mmap:\n");
 #endif
 
-               // mm->total_vm = 0;
                vx_vmpages_sub(mm, mm->total_vm);
 
                while ((tmp = mm->context.vmlist)) {
@@ -852,37 +1095,26 @@ void exit_mmap(struct mm_struct * mm)
        }
 }
 
-asmlinkage long sys_munmap(unsigned long addr, size_t len)
-{
-       int ret;
-       struct mm_struct *mm = current->mm;
-
-       down_write(&mm->mmap_sem);
-       ret = do_munmap(mm, addr, len);
-       up_write(&mm->mmap_sem);
-       return ret;
-}
-
 unsigned long do_brk(unsigned long addr, unsigned long len)
 {
        return -ENOMEM;
 }
 
 /*
- * Expand (or shrink) an existing mapping, potentially moving it at the
- * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
+ * expand (or shrink) an existing mapping, potentially moving it at the same
+ * time (controlled by the MREMAP_MAYMOVE flag and available VM space)
  *
- * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
- * This option implies MREMAP_MAYMOVE.
+ * under NOMMU conditions, we only permit changing a mapping's size, and only
+ * as long as it stays within the hole allocated by the kmalloc() call in
+ * do_mmap_pgoff() and the block is not shareable
  *
- * on uClinux, we only permit changing a mapping's size, and only as long as it stays within the
- * hole allocated by the kmalloc() call in do_mmap_pgoff() and the block is not shareable
+ * MREMAP_FIXED is not supported under NOMMU conditions
  */
 unsigned long do_mremap(unsigned long addr,
                        unsigned long old_len, unsigned long new_len,
                        unsigned long flags, unsigned long new_addr)
 {
-       struct vm_list_struct *vml = NULL;
+       struct vm_area_struct *vma;
 
        /* insanity checks first */
        if (new_len == 0)
@@ -891,53 +1123,42 @@ unsigned long do_mremap(unsigned long addr,
        if (flags & MREMAP_FIXED && new_addr != addr)
                return (unsigned long) -EINVAL;
 
-       for (vml = current->mm->context.vmlist; vml; vml = vml->next)
-               if (vml->vma->vm_start == addr)
-                       goto found;
-
-       return (unsigned long) -EINVAL;
+       vma = find_vma_exact(current->mm, addr);
+       if (!vma)
+               return (unsigned long) -EINVAL;
 
- found:
-       if (vml->vma->vm_end != vml->vma->vm_start + old_len)
+       if (vma->vm_end != vma->vm_start + old_len)
                return (unsigned long) -EFAULT;
 
-       if (vml->vma->vm_flags & VM_MAYSHARE)
+       if (vma->vm_flags & VM_MAYSHARE)
                return (unsigned long) -EPERM;
 
        if (new_len > kobjsize((void *) addr))
                return (unsigned long) -ENOMEM;
 
        /* all checks complete - do it */
-       vml->vma->vm_end = vml->vma->vm_start + new_len;
+       vma->vm_end = vma->vm_start + new_len;
 
        askedalloc -= old_len;
        askedalloc += new_len;
 
-       return vml->vma->vm_start;
+       return vma->vm_start;
 }
 
-/*
- * Look up the first VMA which satisfies  addr < vm_end,  NULL if none
- */
-struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+asmlinkage unsigned long sys_mremap(unsigned long addr,
+       unsigned long old_len, unsigned long new_len,
+       unsigned long flags, unsigned long new_addr)
 {
-       struct vm_list_struct *vml;
-
-       for (vml = mm->context.vmlist; vml; vml = vml->next)
-               if (addr >= vml->vma->vm_start && addr < vml->vma->vm_end)
-                       return vml->vma;
-
-       return NULL;
-}
+       unsigned long ret;
 
-EXPORT_SYMBOL(find_vma);
-
-struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write)
-{
-       return NULL;
+       down_write(&current->mm->mmap_sem);
+       ret = do_mremap(addr, old_len, new_len, flags, new_addr);
+       up_write(&current->mm->mmap_sem);
+       return ret;
 }
 
-struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+                       unsigned int foll_flags)
 {
        return NULL;
 }
@@ -945,8 +1166,10 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
 int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
                unsigned long to, unsigned long size, pgprot_t prot)
 {
-       return -EPERM;
+       vma->vm_start = vma->vm_pgoff << PAGE_SHIFT;
+       return 0;
 }
+EXPORT_SYMBOL(remap_pfn_range);
 
 void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 {
@@ -958,20 +1181,8 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
        return -ENOMEM;
 }
 
-void arch_unmap_area(struct vm_area_struct *area)
-{
-}
-
-void update_mem_hiwater(void)
+void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
 {
-       struct task_struct *tsk = current;
-
-       if (likely(tsk->mm)) {
-               if (tsk->mm->hiwater_rss < tsk->mm->rss)
-                       tsk->mm->hiwater_rss = tsk->mm->rss;
-               if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
-                       tsk->mm->hiwater_vm = tsk->mm->total_vm;
-       }
 }
 
 void unmap_mapping_range(struct address_space *mapping,
@@ -979,6 +1190,7 @@ void unmap_mapping_range(struct address_space *mapping,
                         int even_cows)
 {
 }
+EXPORT_SYMBOL(unmap_mapping_range);
 
 /*
  * Check that a process has enough memory to allocate a new virtual
@@ -1011,7 +1223,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
        if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
                unsigned long n;
 
-               free = get_page_cache_size();
+               free = global_page_state(NR_FILE_PAGES);
                free += nr_swap_pages;
 
                /*
@@ -1020,7 +1232,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
                 * which are reclaimable, under pressure.  The dentry
                 * cache and most inode caches should fall into this
                 */
-               free += atomic_read(&slab_reclaim_pages);
+               free += global_page_state(NR_SLAB_RECLAIMABLE);
 
                /*
                 * Leave the last 3% for root
@@ -1036,14 +1248,26 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
                 * only call if we're about to fail.
                 */
                n = nr_free_pages();
+
+               /*
+                * Leave out the reserved pages; they cannot be used for anonymous pages.
+                */
+               if (n <= totalreserve_pages)
+                       goto error;
+               else
+                       n -= totalreserve_pages;
+
+               /*
+                * Leave the last 3% for root
+                */
                if (!cap_sys_admin)
                        n -= n / 32;
                free += n;
 
                if (free > pages)
                        return 0;
-               vm_unacct_memory(pages);
-               return -ENOMEM;
+
+               goto error;
        }
 
        allowed = totalram_pages * sysctl_overcommit_ratio / 100;
@@ -1058,11 +1282,67 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
           leave 3% of the size of this process for other processes */
        allowed -= current->mm->total_vm / 32;
 
-       if (atomic_read(&vm_committed_space) < allowed)
+       /*
+        * cast `allowed' as a signed long because vm_committed_space
+        * sometimes has a negative value
+        */
+       if (atomic_read(&vm_committed_space) < (long)allowed)
                return 0;
-
+error:
        vm_unacct_memory(pages);
 
        return -ENOMEM;
 }
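
As a worked example of the commit limit computed at the end of __vm_enough_memory() (the swap and CAP_SYS_ADMIN adjustments sit in lines elided between the hunks above; all figures below are purely illustrative):

#include <stdio.h>

int main(void)
{
	/* illustrative figures: 64 MiB of 4 KiB pages, no swap, ratio = 50 */
	long totalram_pages   = 16384;
	long total_swap_pages = 0;
	long overcommit_ratio = 50;
	long total_vm         = 3200;	/* pages already mapped by this process */

	long allowed = totalram_pages * overcommit_ratio / 100;	/* 8192 */
	allowed -= allowed / 32;	/* not CAP_SYS_ADMIN: keep 3% for root -> 7936 */
	allowed += total_swap_pages;	/* still 7936 */
	allowed -= total_vm / 32;	/* leave 3% of this process for others -> 7836 */

	printf("commit limit: %ld pages\n", allowed);
	return 0;
}
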
 
+int in_gate_area_no_task(unsigned long addr)
+{
+       return 0;
+}
+
+struct page *filemap_nopage(struct vm_area_struct *area,
+                       unsigned long address, int *type)
+{
+       BUG();
+       return NULL;
+}
+
+/*
+ * Access another process' address space.
+ * - source/target buffer must be kernel space
+ */
+int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+{
+       struct vm_area_struct *vma;
+       struct mm_struct *mm;
+
+       if (addr + len < addr)
+               return 0;
+
+       mm = get_task_mm(tsk);
+       if (!mm)
+               return 0;
+
+       down_read(&mm->mmap_sem);
+
+       /* the access must start within one of the target process's mappings */
+       vma = find_vma(mm, addr);
+       if (vma) {
+               /* don't overrun this mapping */
+               if (addr + len >= vma->vm_end)
+                       len = vma->vm_end - addr;
+
+               /* only read or write mappings where it is permitted */
+               if (write && vma->vm_flags & VM_MAYWRITE)
+                       len -= copy_to_user((void *) addr, buf, len);
+               else if (!write && vma->vm_flags & VM_MAYREAD)
+                       len -= copy_from_user(buf, (void *) addr, len);
+               else
+                       len = 0;
+       } else {
+               len = 0;
+       }
+
+       up_read(&mm->mmap_sem);
+       mmput(mm);
+       return len;
+}
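
access_process_vm() is the backend for ptrace()'s reads and writes of another task's memory, so on !MMU a tracer can only reach addresses that fall inside one of the tracee's VMAs, and only if the VMA carries VM_MAYREAD or VM_MAYWRITE as appropriate. A small tracer sketch (pid and address are supplied by the caller) showing the path that ends up here:

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(int argc, char **argv)
{
	if (argc < 3) {
		fprintf(stderr, "usage: %s <pid> <hex-address>\n", argv[0]);
		return 1;
	}
	pid_t pid = (pid_t) strtol(argv[1], NULL, 0);
	unsigned long addr = strtoul(argv[2], NULL, 16);

	if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) < 0) {
		perror("ptrace attach");
		return 1;
	}
	waitpid(pid, NULL, 0);

	/* PTRACE_PEEKDATA reaches the tracee's memory via access_process_vm();
	 * an address outside all of its VMAs simply fails */
	errno = 0;
	long word = ptrace(PTRACE_PEEKDATA, pid, (void *) addr, NULL);
	if (errno)
		perror("peekdata");
	else
		printf("%#lx: %#lx\n", addr, (unsigned long) word);

	ptrace(PTRACE_DETACH, pid, NULL, NULL);
	return 0;
}
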