#endif
unsigned long num_physpages;
+/*
+ * A number of key systems in x86 including ioremap() rely on the assumption
+ * that high_memory defines the upper bound on direct map memory, i.e. the end
+ * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
+ * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
+ * and ZONE_HIGHMEM.
+ */
void * high_memory;
struct page *highmem_start_page;
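+/*
+ * Amount of vmalloc address space set aside at early boot; its consumers
+ * live in arch code and are not part of this patch (meaning inferred from
+ * the name).
+ */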
+unsigned long vmalloc_earlyreserve;
EXPORT_SYMBOL(num_physpages);
EXPORT_SYMBOL(highmem_start_page);
EXPORT_SYMBOL(high_memory);
+EXPORT_SYMBOL(vmalloc_earlyreserve);
/*
* We special-case the C-O-W ZERO_PAGE, because it's such
pte_free_tlb(tlb, page);
}
-static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir)
+static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir,
+ int pgd_idx)
{
int j;
pmd_t * pmd;
}
pmd = pmd_offset(dir, 0);
pgd_clear(dir);
- for (j = 0; j < PTRS_PER_PMD ; j++)
+ for (j = 0; j < PTRS_PER_PMD ; j++) {
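+ /*
+ * Entries at or above TASK_SIZE are not user mappings and must not be
+ * freed here (rationale inferred from the surrounding 4G/4G changes).
+ */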
+ if (pgd_idx * PGDIR_SIZE + j * PMD_SIZE >= TASK_SIZE)
+ break;
free_one_pmd(tlb, pmd+j);
+ }
pmd_free_tlb(tlb, pmd);
}
void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr)
{
pgd_t * page_dir = tlb->mm->pgd;
+ int pgd_idx = first;
page_dir += first;
do {
- free_one_pgd(tlb, page_dir);
+ free_one_pgd(tlb, page_dir, pgd_idx);
page_dir++;
+ pgd_idx++;
} while (--nr);
}
struct page *page;
unsigned long pfn;
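+ /*
+ * Linux-VServer RSS accounting: refuse the copy if the destination mm
+ * has no pages left under its limit (vx_rsspages_avail() behaviour
+ * assumed from its name).
+ */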
+ if (!vx_rsspages_avail(dst, 1)) {
+ spin_unlock(&src->page_table_lock);
+ goto nomem;
+ }
/* copy_one_pte */
if (pte_none(pte))
pte = pte_mkclean(pte);
pte = pte_mkold(pte);
get_page(page);
- dst->rss++;
+ vx_rsspages_inc(dst);
set_pte(dst_pte, pte);
page_dup_rmap(page);
cont_copy_pte_range_noset:
set_pte(ptep, pgoff_to_pte(page->index));
if (pte_dirty(pte))
set_page_dirty(page);
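+ /*
+ * Propagate the hardware referenced bit only for file-backed pages;
+ * anonymous pages are left to normal LRU aging (intent inferred).
+ */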
- if (pte_young(pte) && page_mapping(page))
+ if (pte_young(pte) && !PageAnon(page))
mark_page_accessed(page);
tlb->freed++;
page_remove_rmap(page);
unsigned long size, struct zap_details *details)
{
pmd_t * pmd;
- unsigned long end;
+ unsigned long end, pgd_boundary;
if (pgd_none(*dir))
return;
}
pmd = pmd_offset(dir, address);
end = address + size;
- if (end > ((address + PGDIR_SIZE) & PGDIR_MASK))
- end = ((address + PGDIR_SIZE) & PGDIR_MASK);
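+ /*
+ * The pgd boundary computation wraps to zero for the topmost pgd;
+ * only clamp 'end' when it did not overflow.
+ */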
+ pgd_boundary = ((address + PGDIR_SIZE) & PGDIR_MASK);
+ if (pgd_boundary && (end > pgd_boundary))
+ end = pgd_boundary;
do {
zap_pte_range(tlb, pmd, address, end - address, details);
address = (address + PMD_SIZE) & PMD_MASK;
tlb_end_vma(tlb, vma);
}
+#ifdef CONFIG_PREEMPT_VOLUNTARY
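+/*
+ * Voluntary preemption: use a small unmap batch so rescheduling points
+ * are reached quickly (rationale inferred).
+ */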
+# define ZAP_BLOCK_SIZE (128 * PAGE_SIZE)
+#else
+
/* Dispose of an entire struct mmu_gather per rescheduling point */
#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
#define ZAP_BLOCK_SIZE (FREE_PTE_NR * PAGE_SIZE)
#else
#define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
#endif
+#endif
+
/**
* unmap_vmas - unmap a range of memory covered by a list of vma's
* @tlbp: address of the caller's struct mmu_gather
start += block;
zap_bytes -= block;
- if ((long)zap_bytes > 0)
- continue;
if (!atomic && need_resched()) {
int fullmm = tlb_is_full_mm(*tlbp);
tlb_finish_mmu(*tlbp, tlb_start, start);
*tlbp = tlb_gather_mmu(mm, fullmm);
tlb_start_valid = 0;
}
+ if ((long)zap_bytes > 0)
+ continue;
zap_bytes = ZAP_BLOCK_SIZE;
}
}
return NULL;
}
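+/*
+ * Like follow_page(), but also hands back the raw pfn so callers can cope
+ * with mappings that have no struct page behind them: returns the page for
+ * ordinary mappings, or NULL with *pfn_ptr set when the pte is present but
+ * its pfn is not valid. (Summary inferred from the code below; the callers
+ * are not part of this hunk.)
+ */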
+struct page *
+follow_page_pfn(struct mm_struct *mm, unsigned long address, int write,
+ unsigned long *pfn_ptr)
+{
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *ptep, pte;
+ unsigned long pfn;
+ struct page *page;
+
+ *pfn_ptr = 0;
+ page = follow_huge_addr(mm, address, write);
+ if (!IS_ERR(page))
+ return page;
+
+ pgd = pgd_offset(mm, address);
+ if (pgd_none(*pgd) || pgd_bad(*pgd))
+ goto out;
+
+ pmd = pmd_offset(pgd, address);
+ if (pmd_none(*pmd))
+ goto out;
+ if (pmd_huge(*pmd))
+ return follow_huge_pmd(mm, address, pmd, write);
+ if (pmd_bad(*pmd))
+ goto out;
+
+ ptep = pte_offset_map(pmd, address);
+ if (!ptep)
+ goto out;
+
+ pte = *ptep;
+ pte_unmap(ptep);
+ if (pte_present(pte)) {
+ if (write && !pte_write(pte))
+ goto out;
+ if (write && !pte_dirty(pte)) {
+ struct page *page = pte_page(pte);
+ if (!PageDirty(page))
+ set_page_dirty(page);
+ }
+ pfn = pte_pfn(pte);
+ if (pfn_valid(pfn)) {
+ struct page *page = pfn_to_page(pfn);
+
+ mark_page_accessed(page);
+ return page;
+ } else {
+ *pfn_ptr = pfn;
+ return NULL;
+ }
+ }
+
+out:
+ return NULL;
+}
+
+
/*
* Given a physical address, is there a useful struct page pointing to
* it? This may become more complex in the future if we start dealing
static inline struct page *get_page_map(struct page *page)
{
if (!pfn_valid(page_to_pfn(page)))
- return 0;
+ return NULL;
return page;
}
+#ifndef CONFIG_X86_4G
static inline int
untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
unsigned long address)
/* There is a pte slot for 'address' in 'mm'. */
return 0;
}
+#endif
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
pte_t *pte;
if (write) /* user gate pages are read-only */
return i ? : -EFAULT;
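+ /*
+ * Walk the gate area through the mm's own page tables and map the pte
+ * temporarily; with the 4G/4G split these entries need not sit in the
+ * kernel page tables (assumption).
+ */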
- pgd = pgd_offset_k(pg);
+ pgd = pgd_offset_gate(mm, pg);
if (!pgd)
return i ? : -EFAULT;
pmd = pmd_offset(pgd, pg);
if (!pmd)
return i ? : -EFAULT;
- pte = pte_offset_kernel(pmd, pg);
- if (!pte || !pte_present(*pte))
+ pte = pte_offset_map(pmd, pg);
+ if (!pte)
return i ? : -EFAULT;
+ if (!pte_present(*pte)) {
+ pte_unmap(pte);
+ return i ? : -EFAULT;
+ }
if (pages) {
pages[i] = pte_page(*pte);
get_page(pages[i]);
}
+ pte_unmap(pte);
if (vmas)
vmas[i] = gate_vma;
i++;
* insanely big anonymously mapped areas that
* nobody touched so far. This is important
* for doing a core dump for these mappings.
+ *
+ * disable this for 4:4 - it prevents
+ * follow_page() from ever seeing these pages.
+ *
+ * (The 'fix' is dubious anyway; nothing that
+ * this code avoids couldn't be triggered from
+ * userspace as well.)
*/
+#ifndef CONFIG_X86_4G
if (!lookup_write &&
untouched_anonymous_page(mm,vma,start)) {
map = ZERO_PAGE(start);
break;
}
+#endif
spin_unlock(&mm->page_table_lock);
switch (handle_mm_fault(mm,vma,start,write)) {
case VM_FAULT_MINOR:
/*
* Ok, we need to copy. Oh, well..
*/
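+ /*
+ * Reserved pages (such as the zero page) are not refcounted, so only
+ * take a reference on ordinary pages before dropping the lock.
+ */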
- page_cache_get(old_page);
+ if (!PageReserved(old_page))
+ page_cache_get(old_page);
spin_unlock(&mm->page_table_lock);
if (unlikely(anon_vma_prepare(vma)))
page_table = pte_offset_map(pmd, address);
if (likely(pte_same(*page_table, pte))) {
if (PageReserved(old_page))
- ++mm->rss;
+ vx_rsspages_inc(mm);
else
page_remove_rmap(old_page);
break_cow(vma, new_page, address, page_table);
if (inode->i_size < offset)
goto do_expand;
+ /*
+ * truncation of in-use swapfiles is disallowed - it would cause
+ * subsequent swapout to scribble on the now-freed blocks.
+ */
+ if (IS_SWAPFILE(inode))
+ goto out_busy;
i_size_write(inode, offset);
unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
truncate_inode_pages(mapping, offset);
if (limit != RLIM_INFINITY && offset > limit)
goto out_sig;
if (offset > inode->i_sb->s_maxbytes)
- goto out;
+ goto out_big;
i_size_write(inode, offset);
out_truncate:
return 0;
out_sig:
send_sig(SIGXFSZ, current, 0);
-out:
+out_big:
return -EFBIG;
+out_busy:
+ return -ETXTBSY;
}
EXPORT_SYMBOL(vmtruncate);
/* Had to read the page from swap area: Major fault */
ret = VM_FAULT_MAJOR;
inc_page_state(pgmajfault);
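+ /*
+ * Swap token (thrashing control): let the faulting task hold the token
+ * so its pages are less likely to be reclaimed while it swaps in.
+ */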
+ grab_swap_token();
}
+ if (!vx_rsspages_avail(mm, 1)) {
+ ret = VM_FAULT_OOM;
+ goto out;
+ }
mark_page_accessed(page);
lock_page(page);
if (vm_swap_full())
remove_exclusive_swap_page(page);
- mm->rss++;
+ vx_rsspages_inc(mm);
pte = mk_pte(page, vma->vm_page_prot);
if (write_access && can_share_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
if (unlikely(anon_vma_prepare(vma)))
goto no_mem;
+ if (!vx_rsspages_avail(mm, 1))
+ goto no_mem;
+
page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
if (!page)
goto no_mem;
spin_unlock(&mm->page_table_lock);
goto out;
}
- mm->rss++;
+ vx_rsspages_inc(mm);
entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
vma->vm_page_prot)),
vma);
return VM_FAULT_SIGBUS;
if (new_page == NOPAGE_OOM)
return VM_FAULT_OOM;
+ if (!vx_rsspages_avail(mm, 1))
+ return VM_FAULT_OOM;
/*
* Should we do an early C-O-W break?
* We need the page table lock to synchronize with kswapd
* and the SMP-safe atomic PTE updates.
*/
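+ /*
+ * Delay accounting: mark the task as blocked on memory I/O for the
+ * duration of the fault (PF_MEMIO semantics inferred).
+ */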
+ set_delay_flag(current, PF_MEMIO);
spin_lock(&mm->page_table_lock);
pmd = pmd_alloc(mm, pgd, address);
if (pmd) {
pte_t * pte = pte_alloc_map(mm, pmd, address);
- if (pte)
- return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
+ if (pte) {
+ int rc = handle_pte_fault(mm, vma, address, write_access, pte, pmd);
+ clear_delay_flag(current, PF_MEMIO);
+ return rc;
+ }
}
spin_unlock(&mm->page_table_lock);
+ clear_delay_flag(current, PF_MEMIO);
return VM_FAULT_OOM;
}
#ifdef AT_SYSINFO_EHDR
return &gate_vma;
#else
- return 0;
+ return NULL;
#endif
}