vserver 2.0-rc4

[linux-2.6.git] / mm / swapfile.c
diff --git a/mm/swapfile.c b/mm/swapfile.c

index fce4cd4..2519963 100644 (file)
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -24,16 +24,18 @@
  #include <linux/module.h>
  #include <linux/rmap.h>
  #include <linux/security.h>
+#include <linux/acct.h>
  #include <linux/backing-dev.h>
+#include <linux/syscalls.h>
  
  #include <asm/pgtable.h>
  #include <asm/tlbflush.h>
  #include <linux/swapops.h>
-#include <linux/vs_base.h>
+#include <linux/vs_memory.h>
  
-spinlock_t swaplock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(swaplock);
  unsigned int nr_swapfiles;
-int total_swap_pages;
+long total_swap_pages;
  static int swap_overflow;
  
  EXPORT_SYMBOL(total_swap_pages);
@@ -111,7 +113,7 @@ static inline int scan_swap_map(struct swap_info_struct *si)
   check_next_cluster:
         if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
         {
-               int nr;
+               unsigned long nr;
                 for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
                         if (si->swap_map[nr])
                         {
@@ -431,21 +433,21 @@ static void
  unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir,
         swp_entry_t entry, struct page *page)
  {
-       // vma->vm_mm->rss++;
         vx_rsspages_inc(vma->vm_mm);
         get_page(page);
         set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
         page_add_anon_rmap(page, vma, address);
         swap_free(entry);
+       acct_update_integrals();
+       update_mem_hiwater();
  }
  
  /* vma->vm_mm->page_table_lock is held */
-static unsigned long unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
-       unsigned long address, unsigned long size, unsigned long offset,
+static unsigned long unuse_pmd(struct vm_area_struct *vma, pmd_t *dir,
+       unsigned long address, unsigned long end,
         swp_entry_t entry, struct page *page)
  {
-       pte_t * pte;
-       unsigned long end;
+       pte_t *pte;
         pte_t swp_pte = swp_entry_to_pte(entry);
  
         if (pmd_none(*dir))
@@ -456,81 +458,120 @@ static unsigned long unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
                 return 0;
         }
         pte = pte_offset_map(dir, address);
-       offset += address & PMD_MASK;
-       address &= ~PMD_MASK;
-       end = address + size;
-       if (end > PMD_SIZE)
-               end = PMD_SIZE;
         do {
                 /*
                  * swapoff spends a _lot_ of time in this loop!
                  * Test inline before going to call unuse_pte.
                  */
                 if (unlikely(pte_same(*pte, swp_pte))) {
-                       unuse_pte(vma, offset + address, pte, entry, page);
+                       unuse_pte(vma, address, pte, entry, page);
                         pte_unmap(pte);
+
+                       /*
+                        * Move the page to the active list so it is not
+                        * immediately swapped out again after swapon.
+                        */
+                       activate_page(page);
+
                         /* add 1 since address may be 0 */
-                       return 1 + offset + address;
+                       return 1 + address;
                 }
                 address += PAGE_SIZE;
                 pte++;
-       } while (address && (address < end));
+       } while (address < end);
         pte_unmap(pte - 1);
         return 0;
  }
  
  /* vma->vm_mm->page_table_lock is held */
-static unsigned long unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
-       unsigned long address, unsigned long size,
+static unsigned long unuse_pud(struct vm_area_struct *vma, pud_t *pud,
+        unsigned long address, unsigned long end,
         swp_entry_t entry, struct page *page)
  {
-       pmd_t * pmd;
-       unsigned long offset, end;
+       pmd_t *pmd;
+       unsigned long next;
         unsigned long foundaddr;
  
-       if (pgd_none(*dir))
+       if (pud_none(*pud))
                 return 0;
-       if (pgd_bad(*dir)) {
-               pgd_ERROR(*dir);
-               pgd_clear(dir);
+       if (pud_bad(*pud)) {
+               pud_ERROR(*pud);
+               pud_clear(pud);
                 return 0;
         }
-       pmd = pmd_offset(dir, address);
-       offset = address & PGDIR_MASK;
-       address &= ~PGDIR_MASK;
-       end = address + size;
-       if (end > PGDIR_SIZE)
-               end = PGDIR_SIZE;
-       if (address >= end)
-               BUG();
+       pmd = pmd_offset(pud, address);
         do {
-               foundaddr = unuse_pmd(vma, pmd, address, end - address,
-                                               offset, entry, page);
+               next = (address + PMD_SIZE) & PMD_MASK;
+               if (next > end || !next)
+                       next = end;
+               foundaddr = unuse_pmd(vma, pmd, address, next, entry, page);
                 if (foundaddr)
                         return foundaddr;
-               address = (address + PMD_SIZE) & PMD_MASK;
+               address = next;
                 pmd++;
-       } while (address && (address < end));
+       } while (address < end);
         return 0;
  }
  
  /* vma->vm_mm->page_table_lock is held */
-static unsigned long unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
+static unsigned long unuse_pgd(struct vm_area_struct *vma, pgd_t *pgd,
+       unsigned long address, unsigned long end,
         swp_entry_t entry, struct page *page)
  {
-       unsigned long start = vma->vm_start, end = vma->vm_end;
+       pud_t *pud;
+       unsigned long next;
         unsigned long foundaddr;
  
-       if (start >= end)
-               BUG();
+       if (pgd_none(*pgd))
+               return 0;
+       if (pgd_bad(*pgd)) {
+               pgd_ERROR(*pgd);
+               pgd_clear(pgd);
+               return 0;
+       }
+       pud = pud_offset(pgd, address);
+       do {
+               next = (address + PUD_SIZE) & PUD_MASK;
+               if (next > end || !next)
+                       next = end;
+               foundaddr = unuse_pud(vma, pud, address, next, entry, page);
+               if (foundaddr)
+                       return foundaddr;
+               address = next;
+               pud++;
+       } while (address < end);
+       return 0;
+}
+
+/* vma->vm_mm->page_table_lock is held */
+static unsigned long unuse_vma(struct vm_area_struct *vma,
+       swp_entry_t entry, struct page *page)
+{
+       pgd_t *pgd;
+       unsigned long address, next, end;
+       unsigned long foundaddr;
+
+       if (page->mapping) {
+               address = page_address_in_vma(page, vma);
+               if (address == -EFAULT)
+                       return 0;
+               else
+                       end = address + PAGE_SIZE;
+       } else {
+               address = vma->vm_start;
+               end = vma->vm_end;
+       }
+       pgd = pgd_offset(vma->vm_mm, address);
         do {
-               foundaddr = unuse_pgd(vma, pgdir, start, end - start,
-                                               entry, page);
+               next = (address + PGDIR_SIZE) & PGDIR_MASK;
+               if (next > end || !next)
+                       next = end;
+               foundaddr = unuse_pgd(vma, pgd, address, next, entry, page);
                 if (foundaddr)
                         return foundaddr;
-               start = (start + PGDIR_SIZE) & PGDIR_MASK;
-               pgdir++;
-       } while (start && (start < end));
+               address = next;
+               pgd++;
+       } while (address < end);
         return 0;
  }
  
@@ -543,12 +584,19 @@ static int unuse_process(struct mm_struct * mm,
         /*
          * Go through process' page directory.
          */
-       down_read(&mm->mmap_sem);
+       if (!down_read_trylock(&mm->mmap_sem)) {
+               /*
+                * Our reference to the page stops try_to_unmap_one from
+                * unmapping its ptes, so swapoff can make progress.
+                */
+               unlock_page(page);
+               down_read(&mm->mmap_sem);
+               lock_page(page);
+       }
         spin_lock(&mm->page_table_lock);
         for (vma = mm->mmap; vma; vma = vma->vm_next) {
-               if (!is_vm_hugetlb_page(vma)) {
-                       pgd_t * pgd = pgd_offset(mm, vma->vm_start);
-                       foundaddr = unuse_vma(vma, pgd, entry, page);
+               if (vma->anon_vma) {
+                       foundaddr = unuse_vma(vma, entry, page);
                         if (foundaddr)
                                 break;
                 }
@@ -625,11 +673,12 @@ static int try_to_unuse(unsigned int type)
          *
          * A simpler strategy would be to start at the last mm we
          * freed the previous entry from; but that would take less
-        * advantage of mmlist ordering (now preserved by swap_out()),
-        * which clusters forked address spaces together, most recent
-        * child immediately after parent.  If we race with dup_mmap(),
-        * we very much want to resolve parent before child, otherwise
-        * we may miss some entries: using last mm would invert that.
+        * advantage of mmlist ordering, which clusters forked mms
+        * together, child after parent.  If we race with dup_mmap(), we
+        * prefer to resolve parent before child, lest we miss entries
+        * duplicated after we scanned child: using last mm would invert
+        * that.  Though it's only a serious concern when an overflowed
+        * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
          */
         start_mm = &init_mm;
         atomic_inc(&init_mm.mm_users);
@@ -637,17 +686,9 @@ static int try_to_unuse(unsigned int type)
         /*
          * Keep on scanning until all entries have gone.  Usually,
          * one pass through swap_map is enough, but not necessarily:
-        * mmput() removes mm from mmlist before exit_mmap() and its
-        * zap_page_range().  That's not too bad, those entries are
-        * on their way out, and handled faster there than here.
-        * do_munmap() behaves similarly, taking the range out of mm's
-        * vma list before zap_page_range().  But unfortunately, when
-        * unmapping a part of a vma, it takes the whole out first,
-        * then reinserts what's left after (might even reschedule if
-        * open() method called) - so swap entries may be invisible
-        * to swapoff for a while, then reappear - but that is rare.
+        * there are races when an instance of an entry might be missed.
          */
-       while ((i = find_next_to_unuse(si, i))) {
+       while ((i = find_next_to_unuse(si, i)) != 0) {
                 if (signal_pending(current)) {
                         retval = -EINTR;
                         break;
@@ -697,7 +738,7 @@ static int try_to_unuse(unsigned int type)
                 wait_on_page_writeback(page);
  
                 /*
-                * Remove all references to entry, without blocking.
+                * Remove all references to entry.
                  * Whenever we reach init_mm, there's no address space
                  * to search, but use it as a reminder to search shmem.
                  */
@@ -722,7 +763,10 @@ static int try_to_unuse(unsigned int type)
                         while (*swap_map > 1 && !retval &&
                                         (p = p->next) != &start_mm->mmlist) {
                                 mm = list_entry(p, struct mm_struct, mmlist);
-                               atomic_inc(&mm->mm_users);
+                               if (atomic_inc_return(&mm->mm_users) == 1) {
+                                       atomic_dec(&mm->mm_users);
+                                       continue;
+                               }
                                 spin_unlock(&mmlist_lock);
                                 mmput(prev_mm);
                                 prev_mm = mm;
@@ -835,6 +879,26 @@ static int try_to_unuse(unsigned int type)
         return retval;
  }
  
+/*
+ * After a successful try_to_unuse, if no swap is now in use, we know we
+ * can empty the mmlist.  swap_list_lock must be held on entry and exit.
+ * Note that mmlist_lock nests inside swap_list_lock, and an mm must be
+ * added to the mmlist just after page_duplicate - before would be racy.
+ */
+static void drain_mmlist(void)
+{
+       struct list_head *p, *next;
+       unsigned int i;
+
+       for (i = 0; i < nr_swapfiles; i++)
+               if (swap_info[i].inuse_pages)
+                       return;
+       spin_lock(&mmlist_lock);
+       list_for_each_safe(p, next, &init_mm.mmlist)
+               list_del_init(p);
+       spin_unlock(&mmlist_lock);
+}
+
  /*
   * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
   * corresponds to page offset `offset'.
@@ -1067,6 +1131,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
         unsigned short *swap_map;
         struct file *swap_file, *victim;
         struct address_space *mapping;
+       struct inode *inode;
         char * pathname;
         int i, type, prev;
         int err;
@@ -1079,7 +1144,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
         if (IS_ERR(pathname))
                 goto out;
  
-       victim = filp_open(pathname, O_RDWR, 0);
+       victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
         putname(pathname);
         err = PTR_ERR(victim);
         if (IS_ERR(victim))
@@ -1148,6 +1213,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
         }
         down(&swapon_sem);
         swap_list_lock();
+       drain_mmlist();
         swap_device_lock(p);
         swap_file = p->swap_file;
         p->swap_file = NULL;
@@ -1160,12 +1226,15 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
         swap_list_unlock();
         up(&swapon_sem);
         vfree(swap_map);
-       if (S_ISBLK(mapping->host->i_mode)) {
-               struct block_device *bdev = I_BDEV(mapping->host);
+       inode = mapping->host;
+       if (S_ISBLK(inode->i_mode)) {
+               struct block_device *bdev = I_BDEV(inode);
                 set_blocksize(bdev, p->old_block_size);
                 bd_release(bdev);
         } else {
-               up(&mapping->host->i_sem);
+               down(&inode->i_sem);
+               inode->i_flags &= ~S_SWAPFILE;
+               up(&inode->i_sem);
         }
         filp_close(swap_file, NULL);
         err = 0;
@@ -1284,7 +1353,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
         int i, prev;
         int error;
         static int least_priority;
-       union swap_header *swap_header = 0;
+       union swap_header *swap_header = NULL;
         int swap_header_version;
         int nr_good_pages = 0;
         unsigned long maxpages = 1;
@@ -1330,7 +1399,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
         p->highest_bit = 0;
         p->cluster_nr = 0;
         p->inuse_pages = 0;
-       p->sdev_lock = SPIN_LOCK_UNLOCKED;
+       spin_lock_init(&p->sdev_lock);
         p->next = -1;
         if (swap_flags & SWAP_FLAG_PREFER) {
                 p->prio =
@@ -1345,7 +1414,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
                 name = NULL;
                 goto bad_swap_2;
         }
-       swap_file = filp_open(name, O_RDWR, 0);
+       swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
         error = PTR_ERR(swap_file);
         if (IS_ERR(swap_file)) {
                 swap_file = NULL;
@@ -1383,6 +1452,10 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
                 p->bdev = inode->i_sb->s_bdev;
                 down(&inode->i_sem);
                 did_down = 1;
+               if (IS_SWAPFILE(inode)) {
+                       error = -EBUSY;
+                       goto bad_swap;
+               }
         } else {
                 goto bad_swap;
         }
@@ -1544,8 +1617,7 @@ bad_swap_2:
                 ++least_priority;
         swap_list_unlock();
         destroy_swap_extents(p);
-       if (swap_map)
-               vfree(swap_map);
+       vfree(swap_map);
         if (swap_file)
                 filp_close(swap_file, NULL);
  out:
@@ -1555,8 +1627,11 @@ out:
         }
         if (name)
                 putname(name);
-       if (error && did_down)
+       if (did_down) {
+               if (!error)
+                       inode->i_flags |= S_SWAPFILE;
                 up(&inode->i_sem);
+       }
         return error;
  }
  
@@ -1575,8 +1650,8 @@ void si_swapinfo(struct sysinfo *val)
         val->freeswap = nr_swap_pages + nr_to_be_unused;
         val->totalswap = total_swap_pages + nr_to_be_unused;
         swap_list_unlock();
-        if (vx_flags(VXF_VIRT_MEM, 0))
-                vx_vsi_swapinfo(val);
+       if (vx_flags(VXF_VIRT_MEM, 0))
+               vx_vsi_swapinfo(val);
  }
  
  /*