Patched to 2.6.10-1.14_FC2.
[linux-2.6.git] / mm / swapfile.c
index 6eca35a..1b4dae6 100644 (file)
 #include <linux/rmap.h>
 #include <linux/security.h>
 #include <linux/backing-dev.h>
+#include <linux/syscalls.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
-#include <linux/vs_base.h>
+#include <linux/vs_memory.h>
 
 spinlock_t swaplock = SPIN_LOCK_UNLOCKED;
 unsigned int nr_swapfiles;
@@ -111,7 +112,7 @@ static inline int scan_swap_map(struct swap_info_struct *si)
  check_next_cluster:
        if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
        {
-               int nr;
+               unsigned long nr;
                for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
                        if (si->swap_map[nr])
                        {
@@ -522,14 +523,24 @@ static unsigned long unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
 }
 
 /* vma->vm_mm->page_table_lock is held */
-static unsigned long unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
+static unsigned long unuse_vma(struct vm_area_struct * vma,
        swp_entry_t entry, struct page *page)
 {
-       unsigned long start = vma->vm_start, end = vma->vm_end;
+       pgd_t *pgdir;
+       unsigned long start, end;
        unsigned long foundaddr;
 
-       if (start >= end)
-               BUG();
+       if (page->mapping) {
+               start = page_address_in_vma(page, vma);
+               if (start == -EFAULT)
+                       return 0;
+               else
+                       end = start + PAGE_SIZE;
+       } else {
+               start = vma->vm_start;
+               end = vma->vm_end;
+       }
+       pgdir = pgd_offset(vma->vm_mm, start);
        do {
                foundaddr = unuse_pgd(vma, pgdir, start, end - start,
                                                entry, page);
@@ -550,12 +561,19 @@ static int unuse_process(struct mm_struct * mm,
        /*
         * Go through process' page directory.
         */
-       down_read(&mm->mmap_sem);
+       if (!down_read_trylock(&mm->mmap_sem)) {
+               /*
+                * Our reference to the page stops try_to_unmap_one from
+                * unmapping its ptes, so swapoff can make progress.
+                */
+               unlock_page(page);
+               down_read(&mm->mmap_sem);
+               lock_page(page);
+       }
        spin_lock(&mm->page_table_lock);
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
-               if (!is_vm_hugetlb_page(vma)) {
-                       pgd_t * pgd = pgd_offset(mm, vma->vm_start);
-                       foundaddr = unuse_vma(vma, pgd, entry, page);
+               if (vma->anon_vma) {
+                       foundaddr = unuse_vma(vma, entry, page);
                        if (foundaddr)
                                break;
                }
@@ -632,11 +650,12 @@ static int try_to_unuse(unsigned int type)
         *
         * A simpler strategy would be to start at the last mm we
         * freed the previous entry from; but that would take less
-        * advantage of mmlist ordering (now preserved by swap_out()),
-        * which clusters forked address spaces together, most recent
-        * child immediately after parent.  If we race with dup_mmap(),
-        * we very much want to resolve parent before child, otherwise
-        * we may miss some entries: using last mm would invert that.
+        * advantage of mmlist ordering, which clusters forked mms
+        * together, child after parent.  If we race with dup_mmap(), we
+        * prefer to resolve parent before child, lest we miss entries
+        * duplicated after we scanned child: using last mm would invert
+        * that.  Though it's only a serious concern when an overflowed
+        * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
         */
        start_mm = &init_mm;
        atomic_inc(&init_mm.mm_users);
@@ -644,15 +663,7 @@ static int try_to_unuse(unsigned int type)
        /*
         * Keep on scanning until all entries have gone.  Usually,
         * one pass through swap_map is enough, but not necessarily:
-        * mmput() removes mm from mmlist before exit_mmap() and its
-        * zap_page_range().  That's not too bad, those entries are
-        * on their way out, and handled faster there than here.
-        * do_munmap() behaves similarly, taking the range out of mm's
-        * vma list before zap_page_range().  But unfortunately, when
-        * unmapping a part of a vma, it takes the whole out first,
-        * then reinserts what's left after (might even reschedule if
-        * open() method called) - so swap entries may be invisible
-        * to swapoff for a while, then reappear - but that is rare.
+        * there are races when an instance of an entry might be missed.
         */
        while ((i = find_next_to_unuse(si, i)) != 0) {
                if (signal_pending(current)) {
@@ -704,7 +715,7 @@ static int try_to_unuse(unsigned int type)
                wait_on_page_writeback(page);
 
                /*
-                * Remove all references to entry, without blocking.
+                * Remove all references to entry.
                 * Whenever we reach init_mm, there's no address space
                 * to search, but use it as a reminder to search shmem.
                 */
@@ -729,7 +740,10 @@ static int try_to_unuse(unsigned int type)
                        while (*swap_map > 1 && !retval &&
                                        (p = p->next) != &start_mm->mmlist) {
                                mm = list_entry(p, struct mm_struct, mmlist);
-                               atomic_inc(&mm->mm_users);
+                               if (atomic_inc_return(&mm->mm_users) == 1) {
+                                       atomic_dec(&mm->mm_users);
+                                       continue;
+                               }
                                spin_unlock(&mmlist_lock);
                                mmput(prev_mm);
                                prev_mm = mm;
@@ -842,6 +856,26 @@ static int try_to_unuse(unsigned int type)
        return retval;
 }
 
+/*
+ * Empty init_mm's mmlist once no swap area has pages in use (as after a
+ * successful try_to_unuse).  swap_list_lock must be held on entry and
+ * exit; mmlist_lock nests inside it.  Note that an mm must be added to
+ * the mmlist just after page_duplicate - before would be racy.
+ */
+static void drain_mmlist(void)
+{
+       struct list_head *p, *next;
+       unsigned int i;
+
+       for (i = 0; i < nr_swapfiles; i++)
+               if (swap_info[i].inuse_pages)
+                       return;         /* some swap still in use: keep mmlist */
+       spin_lock(&mmlist_lock);
+       list_for_each_safe(p, next, &init_mm.mmlist)
+               list_del_init(p);       /* _safe walk: p is unlinked as we go */
+       spin_unlock(&mmlist_lock);
+}
+
 /*
  * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
  * corresponds to page offset `offset'.
@@ -1156,6 +1190,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
        }
        down(&swapon_sem);
        swap_list_lock();
+       drain_mmlist();
        swap_device_lock(p);
        swap_file = p->swap_file;
        p->swap_file = NULL;
@@ -1341,7 +1376,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
        p->highest_bit = 0;
        p->cluster_nr = 0;
        p->inuse_pages = 0;
-       p->sdev_lock = SPIN_LOCK_UNLOCKED;
+       spin_lock_init(&p->sdev_lock);
        p->next = -1;
        if (swap_flags & SWAP_FLAG_PREFER) {
                p->prio =
@@ -1593,8 +1628,8 @@ void si_swapinfo(struct sysinfo *val)
        val->freeswap = nr_swap_pages + nr_to_be_unused;
        val->totalswap = total_swap_pages + nr_to_be_unused;
        swap_list_unlock();
-        if (vx_flags(VXF_VIRT_MEM, 0))
-                vx_vsi_swapinfo(val);
+       if (vx_flags(VXF_VIRT_MEM, 0))
+               vx_vsi_swapinfo(val);
 }
 
 /*