patch-2_6_7-vs1_9_1_12
[linux-2.6.git] / mm / swapfile.c
index 321b09c..fce4cd4 100644 (file)
@@ -7,6 +7,7 @@
 
 #include <linux/config.h>
 #include <linux/mm.h>
+#include <linux/hugetlb.h>
 #include <linux/mman.h>
 #include <linux/slab.h>
 #include <linux/kernel_stat.h>
@@ -28,6 +29,7 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
+#include <linux/vs_base.h>
 
 spinlock_t swaplock = SPIN_LOCK_UNLOCKED;
 unsigned int nr_swapfiles;
@@ -48,68 +50,38 @@ struct swap_info_struct swap_info[MAX_SWAPFILES];
 static DECLARE_MUTEX(swapon_sem);
 
 /*
- * Array of backing blockdevs, for swap_unplug_fn.  We need this because the
- * bdev->unplug_fn can sleep and we cannot hold swap_list_lock while calling
- * the unplug_fn.  And swap_list_lock cannot be turned into a semaphore.
+ * We need this because the bdev->unplug_fn can sleep and we cannot
+ * hold swap_list_lock while calling the unplug_fn. And swap_list_lock
+ * cannot be turned into a semaphore.
  */
-static DECLARE_MUTEX(swap_bdevs_sem);
-static struct block_device *swap_bdevs[MAX_SWAPFILES];
+static DECLARE_RWSEM(swap_unplug_sem);
 
 #define SWAPFILE_CLUSTER 256
 
-/*
- * Caller holds swap_bdevs_sem
- */
-static void install_swap_bdev(struct block_device *bdev)
-{
-       int i;
-
-       for (i = 0; i < MAX_SWAPFILES; i++) {
-               if (swap_bdevs[i] == NULL) {
-                       swap_bdevs[i] = bdev;
-                       return;
-               }
-       }
-       BUG();
-}
-
-static void remove_swap_bdev(struct block_device *bdev)
-{
-       int i;
-
-       for (i = 0; i < MAX_SWAPFILES; i++) {
-               if (swap_bdevs[i] == bdev) {
-                       memcpy(&swap_bdevs[i], &swap_bdevs[i + 1],
-                               (MAX_SWAPFILES - i - 1) * sizeof(*swap_bdevs));
-                       swap_bdevs[MAX_SWAPFILES - 1] = NULL;
-                       return;
-               }
-       }
-       BUG();
-}
-
-/*
- * Unlike a standard unplug_io_fn, swap_unplug_io_fn is never called
- * through swap's backing_dev_info (which is only used by shrink_list),
- * but directly from sync_page when PageSwapCache: and takes the page
- * as argument, so that it can find the right device from swp_entry_t.
- */
-void swap_unplug_io_fn(struct page *page)
+void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
 {
        swp_entry_t entry;
 
-       down(&swap_bdevs_sem);
+       down_read(&swap_unplug_sem);
        entry.val = page->private;
        if (PageSwapCache(page)) {
-               struct block_device *bdev = swap_bdevs[swp_type(entry)];
+               struct block_device *bdev = swap_info[swp_type(entry)].bdev;
                struct backing_dev_info *bdi;
 
-               if (bdev) {
-                       bdi = bdev->bd_inode->i_mapping->backing_dev_info;
-                       (*bdi->unplug_io_fn)(bdi);
-               }
+               /*
+                * If the page is removed from swapcache from under us (with a
+                * racy try_to_unuse/swapoff) we need an additional reference
+                * count to avoid reading garbage from page->private above. If
+                * the WARN_ON triggers during a swapoff it maybe the race
+                * condition and it's harmless. However if it triggers without
+                * swapoff it signals a problem.
+                */
+               WARN_ON(page_count(page) <= 1);
+
+               bdi = bdev->bd_inode->i_mapping->backing_dev_info;
+               bdi->unplug_io_fn(bdi, page);
        }
-       up(&swap_bdevs_sem);
+       up_read(&swap_unplug_sem);
 }
 
 static inline int scan_swap_map(struct swap_info_struct *si)
@@ -319,10 +291,10 @@ static int exclusive_swap_page(struct page *page)
                /* Is the only swap cache user the cache itself? */
                if (p->swap_map[swp_offset(entry)] == 1) {
                        /* Recheck the page count with the swapcache lock held.. */
-                       spin_lock(&swapper_space.tree_lock);
+                       spin_lock_irq(&swapper_space.tree_lock);
                        if (page_count(page) == 2)
                                retval = 1;
-                       spin_unlock(&swapper_space.tree_lock);
+                       spin_unlock_irq(&swapper_space.tree_lock);
                }
                swap_info_put(p);
        }
@@ -390,13 +362,13 @@ int remove_exclusive_swap_page(struct page *page)
        retval = 0;
        if (p->swap_map[swp_offset(entry)] == 1) {
                /* Recheck the page count with the swapcache lock held.. */
-               spin_lock(&swapper_space.tree_lock);
+               spin_lock_irq(&swapper_space.tree_lock);
                if ((page_count(page) == 2) && !PageWriteback(page)) {
                        __delete_from_swap_cache(page);
                        SetPageDirty(page);
                        retval = 1;
                }
-               spin_unlock(&swapper_space.tree_lock);
+               spin_unlock_irq(&swapper_space.tree_lock);
        }
        swap_info_put(p);
 
@@ -420,12 +392,12 @@ void free_swap_and_cache(swp_entry_t entry)
        p = swap_info_get(entry);
        if (p) {
                if (swap_entry_free(p, swp_offset(entry)) == 1) {
-                       spin_lock(&swapper_space.tree_lock);
+                       spin_lock_irq(&swapper_space.tree_lock);
                        page = radix_tree_lookup(&swapper_space.page_tree,
                                entry.val);
                        if (page && TestSetPageLocked(page))
                                page = NULL;
-                       spin_unlock(&swapper_space.tree_lock);
+                       spin_unlock_irq(&swapper_space.tree_lock);
                }
                swap_info_put(p);
        }
@@ -457,20 +429,20 @@ void free_swap_and_cache(swp_entry_t entry)
 /* vma->vm_mm->page_table_lock is held */
 static void
 unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir,
-       swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
+       swp_entry_t entry, struct page *page)
 {
        // vma->vm_mm->rss++;
        vx_rsspages_inc(vma->vm_mm);
        get_page(page);
        set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
-       *pte_chainp = page_add_rmap(page, dir, *pte_chainp);
+       page_add_anon_rmap(page, vma, address);
        swap_free(entry);
 }
 
 /* vma->vm_mm->page_table_lock is held */
-static int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
+static unsigned long unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
        unsigned long address, unsigned long size, unsigned long offset,
-       swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
+       swp_entry_t entry, struct page *page)
 {
        pte_t * pte;
        unsigned long end;
@@ -495,10 +467,10 @@ static int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
                 * Test inline before going to call unuse_pte.
                 */
                if (unlikely(pte_same(*pte, swp_pte))) {
-                       unuse_pte(vma, offset + address, pte,
-                                       entry, page, pte_chainp);
+                       unuse_pte(vma, offset + address, pte, entry, page);
                        pte_unmap(pte);
-                       return 1;
+                       /* add 1 since address may be 0 */
+                       return 1 + offset + address;
                }
                address += PAGE_SIZE;
                pte++;
@@ -508,12 +480,13 @@ static int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
 }
 
 /* vma->vm_mm->page_table_lock is held */
-static int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
+static unsigned long unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
        unsigned long address, unsigned long size,
-       swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
+       swp_entry_t entry, struct page *page)
 {
        pmd_t * pmd;
        unsigned long offset, end;
+       unsigned long foundaddr;
 
        if (pgd_none(*dir))
                return 0;
@@ -531,9 +504,10 @@ static int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
        if (address >= end)
                BUG();
        do {
-               if (unuse_pmd(vma, pmd, address, end - address,
-                               offset, entry, page, pte_chainp))
-                       return 1;
+               foundaddr = unuse_pmd(vma, pmd, address, end - address,
+                                               offset, entry, page);
+               if (foundaddr)
+                       return foundaddr;
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address && (address < end));
@@ -541,17 +515,19 @@ static int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
 }
 
 /* vma->vm_mm->page_table_lock is held */
-static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
-       swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
+static unsigned long unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
+       swp_entry_t entry, struct page *page)
 {
        unsigned long start = vma->vm_start, end = vma->vm_end;
+       unsigned long foundaddr;
 
        if (start >= end)
                BUG();
        do {
-               if (unuse_pgd(vma, pgdir, start, end - start,
-                               entry, page, pte_chainp))
-                       return 1;
+               foundaddr = unuse_pgd(vma, pgdir, start, end - start,
+                                               entry, page);
+               if (foundaddr)
+                       return foundaddr;
                start = (start + PGDIR_SIZE) & PGDIR_MASK;
                pgdir++;
        } while (start && (start < end));
@@ -562,23 +538,27 @@ static int unuse_process(struct mm_struct * mm,
                        swp_entry_t entry, struct page* page)
 {
        struct vm_area_struct* vma;
-       struct pte_chain *pte_chain;
-
-       pte_chain = pte_chain_alloc(GFP_KERNEL);
-       if (!pte_chain)
-               return -ENOMEM;
+       unsigned long foundaddr = 0;
 
        /*
         * Go through process' page directory.
         */
+       down_read(&mm->mmap_sem);
        spin_lock(&mm->page_table_lock);
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
-               pgd_t * pgd = pgd_offset(mm, vma->vm_start);
-               if (unuse_vma(vma, pgd, entry, page, &pte_chain))
-                       break;
+               if (!is_vm_hugetlb_page(vma)) {
+                       pgd_t * pgd = pgd_offset(mm, vma->vm_start);
+                       foundaddr = unuse_vma(vma, pgd, entry, page);
+                       if (foundaddr)
+                               break;
+               }
        }
        spin_unlock(&mm->page_table_lock);
-       pte_chain_free(pte_chain);
+       up_read(&mm->mmap_sem);
+       /*
+        * Currently unuse_process cannot fail, but leave error handling
+        * at call sites for now, since we change it from time to time.
+        */
        return 0;
 }
 
@@ -680,7 +660,7 @@ static int try_to_unuse(unsigned int type)
                 */
                swap_map = &si->swap_map[i];
                entry = swp_entry(type, i);
-               page = read_swap_cache_async(entry);
+               page = read_swap_cache_async(entry, NULL, 0);
                if (!page) {
                        /*
                         * Either swap_duplicate() failed because entry
@@ -1144,6 +1124,11 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
        current->flags |= PF_SWAPOFF;
        err = try_to_unuse(type);
        current->flags &= ~PF_SWAPOFF;
+
+       /* wait for any unplug function to finish */
+       down_write(&swap_unplug_sem);
+       up_write(&swap_unplug_sem);
+
        if (err) {
                /* re-insert swap space back into swap_list */
                swap_list_lock();
@@ -1162,7 +1147,6 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
                goto out_dput;
        }
        down(&swapon_sem);
-       down(&swap_bdevs_sem);
        swap_list_lock();
        swap_device_lock(p);
        swap_file = p->swap_file;
@@ -1174,8 +1158,6 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
        destroy_swap_extents(p);
        swap_device_unlock(p);
        swap_list_unlock();
-       remove_swap_bdev(p->bdev);
-       up(&swap_bdevs_sem);
        up(&swapon_sem);
        vfree(swap_map);
        if (S_ISBLK(mapping->host->i_mode)) {
@@ -1519,7 +1501,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
                goto bad_swap;
 
        down(&swapon_sem);
-       down(&swap_bdevs_sem);
        swap_list_lock();
        swap_device_lock(p);
        p->flags = SWP_ACTIVE;
@@ -1545,8 +1526,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
        }
        swap_device_unlock(p);
        swap_list_unlock();
-       install_swap_bdev(p->bdev);
-       up(&swap_bdevs_sem);
        up(&swapon_sem);
        error = 0;
        goto out;