X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=mm%2Fswapfile.c;h=2519963f8b199f76ce936c6b15deb07aa4c793a7;hb=8e8ece46a861c84343256819eaec77e608ff9217;hp=fce4cd466ef47bd2b6902d912a5ca28fec3c3908;hpb=9213980e6a70d8473e0ffd4b39ab5b6caaba9ff5;p=linux-2.6.git diff --git a/mm/swapfile.c b/mm/swapfile.c index fce4cd466..2519963f8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -24,16 +24,18 @@ #include #include #include +#include #include +#include #include #include #include -#include +#include -spinlock_t swaplock = SPIN_LOCK_UNLOCKED; +DEFINE_SPINLOCK(swaplock); unsigned int nr_swapfiles; -int total_swap_pages; +long total_swap_pages; static int swap_overflow; EXPORT_SYMBOL(total_swap_pages); @@ -111,7 +113,7 @@ static inline int scan_swap_map(struct swap_info_struct *si) check_next_cluster: if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit) { - int nr; + unsigned long nr; for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++) if (si->swap_map[nr]) { @@ -431,21 +433,21 @@ static void unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir, swp_entry_t entry, struct page *page) { - // vma->vm_mm->rss++; vx_rsspages_inc(vma->vm_mm); get_page(page); set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); page_add_anon_rmap(page, vma, address); swap_free(entry); + acct_update_integrals(); + update_mem_hiwater(); } /* vma->vm_mm->page_table_lock is held */ -static unsigned long unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, - unsigned long address, unsigned long size, unsigned long offset, +static unsigned long unuse_pmd(struct vm_area_struct *vma, pmd_t *dir, + unsigned long address, unsigned long end, swp_entry_t entry, struct page *page) { - pte_t * pte; - unsigned long end; + pte_t *pte; pte_t swp_pte = swp_entry_to_pte(entry); if (pmd_none(*dir)) @@ -456,81 +458,120 @@ static unsigned long unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, return 0; } pte = pte_offset_map(dir, address); - offset += address & PMD_MASK; - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; do { /* * swapoff spends a _lot_ of time in this loop! * Test inline before going to call unuse_pte. */ if (unlikely(pte_same(*pte, swp_pte))) { - unuse_pte(vma, offset + address, pte, entry, page); + unuse_pte(vma, address, pte, entry, page); pte_unmap(pte); + + /* + * Move the page to the active list so it is not + * immediately swapped out again after swapon. + */ + activate_page(page); + /* add 1 since address may be 0 */ - return 1 + offset + address; + return 1 + address; } address += PAGE_SIZE; pte++; - } while (address && (address < end)); + } while (address < end); pte_unmap(pte - 1); return 0; } /* vma->vm_mm->page_table_lock is held */ -static unsigned long unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, - unsigned long address, unsigned long size, +static unsigned long unuse_pud(struct vm_area_struct *vma, pud_t *pud, + unsigned long address, unsigned long end, swp_entry_t entry, struct page *page) { - pmd_t * pmd; - unsigned long offset, end; + pmd_t *pmd; + unsigned long next; unsigned long foundaddr; - if (pgd_none(*dir)) + if (pud_none(*pud)) return 0; - if (pgd_bad(*dir)) { - pgd_ERROR(*dir); - pgd_clear(dir); + if (pud_bad(*pud)) { + pud_ERROR(*pud); + pud_clear(pud); return 0; } - pmd = pmd_offset(dir, address); - offset = address & PGDIR_MASK; - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - if (address >= end) - BUG(); + pmd = pmd_offset(pud, address); do { - foundaddr = unuse_pmd(vma, pmd, address, end - address, - offset, entry, page); + next = (address + PMD_SIZE) & PMD_MASK; + if (next > end || !next) + next = end; + foundaddr = unuse_pmd(vma, pmd, address, next, entry, page); if (foundaddr) return foundaddr; - address = (address + PMD_SIZE) & PMD_MASK; + address = next; pmd++; - } while (address && (address < end)); + } while (address < end); return 0; } /* vma->vm_mm->page_table_lock is held */ -static unsigned long unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, +static unsigned long unuse_pgd(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long address, unsigned long end, swp_entry_t entry, struct page *page) { - unsigned long start = vma->vm_start, end = vma->vm_end; + pud_t *pud; + unsigned long next; unsigned long foundaddr; - if (start >= end) - BUG(); + if (pgd_none(*pgd)) + return 0; + if (pgd_bad(*pgd)) { + pgd_ERROR(*pgd); + pgd_clear(pgd); + return 0; + } + pud = pud_offset(pgd, address); + do { + next = (address + PUD_SIZE) & PUD_MASK; + if (next > end || !next) + next = end; + foundaddr = unuse_pud(vma, pud, address, next, entry, page); + if (foundaddr) + return foundaddr; + address = next; + pud++; + } while (address < end); + return 0; +} + +/* vma->vm_mm->page_table_lock is held */ +static unsigned long unuse_vma(struct vm_area_struct *vma, + swp_entry_t entry, struct page *page) +{ + pgd_t *pgd; + unsigned long address, next, end; + unsigned long foundaddr; + + if (page->mapping) { + address = page_address_in_vma(page, vma); + if (address == -EFAULT) + return 0; + else + end = address + PAGE_SIZE; + } else { + address = vma->vm_start; + end = vma->vm_end; + } + pgd = pgd_offset(vma->vm_mm, address); do { - foundaddr = unuse_pgd(vma, pgdir, start, end - start, - entry, page); + next = (address + PGDIR_SIZE) & PGDIR_MASK; + if (next > end || !next) + next = end; + foundaddr = unuse_pgd(vma, pgd, address, next, entry, page); if (foundaddr) return foundaddr; - start = (start + PGDIR_SIZE) & PGDIR_MASK; - pgdir++; - } while (start && (start < end)); + address = next; + pgd++; + } while (address < end); return 0; } @@ -543,12 +584,19 @@ static int unuse_process(struct mm_struct * mm, /* * Go through process' page directory. */ - down_read(&mm->mmap_sem); + if (!down_read_trylock(&mm->mmap_sem)) { + /* + * Our reference to the page stops try_to_unmap_one from + * unmapping its ptes, so swapoff can make progress. + */ + unlock_page(page); + down_read(&mm->mmap_sem); + lock_page(page); + } spin_lock(&mm->page_table_lock); for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (!is_vm_hugetlb_page(vma)) { - pgd_t * pgd = pgd_offset(mm, vma->vm_start); - foundaddr = unuse_vma(vma, pgd, entry, page); + if (vma->anon_vma) { + foundaddr = unuse_vma(vma, entry, page); if (foundaddr) break; } @@ -625,11 +673,12 @@ static int try_to_unuse(unsigned int type) * * A simpler strategy would be to start at the last mm we * freed the previous entry from; but that would take less - * advantage of mmlist ordering (now preserved by swap_out()), - * which clusters forked address spaces together, most recent - * child immediately after parent. If we race with dup_mmap(), - * we very much want to resolve parent before child, otherwise - * we may miss some entries: using last mm would invert that. + * advantage of mmlist ordering, which clusters forked mms + * together, child after parent. If we race with dup_mmap(), we + * prefer to resolve parent before child, lest we miss entries + * duplicated after we scanned child: using last mm would invert + * that. Though it's only a serious concern when an overflowed + * swap count is reset from SWAP_MAP_MAX, preventing a rescan. */ start_mm = &init_mm; atomic_inc(&init_mm.mm_users); @@ -637,17 +686,9 @@ static int try_to_unuse(unsigned int type) /* * Keep on scanning until all entries have gone. Usually, * one pass through swap_map is enough, but not necessarily: - * mmput() removes mm from mmlist before exit_mmap() and its - * zap_page_range(). That's not too bad, those entries are - * on their way out, and handled faster there than here. - * do_munmap() behaves similarly, taking the range out of mm's - * vma list before zap_page_range(). But unfortunately, when - * unmapping a part of a vma, it takes the whole out first, - * then reinserts what's left after (might even reschedule if - * open() method called) - so swap entries may be invisible - * to swapoff for a while, then reappear - but that is rare. + * there are races when an instance of an entry might be missed. */ - while ((i = find_next_to_unuse(si, i))) { + while ((i = find_next_to_unuse(si, i)) != 0) { if (signal_pending(current)) { retval = -EINTR; break; @@ -697,7 +738,7 @@ static int try_to_unuse(unsigned int type) wait_on_page_writeback(page); /* - * Remove all references to entry, without blocking. + * Remove all references to entry. * Whenever we reach init_mm, there's no address space * to search, but use it as a reminder to search shmem. */ @@ -722,7 +763,10 @@ static int try_to_unuse(unsigned int type) while (*swap_map > 1 && !retval && (p = p->next) != &start_mm->mmlist) { mm = list_entry(p, struct mm_struct, mmlist); - atomic_inc(&mm->mm_users); + if (atomic_inc_return(&mm->mm_users) == 1) { + atomic_dec(&mm->mm_users); + continue; + } spin_unlock(&mmlist_lock); mmput(prev_mm); prev_mm = mm; @@ -835,6 +879,26 @@ static int try_to_unuse(unsigned int type) return retval; } +/* + * After a successful try_to_unuse, if no swap is now in use, we know we + * can empty the mmlist. swap_list_lock must be held on entry and exit. + * Note that mmlist_lock nests inside swap_list_lock, and an mm must be + * added to the mmlist just after page_duplicate - before would be racy. + */ +static void drain_mmlist(void) +{ + struct list_head *p, *next; + unsigned int i; + + for (i = 0; i < nr_swapfiles; i++) + if (swap_info[i].inuse_pages) + return; + spin_lock(&mmlist_lock); + list_for_each_safe(p, next, &init_mm.mmlist) + list_del_init(p); + spin_unlock(&mmlist_lock); +} + /* * Use this swapdev's extent info to locate the (PAGE_SIZE) block which * corresponds to page offset `offset'. @@ -1067,6 +1131,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) unsigned short *swap_map; struct file *swap_file, *victim; struct address_space *mapping; + struct inode *inode; char * pathname; int i, type, prev; int err; @@ -1079,7 +1144,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) if (IS_ERR(pathname)) goto out; - victim = filp_open(pathname, O_RDWR, 0); + victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0); putname(pathname); err = PTR_ERR(victim); if (IS_ERR(victim)) @@ -1148,6 +1213,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) } down(&swapon_sem); swap_list_lock(); + drain_mmlist(); swap_device_lock(p); swap_file = p->swap_file; p->swap_file = NULL; @@ -1160,12 +1226,15 @@ asmlinkage long sys_swapoff(const char __user * specialfile) swap_list_unlock(); up(&swapon_sem); vfree(swap_map); - if (S_ISBLK(mapping->host->i_mode)) { - struct block_device *bdev = I_BDEV(mapping->host); + inode = mapping->host; + if (S_ISBLK(inode->i_mode)) { + struct block_device *bdev = I_BDEV(inode); set_blocksize(bdev, p->old_block_size); bd_release(bdev); } else { - up(&mapping->host->i_sem); + down(&inode->i_sem); + inode->i_flags &= ~S_SWAPFILE; + up(&inode->i_sem); } filp_close(swap_file, NULL); err = 0; @@ -1284,7 +1353,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) int i, prev; int error; static int least_priority; - union swap_header *swap_header = 0; + union swap_header *swap_header = NULL; int swap_header_version; int nr_good_pages = 0; unsigned long maxpages = 1; @@ -1330,7 +1399,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) p->highest_bit = 0; p->cluster_nr = 0; p->inuse_pages = 0; - p->sdev_lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&p->sdev_lock); p->next = -1; if (swap_flags & SWAP_FLAG_PREFER) { p->prio = @@ -1345,7 +1414,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) name = NULL; goto bad_swap_2; } - swap_file = filp_open(name, O_RDWR, 0); + swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); error = PTR_ERR(swap_file); if (IS_ERR(swap_file)) { swap_file = NULL; @@ -1383,6 +1452,10 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) p->bdev = inode->i_sb->s_bdev; down(&inode->i_sem); did_down = 1; + if (IS_SWAPFILE(inode)) { + error = -EBUSY; + goto bad_swap; + } } else { goto bad_swap; } @@ -1544,8 +1617,7 @@ bad_swap_2: ++least_priority; swap_list_unlock(); destroy_swap_extents(p); - if (swap_map) - vfree(swap_map); + vfree(swap_map); if (swap_file) filp_close(swap_file, NULL); out: @@ -1555,8 +1627,11 @@ out: } if (name) putname(name); - if (error && did_down) + if (did_down) { + if (!error) + inode->i_flags |= S_SWAPFILE; up(&inode->i_sem); + } return error; } @@ -1575,8 +1650,8 @@ void si_swapinfo(struct sysinfo *val) val->freeswap = nr_swap_pages + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; swap_list_unlock(); - if (vx_flags(VXF_VIRT_MEM, 0)) - vx_vsi_swapinfo(val); + if (vx_flags(VXF_VIRT_MEM, 0)) + vx_vsi_swapinfo(val); } /*