X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=mm%2Fpage-writeback.c;h=75d7f48b79bba537d522cbd709138d48196d6782;hb=43bc926fffd92024b46cafaf7350d669ba9ca884;hp=343998d46bb03344d27845a79b6c20096e1508ca;hpb=9bf4aaab3e101692164d49b7ca357651eb691cb6;p=linux-2.6.git diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 343998d46..75d7f48b7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -46,7 +46,7 @@ static long ratelimit_pages = 32; static long total_pages; /* The total number of pages in the machine. */ -static int dirty_exceeded; /* Dirty mem may be over limit */ +static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */ /* * When balance_dirty_pages decides that the caller needs to perform some @@ -72,15 +72,14 @@ int dirty_background_ratio = 10; int vm_dirty_ratio = 40; /* - * The interval between `kupdate'-style writebacks, in centiseconds - * (hundredths of a second) + * The interval between `kupdate'-style writebacks, in jiffies */ -int dirty_writeback_centisecs = 5 * 100; +int dirty_writeback_interval = 5 * HZ; /* - * The longest number of centiseconds for which data is allowed to remain dirty + * The longest number of jiffies for which data is allowed to remain dirty */ -int dirty_expire_centisecs = 30 * 100; +int dirty_expire_interval = 30 * HZ; /* * Flag that makes the machine dump writes/reads and block dirtyings. @@ -88,7 +87,8 @@ int dirty_expire_centisecs = 30 * 100; int block_dump; /* - * Flag that puts the machine in "laptop mode". + * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: + * a full sync is triggered after this time elapses without any disk activity. */ int laptop_mode; @@ -133,17 +133,29 @@ static void get_writeback_state(struct writeback_state *wbs) * clamping level. */ static void -get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty) +get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, + struct address_space *mapping) { int background_ratio; /* Percentages */ int dirty_ratio; int unmapped_ratio; long background; long dirty; + unsigned long available_memory = total_pages; struct task_struct *tsk; get_writeback_state(wbs); +#ifdef CONFIG_HIGHMEM + /* + * If this mapping can only allocate from low memory, + * we exclude high memory from our count. + */ + if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM)) + available_memory -= totalhigh_pages; +#endif + + unmapped_ratio = 100 - (wbs->nr_mapped * 100) / total_pages; dirty_ratio = vm_dirty_ratio; @@ -157,8 +169,8 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty) if (background_ratio >= dirty_ratio) background_ratio = dirty_ratio / 2; - background = (background_ratio * total_pages) / 100; - dirty = (dirty_ratio * total_pages) / 100; + background = (background_ratio * available_memory) / 100; + dirty = (dirty_ratio * available_memory) / 100; tsk = current; if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { background += background / 4; @@ -194,12 +206,14 @@ static void balance_dirty_pages(struct address_space *mapping) .nr_to_write = write_chunk, }; - get_dirty_limits(&wbs, &background_thresh, &dirty_thresh); + get_dirty_limits(&wbs, &background_thresh, + &dirty_thresh, mapping); nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) break; - dirty_exceeded = 1; + if (!dirty_exceeded) + dirty_exceeded = 1; /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. 
* Unstable writes are a feature of certain networked @@ -210,7 +224,7 @@ static void balance_dirty_pages(struct address_space *mapping) if (nr_reclaimable) { writeback_inodes(&wbc); get_dirty_limits(&wbs, &background_thresh, - &dirty_thresh); + &dirty_thresh, mapping); nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) break; @@ -221,7 +235,7 @@ static void balance_dirty_pages(struct address_space *mapping) blk_congestion_wait(WRITE, HZ/10); } - if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) + if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh && dirty_exceeded) dirty_exceeded = 0; if (writeback_in_progress(bdi)) @@ -241,8 +255,9 @@ static void balance_dirty_pages(struct address_space *mapping) } /** - * balance_dirty_pages_ratelimited - balance dirty memory state - * @mapping - address_space which was dirtied + * balance_dirty_pages_ratelimited_nr - balance dirty memory state + * @mapping: address_space which was dirtied + * @nr_pages_dirtied: number of pages which the caller has just dirtied * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's @@ -253,10 +268,12 @@ static void balance_dirty_pages(struct address_space *mapping) * limit we decrease the ratelimiting by a lot, to prevent individual processes * from overshooting the limit by (ratelimit_pages) each. */ -void balance_dirty_pages_ratelimited(struct address_space *mapping) +void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, + unsigned long nr_pages_dirtied) { - static DEFINE_PER_CPU(int, ratelimits) = 0; - long ratelimit; + static DEFINE_PER_CPU(unsigned long, ratelimits) = 0; + unsigned long ratelimit; + unsigned long *p; ratelimit = ratelimit_pages; if (dirty_exceeded) @@ -266,15 +283,40 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) * Check the rate limiting. Also, we do not want to throttle real-time * tasks in balance_dirty_pages(). Period. */ - if (get_cpu_var(ratelimits)++ >= ratelimit) { - __get_cpu_var(ratelimits) = 0; - put_cpu_var(ratelimits); + preempt_disable(); + p = &__get_cpu_var(ratelimits); + *p += nr_pages_dirtied; + if (unlikely(*p >= ratelimit)) { + *p = 0; + preempt_enable(); balance_dirty_pages(mapping); return; } - put_cpu_var(ratelimits); + preempt_enable(); +} +EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); + +void throttle_vm_writeout(void) +{ + struct writeback_state wbs; + long background_thresh; + long dirty_thresh; + + for ( ; ; ) { + get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL); + + /* + * Boost the allowable dirty threshold a bit for page + * allocators so they don't get DoS'ed by heavy writers + */ + dirty_thresh += dirty_thresh / 10; /* wheeee... */ + + if (wbs.nr_unstable + wbs.nr_writeback <= dirty_thresh) + break; + blk_congestion_wait(WRITE, HZ/10); + } } -EXPORT_SYMBOL(balance_dirty_pages_ratelimited); + /* * writeback at least _min_pages, and keep writing until the amount of dirty @@ -296,7 +338,7 @@ static void background_writeout(unsigned long _min_pages) long background_thresh; long dirty_thresh; - get_dirty_limits(&wbs, &background_thresh, &dirty_thresh); + get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL); if (wbs.nr_dirty + wbs.nr_unstable < background_thresh && min_pages <= 0) break; @@ -319,7 +361,7 @@ static void background_writeout(unsigned long _min_pages) * the whole world. Returns 0 if a pdflush thread was dispatched. 
Returns * -1 if all pdflush threads were busy. */ -int wakeup_bdflush(long nr_pages) +int wakeup_pdflush(long nr_pages) { if (nr_pages == 0) { struct writeback_state wbs; @@ -333,10 +375,8 @@ int wakeup_bdflush(long nr_pages) static void wb_timer_fn(unsigned long unused); static void laptop_timer_fn(unsigned long unused); -static struct timer_list wb_timer = - TIMER_INITIALIZER(wb_timer_fn, 0, 0); -static struct timer_list laptop_mode_wb_timer = - TIMER_INITIALIZER(laptop_timer_fn, 0, 0); +static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0); +static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); /* * Periodic writeback of "old" data. @@ -346,8 +386,8 @@ static struct timer_list laptop_mode_wb_timer = * just walks the superblock inode list, writing back any inodes which are * older than a specific point in time. * - * Try to run once per dirty_writeback_centisecs. But if a writeback event - * takes longer than a dirty_writeback_centisecs interval, then leave a + * Try to run once per dirty_writeback_interval. But if a writeback event + * takes longer than a dirty_writeback_interval interval, then leave a * one-second gap. * * older_than_this takes precedence over nr_to_write. So we'll only write back @@ -372,9 +412,9 @@ static void wb_kupdate(unsigned long arg) sync_supers(); get_writeback_state(&wbs); - oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100; + oldest_jif = jiffies - dirty_expire_interval; start_jif = jiffies; - next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100; + next_jif = start_jif + dirty_writeback_interval; nr_to_write = wbs.nr_dirty + wbs.nr_unstable + (inodes_stat.nr_inodes - inodes_stat.nr_unused); while (nr_to_write > 0) { @@ -391,7 +431,7 @@ static void wb_kupdate(unsigned long arg) } if (time_before(next_jif, jiffies + HZ)) next_jif = jiffies + HZ; - if (dirty_writeback_centisecs) + if (dirty_writeback_interval) mod_timer(&wb_timer, next_jif); } @@ -401,11 +441,11 @@ static void wb_kupdate(unsigned long arg) int dirty_writeback_centisecs_handler(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); - if (dirty_writeback_centisecs) { + proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); + if (dirty_writeback_interval) { mod_timer(&wb_timer, - jiffies + (dirty_writeback_centisecs * HZ) / 100); - } else { + jiffies + dirty_writeback_interval); + } else { del_timer(&wb_timer); } return 0; @@ -434,7 +474,7 @@ static void laptop_timer_fn(unsigned long unused) */ void laptop_io_completion(void) { - mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ); + mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode); } /* @@ -504,26 +544,37 @@ void __init page_writeback_init(void) dirty_background_ratio /= 100; vm_dirty_ratio *= correction; vm_dirty_ratio /= 100; + + if (dirty_background_ratio <= 0) + dirty_background_ratio = 1; + if (vm_dirty_ratio <= 0) + vm_dirty_ratio = 1; } - mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100); + mod_timer(&wb_timer, jiffies + dirty_writeback_interval); set_ratelimit(); register_cpu_notifier(&ratelimit_nb); } int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { + int ret; + if (wbc->nr_to_write <= 0) return 0; + wbc->for_writepages = 1; if (mapping->a_ops->writepages) - return mapping->a_ops->writepages(mapping, wbc); - return generic_writepages(mapping, wbc); + ret = mapping->a_ops->writepages(mapping, wbc); + else + ret = 
generic_writepages(mapping, wbc); + wbc->for_writepages = 0; + return ret; } /** * write_one_page - write out a single page and optionally wait on I/O * - * @page - the page to write - * @wait - if true, wait on writeout + * @page: the page to write + * @wait: if true, wait on writeout * * The page must be locked by the caller and will be unlocked upon return. * @@ -576,30 +627,30 @@ EXPORT_SYMBOL(write_one_page); */ int __set_page_dirty_nobuffers(struct page *page) { - int ret = 0; - if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); + struct address_space *mapping2; if (mapping) { - spin_lock_irq(&mapping->tree_lock); - mapping = page_mapping(page); - if (page_mapping(page)) { /* Race with truncate? */ - BUG_ON(page_mapping(page) != mapping); - if (!mapping->backing_dev_info->memory_backed) + write_lock_irq(&mapping->tree_lock); + mapping2 = page_mapping(page); + if (mapping2) { /* Race with truncate? */ + BUG_ON(mapping2 != mapping); + if (mapping_cap_account_dirty(mapping)) inc_page_state(nr_dirty); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } - spin_unlock_irq(&mapping->tree_lock); + write_unlock_irq(&mapping->tree_lock); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } } + return 1; } - return ret; + return 0; } EXPORT_SYMBOL(__set_page_dirty_nobuffers); @@ -629,8 +680,10 @@ int fastcall set_page_dirty(struct page *page) return (*spd)(page); return __set_page_dirty_buffers(page); } - if (!PageDirty(page)) - SetPageDirty(page); + if (!PageDirty(page)) { + if (!TestSetPageDirty(page)) + return 1; + } return 0; } EXPORT_SYMBOL(set_page_dirty); @@ -666,17 +719,17 @@ int test_clear_page_dirty(struct page *page) unsigned long flags; if (mapping) { - spin_lock_irqsave(&mapping->tree_lock, flags); + write_lock_irqsave(&mapping->tree_lock, flags); if (TestClearPageDirty(page)) { radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); - if (!mapping->backing_dev_info->memory_backed) + write_unlock_irqrestore(&mapping->tree_lock, flags); + if (mapping_cap_account_dirty(mapping)) dec_page_state(nr_dirty); return 1; } - spin_unlock_irqrestore(&mapping->tree_lock, flags); + write_unlock_irqrestore(&mapping->tree_lock, flags); return 0; } return TestClearPageDirty(page); @@ -703,7 +756,7 @@ int clear_page_dirty_for_io(struct page *page) if (mapping) { if (TestClearPageDirty(page)) { - if (!mapping->backing_dev_info->memory_backed) + if (mapping_cap_account_dirty(mapping)) dec_page_state(nr_dirty); return 1; } @@ -713,30 +766,6 @@ int clear_page_dirty_for_io(struct page *page) } EXPORT_SYMBOL(clear_page_dirty_for_io); -/* - * Clear a page's dirty flag while ignoring dirty memory accounting - */ -int __clear_page_dirty(struct page *page) -{ - struct address_space *mapping = page_mapping(page); - - if (mapping) { - unsigned long flags; - - spin_lock_irqsave(&mapping->tree_lock, flags); - if (TestClearPageDirty(page)) { - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), - PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); - return 1; - } - spin_unlock_irqrestore(&mapping->tree_lock, flags); - return 0; - } - return TestClearPageDirty(page); -} - int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); @@ -745,13 +774,13 @@ int test_clear_page_writeback(struct page *page) if (mapping) { unsigned long flags; - 
spin_lock_irqsave(&mapping->tree_lock, flags); + write_lock_irqsave(&mapping->tree_lock, flags); ret = TestClearPageWriteback(page); if (ret) radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_WRITEBACK); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + write_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestClearPageWriteback(page); } @@ -766,7 +795,7 @@ int test_set_page_writeback(struct page *page) if (mapping) { unsigned long flags; - spin_lock_irqsave(&mapping->tree_lock, flags); + write_lock_irqsave(&mapping->tree_lock, flags); ret = TestSetPageWriteback(page); if (!ret) radix_tree_tag_set(&mapping->page_tree, @@ -776,7 +805,7 @@ int test_set_page_writeback(struct page *page) radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + write_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestSetPageWriteback(page); } @@ -794,9 +823,9 @@ int mapping_tagged(struct address_space *mapping, int tag) unsigned long flags; int ret; - spin_lock_irqsave(&mapping->tree_lock, flags); + read_lock_irqsave(&mapping->tree_lock, flags); ret = radix_tree_tagged(&mapping->page_tree, tag); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + read_unlock_irqrestore(&mapping->tree_lock, flags); return ret; } EXPORT_SYMBOL(mapping_tagged);
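
Note on the exported API: this diff renames the exported entry point from
balance_dirty_pages_ratelimited() to balance_dirty_pages_ratelimited_nr(),
which takes the number of pages the caller has just dirtied. The companion
header change (include/linux/writeback.h, not covered by this blobdiff) is
expected to keep the old name as a thin inline wrapper so single-page callers
keep compiling unchanged; a minimal sketch of that wrapper, assuming the
declaration shown here:

	void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
						unsigned long nr_pages_dirtied);

	/* Old single-page entry point, retained as a wrapper. */
	static inline void
	balance_dirty_pages_ratelimited(struct address_space *mapping)
	{
		balance_dirty_pages_ratelimited_nr(mapping, 1);
	}

Similarly, dirty_writeback_interval and dirty_expire_interval are now kept in
jiffies internally, while the sysctl handler switches to
proc_dointvec_userhz_jiffies, so /proc/sys/vm/dirty_writeback_centisecs and
dirty_expire_centisecs continue to be read and written in USER_HZ ticks
(centiseconds on typical configurations) from userspace.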