#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
+#include <linux/rmap.h>
/*
* The maximum number of pages to writeout in a single bdflush/kupdate
static long ratelimit_pages = 32;
static long total_pages; /* The total number of pages in the machine. */
-static int dirty_exceeded; /* Dirty mem may be over limit */
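+/*
+ * Give the flag its own cache line on SMP: it is read on every
+ * ratelimit check, so stores to neighbouring data must not keep
+ * bouncing it between CPUs.
+ */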
+static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */
/*
* When balance_dirty_pages decides that the caller needs to perform some
int vm_dirty_ratio = 40;
/*
- * The interval between `kupdate'-style writebacks, in centiseconds
- * (hundredths of a second)
+ * The interval between `kupdate'-style writebacks, in jiffies
*/
-int dirty_writeback_centisecs = 5 * 100;
+int dirty_writeback_interval = 5 * HZ;
/*
- * The longest number of centiseconds for which data is allowed to remain dirty
+ * The longest number of jiffies for which data is allowed to remain dirty
*/
-int dirty_expire_centisecs = 30 * 100;
+int dirty_expire_interval = 30 * HZ;
/*
* Flag that makes the machine dump writes/reads and block dirtyings.
int block_dump;
/*
- * Flag that puts the machine in "laptop mode".
+ * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
+ * a full sync is triggered after this time elapses without any disk activity.
*/
int laptop_mode;
+EXPORT_SYMBOL(laptop_mode);
+
/* End of sysctl-exported parameters */
* clamping level.
*/
static void
-get_dirty_limits(struct page_state *ps, long *pbackground, long *pdirty)
+get_dirty_limits(long *pbackground, long *pdirty,
+ struct address_space *mapping)
{
int background_ratio; /* Percentages */
int dirty_ratio;
int unmapped_ratio;
long background;
long dirty;
+ unsigned long available_memory = total_pages;
struct task_struct *tsk;
- get_page_state(ps);
+#ifdef CONFIG_HIGHMEM
+ /*
+ * If this mapping can only allocate from low memory,
+ * we exclude high memory from our count.
+ */
+ if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM))
+ available_memory -= totalhigh_pages;
+#endif
+
- unmapped_ratio = 100 - (ps->nr_mapped * 100) / total_pages;
+ unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) +
+ global_page_state(NR_ANON_PAGES)) * 100) /
+ total_pages;
dirty_ratio = vm_dirty_ratio;
if (dirty_ratio > unmapped_ratio / 2)
if (background_ratio >= dirty_ratio)
background_ratio = dirty_ratio / 2;
- background = (background_ratio * total_pages) / 100;
- dirty = (dirty_ratio * total_pages) / 100;
+ background = (background_ratio * available_memory) / 100;
+ dirty = (dirty_ratio * available_memory) / 100;
tsk = current;
if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
background += background / 4;
*/
static void balance_dirty_pages(struct address_space *mapping)
{
- struct page_state ps;
long nr_reclaimable;
long background_thresh;
long dirty_thresh;
.sync_mode = WB_SYNC_NONE,
.older_than_this = NULL,
.nr_to_write = write_chunk,
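+ /* cycle over the whole file range, resuming where the last pass stopped */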
+ .range_cyclic = 1,
};
- get_dirty_limits(&ps, &background_thresh, &dirty_thresh);
- nr_reclaimable = ps.nr_dirty + ps.nr_unstable;
- if (nr_reclaimable + ps.nr_writeback <= dirty_thresh)
- break;
+ get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
+ nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS);
+ if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
+ dirty_thresh)
+ break;
- dirty_exceeded = 1;
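+ /*
+ * Only store when the flag actually changes, to avoid bouncing
+ * its cache line between CPUs under steady heavy dirtying.
+ */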
+ if (!dirty_exceeded)
+ dirty_exceeded = 1;
/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
* Unstable writes are a feature of certain networked
*/
if (nr_reclaimable) {
writeback_inodes(&wbc);
- get_dirty_limits(&ps, &background_thresh,
- &dirty_thresh);
- nr_reclaimable = ps.nr_dirty + ps.nr_unstable;
- if (nr_reclaimable + ps.nr_writeback <= dirty_thresh)
- break;
+ get_dirty_limits(&background_thresh,
+ &dirty_thresh, mapping);
+ nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS);
+ if (nr_reclaimable +
+ global_page_state(NR_WRITEBACK)
+ <= dirty_thresh)
+ break;
pages_written += write_chunk - wbc.nr_to_write;
if (pages_written >= write_chunk)
break; /* We've done our duty */
blk_congestion_wait(WRITE, HZ/10);
}
- if (nr_reclaimable + ps.nr_writeback <= dirty_thresh)
- dirty_exceeded = 0;
+ if (nr_reclaimable + global_page_state(NR_WRITEBACK)
+ <= dirty_thresh && dirty_exceeded)
+ dirty_exceeded = 0;
if (writeback_in_progress(bdi))
return; /* pdflush is already working this queue */
pdflush_operation(background_writeout, 0);
}
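+/*
+ * Set a page dirty and, if the page made a clean->dirty transition,
+ * throttle the caller through balance_dirty_pages_ratelimited().
+ */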
+void set_page_dirty_balance(struct page *page)
+{
+ if (set_page_dirty(page)) {
+ struct address_space *mapping = page_mapping(page);
+
+ if (mapping)
+ balance_dirty_pages_ratelimited(mapping);
+ }
+}
+
/**
- * balance_dirty_pages_ratelimited - balance dirty memory state
- * @mapping - address_space which was dirtied
+ * balance_dirty_pages_ratelimited_nr - balance dirty memory state
+ * @mapping: address_space which was dirtied
+ * @nr_pages_dirtied: number of pages which the caller has just dirtied
*
* Processes which are dirtying memory should call in here once for each page
* which was newly dirtied. The function will periodically check the system's
* dirty state and will initiate writeback if needed.
*
- * On really big machines, get_page_state is expensive, so try to avoid calling
- * it too often (ratelimiting). But once we're over the dirty memory limit we
- * decrease the ratelimiting by a lot, to prevent individual processes from
- * overshooting the limit by (ratelimit_pages) each.
+ * On really big machines, reading the global page state is expensive, so
+ * try to avoid doing it too often (ratelimiting). But once we're over the
+ * dirty memory limit we decrease the ratelimiting by a lot, to prevent
+ * individual processes from overshooting the limit by (ratelimit_pages) each.
*/
-void balance_dirty_pages_ratelimited(struct address_space *mapping)
+void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
+ unsigned long nr_pages_dirtied)
{
- static DEFINE_PER_CPU(int, ratelimits) = 0;
- long ratelimit;
+ static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
+ unsigned long ratelimit;
+ unsigned long *p;
ratelimit = ratelimit_pages;
if (dirty_exceeded)
* Check the rate limiting. Also, we do not want to throttle real-time
* tasks in balance_dirty_pages(). Period.
*/
- if (get_cpu_var(ratelimits)++ >= ratelimit) {
- __get_cpu_var(ratelimits) = 0;
- put_cpu_var(ratelimits);
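+ /*
+ * preempt_disable() pins us to this CPU, making the non-atomic
+ * read-modify-write of the per-cpu counter below safe.
+ */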
+ preempt_disable();
+ p = &__get_cpu_var(ratelimits);
+ *p += nr_pages_dirtied;
+ if (unlikely(*p >= ratelimit)) {
+ *p = 0;
+ preempt_enable();
balance_dirty_pages(mapping);
return;
}
- put_cpu_var(ratelimits);
+ preempt_enable();
+}
+EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
+
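+/*
+ * Wait, riding blk_congestion_wait(), until unstable + writeback memory
+ * falls back under a slightly boosted dirty threshold.
+ */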
+void throttle_vm_writeout(void)
+{
+ long background_thresh;
+ long dirty_thresh;
+
+ for ( ; ; ) {
+ get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+
+ /*
+ * Boost the allowable dirty threshold a bit for page
+ * allocators so they don't get DoS'ed by heavy writers
+ */
+ dirty_thresh += dirty_thresh / 10; /* wheeee... */
+
+ if (global_page_state(NR_UNSTABLE_NFS) +
+ global_page_state(NR_WRITEBACK) <= dirty_thresh)
+ break;
+ blk_congestion_wait(WRITE, HZ/10);
+ }
}
-EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
+
/*
* writeback at least _min_pages, and keep writing until the amount of dirty
.older_than_this = NULL,
.nr_to_write = 0,
.nonblocking = 1,
+ .range_cyclic = 1,
};
for ( ; ; ) {
- struct page_state ps;
long background_thresh;
long dirty_thresh;
- get_dirty_limits(&ps, &background_thresh, &dirty_thresh);
- if (ps.nr_dirty + ps.nr_unstable < background_thresh
+ get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+ if (global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS) < background_thresh
&& min_pages <= 0)
break;
wbc.encountered_congestion = 0;
* the whole world. Returns 0 if a pdflush thread was dispatched. Returns
* -1 if all pdflush threads were busy.
*/
-int wakeup_bdflush(long nr_pages)
+int wakeup_pdflush(long nr_pages)
{
- if (nr_pages == 0) {
- struct page_state ps;
-
- get_page_state(&ps);
- nr_pages = ps.nr_dirty + ps.nr_unstable;
- }
+ if (nr_pages == 0)
+ nr_pages = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS);
return pdflush_operation(background_writeout, nr_pages);
}
static void wb_timer_fn(unsigned long unused);
static void laptop_timer_fn(unsigned long unused);
-static struct timer_list wb_timer =
- TIMER_INITIALIZER(wb_timer_fn, 0, 0);
-static struct timer_list laptop_mode_wb_timer =
- TIMER_INITIALIZER(laptop_timer_fn, 0, 0);
+static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
+static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
/*
* Periodic writeback of "old" data.
* just walks the superblock inode list, writing back any inodes which are
* older than a specific point in time.
*
- * Try to run once per dirty_writeback_centisecs. But if a writeback event
- * takes longer than a dirty_writeback_centisecs interval, then leave a
+ * Try to run once per dirty_writeback_interval. But if a writeback event
+ * takes longer than a dirty_writeback_interval, then leave a
* one-second gap.
*
* older_than_this takes precedence over nr_to_write. So we'll only write back
unsigned long start_jif;
unsigned long next_jif;
long nr_to_write;
- struct page_state ps;
struct writeback_control wbc = {
.bdi = NULL,
.sync_mode = WB_SYNC_NONE,
.nr_to_write = 0,
.nonblocking = 1,
.for_kupdate = 1,
+ .range_cyclic = 1,
};
sync_supers();
- get_page_state(&ps);
- oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
+ oldest_jif = jiffies - dirty_expire_interval;
start_jif = jiffies;
- next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
- nr_to_write = ps.nr_dirty + ps.nr_unstable +
+ next_jif = start_jif + dirty_writeback_interval;
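+ /* the inode term adds slack: one page per in-use inode, so inodes
+ dirtied while we run still get written out */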
+ nr_to_write = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS) +
(inodes_stat.nr_inodes - inodes_stat.nr_unused);
while (nr_to_write > 0) {
wbc.encountered_congestion = 0;
}
if (time_before(next_jif, jiffies + HZ))
next_jif = jiffies + HZ;
- if (dirty_writeback_centisecs)
+ if (dirty_writeback_interval)
mod_timer(&wb_timer, next_jif);
}
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
int dirty_writeback_centisecs_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length)
+ struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
- proc_dointvec(table, write, file, buffer, length);
- if (dirty_writeback_centisecs) {
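+ /*
+ * The sysctl still takes centiseconds (USER_HZ units);
+ * proc_dointvec_userhz_jiffies converts to and from jiffies.
+ */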
+ proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
+ if (dirty_writeback_interval) {
mod_timer(&wb_timer,
- jiffies + (dirty_writeback_centisecs * HZ) / 100);
- } else {
+ jiffies + dirty_writeback_interval);
+ } else {
del_timer(&wb_timer);
}
return 0;
*/
void laptop_io_completion(void)
{
- mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ);
+ mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode);
}
/*
/*
* If ratelimit_pages is too high then we can get into dirty-data overload
* if a large number of processes all perform writes at the same time.
- * If it is too low then SMP machines will call the (expensive) get_page_state
- * too often.
+ * If it is too low then SMP machines will check the (expensive) global
+ * page state too often.
*
* Here we set ratelimit_pages to a level which ensures that when all CPUs are
* dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
}
-static int
+static int __cpuinit
ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
{
set_ratelimit();
return 0;
}
-static struct notifier_block ratelimit_nb = {
+static struct notifier_block __cpuinitdata ratelimit_nb = {
.notifier_call = ratelimit_handler,
.next = NULL,
};
dirty_background_ratio /= 100;
vm_dirty_ratio *= correction;
vm_dirty_ratio /= 100;
+
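+ /* don't let the scaling correction round either ratio down to zero */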
+ if (dirty_background_ratio <= 0)
+ dirty_background_ratio = 1;
+ if (vm_dirty_ratio <= 0)
+ vm_dirty_ratio = 1;
}
- mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100);
+ mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
}
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
+ int ret;
+
if (wbc->nr_to_write <= 0)
return 0;
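+ /* flag the control so ->writepage sees it is driven by ->writepages */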
+ wbc->for_writepages = 1;
if (mapping->a_ops->writepages)
- return mapping->a_ops->writepages(mapping, wbc);
- return generic_writepages(mapping, wbc);
+ ret = mapping->a_ops->writepages(mapping, wbc);
+ else
+ ret = generic_writepages(mapping, wbc);
+ wbc->for_writepages = 0;
+ return ret;
}
/**
* write_one_page - write out a single page and optionally wait on I/O
*
- * @page - the page to write
- * @wait - if true, wait on writeout
+ * @page: the page to write
+ * @wait: if true, wait on writeout
*
* The page must be locked by the caller and will be unlocked upon return.
*
* For address_spaces which do not use buffers. Just tag the page as dirty in
* its radix tree.
*
- * __set_page_dirty_nobuffers() may return -ENOSPC. But if it does, the page
- * is still safe, as long as it actually manages to find some blocks at
- * writeback time.
- *
* This is also used when a single buffer is being dirtied: we want to set the
* page dirty in that case, but not all the buffers. This is a "bottom-up"
* dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
+ *
+ * Most callers have locked the page, which pins the address_space in memory.
+ * But zap_pte_range() does not lock the page; in that case the
+ * mapping is pinned by the vma's ->vm_file reference.
+ *
+ * We take care to handle the case where the page was truncated from the
+ * mapping by re-checking page_mapping() inside tree_lock.
*/
int __set_page_dirty_nobuffers(struct page *page)
{
- int ret = 0;
-
if (!TestSetPageDirty(page)) {
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = page_mapping(page);
+ struct address_space *mapping2;
if (mapping) {
- spin_lock_irq(&mapping->tree_lock);
- if (page->mapping) { /* Race with truncate? */
- BUG_ON(page->mapping != mapping);
- if (!mapping->backing_dev_info->memory_backed)
- inc_page_state(nr_dirty);
+ write_lock_irq(&mapping->tree_lock);
+ mapping2 = page_mapping(page);
+ if (mapping2) { /* Race with truncate? */
+ BUG_ON(mapping2 != mapping);
+ if (mapping_cap_account_dirty(mapping))
+ __inc_zone_page_state(page,
+ NR_FILE_DIRTY);
radix_tree_tag_set(&mapping->page_tree,
- page->index, PAGECACHE_TAG_DIRTY);
+ page_index(page), PAGECACHE_TAG_DIRTY);
}
- spin_unlock_irq(&mapping->tree_lock);
- if (!PageSwapCache(page))
+ write_unlock_irq(&mapping->tree_lock);
+ if (mapping->host) {
+ /* !PageAnon && !swapper_space */
__mark_inode_dirty(mapping->host,
I_DIRTY_PAGES);
+ }
}
+ return 1;
}
- return ret;
+ return 0;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
int fastcall set_page_dirty(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- int (*spd)(struct page *);
- if (!mapping) {
- SetPageDirty(page);
- return 0;
+ if (likely(mapping)) {
+ int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
+ if (spd)
+ return (*spd)(page);
+ return __set_page_dirty_buffers(page);
}
- spd = mapping->a_ops->set_page_dirty;
- return spd? (*spd)(page): __set_page_dirty_buffers(page);
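+ /*
+ * No mapping (e.g. an anonymous page): just set the dirty bit,
+ * reporting 1 only when this call actually changed the state.
+ */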
+ if (!PageDirty(page)) {
+ if (!TestSetPageDirty(page))
+ return 1;
+ }
+ return 0;
}
EXPORT_SYMBOL(set_page_dirty);
struct address_space *mapping = page_mapping(page);
unsigned long flags;
+ WARN_ON_ONCE(!PageLocked(page));
if (mapping) {
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ write_lock_irqsave(&mapping->tree_lock, flags);
if (TestClearPageDirty(page)) {
- radix_tree_tag_clear(&mapping->page_tree, page->index,
+ radix_tree_tag_clear(&mapping->page_tree,
+ page_index(page),
PAGECACHE_TAG_DIRTY);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
- if (!mapping->backing_dev_info->memory_backed)
- dec_page_state(nr_dirty);
+ write_unlock_irqrestore(&mapping->tree_lock, flags);
+ /*
+ * We can continue to use `mapping' here because the
+ * page is locked, which pins the address_space
+ */
+ if (mapping_cap_account_dirty(mapping)) {
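+ /*
+ * page_mkclean() write-protects the ptes mapping this
+ * page, so the next write faults and re-dirties it.
+ */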
+ page_mkclean(page);
+ dec_zone_page_state(page, NR_FILE_DIRTY);
+ }
return 1;
}
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ write_unlock_irqrestore(&mapping->tree_lock, flags);
return 0;
}
return TestClearPageDirty(page);
{
struct address_space *mapping = page_mapping(page);
+ WARN_ON_ONCE(!PageLocked(page));
if (mapping) {
if (TestClearPageDirty(page)) {
- if (!mapping->backing_dev_info->memory_backed)
- dec_page_state(nr_dirty);
+ if (mapping_cap_account_dirty(mapping)) {
+ page_mkclean(page);
+ dec_zone_page_state(page, NR_FILE_DIRTY);
+ }
return 1;
}
return 0;
}
EXPORT_SYMBOL(clear_page_dirty_for_io);
-/*
- * Clear a page's dirty flag while ignoring dirty memory accounting
- */
-int __clear_page_dirty(struct page *page)
-{
- struct address_space *mapping = page_mapping(page);
-
- if (mapping) {
- unsigned long flags;
-
- spin_lock_irqsave(&mapping->tree_lock, flags);
- if (TestClearPageDirty(page)) {
- radix_tree_tag_clear(&mapping->page_tree, page->index,
- PAGECACHE_TAG_DIRTY);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
- return 1;
- }
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
- return 0;
- }
- return TestClearPageDirty(page);
-}
-
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
if (mapping) {
unsigned long flags;
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ write_lock_irqsave(&mapping->tree_lock, flags);
ret = TestClearPageWriteback(page);
if (ret)
- radix_tree_tag_clear(&mapping->page_tree, page->index,
+ radix_tree_tag_clear(&mapping->page_tree,
+ page_index(page),
PAGECACHE_TAG_WRITEBACK);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ write_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestClearPageWriteback(page);
}
if (mapping) {
unsigned long flags;
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ write_lock_irqsave(&mapping->tree_lock, flags);
ret = TestSetPageWriteback(page);
if (!ret)
- radix_tree_tag_set(&mapping->page_tree, page->index,
+ radix_tree_tag_set(&mapping->page_tree,
+ page_index(page),
PAGECACHE_TAG_WRITEBACK);
if (!PageDirty(page))
- radix_tree_tag_clear(&mapping->page_tree, page->index,
+ radix_tree_tag_clear(&mapping->page_tree,
+ page_index(page),
PAGECACHE_TAG_DIRTY);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ write_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestSetPageWriteback(page);
}
unsigned long flags;
int ret;
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ read_lock_irqsave(&mapping->tree_lock, flags);
ret = radix_tree_tagged(&mapping->page_tree, tag);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ read_unlock_irqrestore(&mapping->tree_lock, flags);
return ret;
}
EXPORT_SYMBOL(mapping_tagged);