#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
+#include <linux/rmap.h>
/*
* The maximum number of pages to writeout in a single bdflush/kupdate
static long ratelimit_pages = 32;
static long total_pages; /* The total number of pages in the machine. */
-static int dirty_exceeded; /* Dirty mem may be over limit */
+static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */
/*
* When balance_dirty_pages decides that the caller needs to perform some
int vm_dirty_ratio = 40;
/*
- * The interval between `kupdate'-style writebacks, in centiseconds
- * (hundredths of a second)
+ * The interval between `kupdate'-style writebacks, in jiffies
*/
-int dirty_writeback_centisecs = 5 * 100;
+int dirty_writeback_interval = 5 * HZ;
/*
- * The longest number of centiseconds for which data is allowed to remain dirty
+ * The longest number of jiffies for which data is allowed to remain dirty
*/
-int dirty_expire_centisecs = 30 * 100;
+int dirty_expire_interval = 30 * HZ;
/*
* Flag that makes the machine dump writes/reads and block dirtyings.
int block_dump;
/*
- * Flag that puts the machine in "laptop mode".
+ * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
+ * a full sync is triggered after this time elapses without any disk activity.
*/
int laptop_mode;
static void background_writeout(unsigned long _min_pages);
-struct writeback_state
-{
- unsigned long nr_dirty;
- unsigned long nr_unstable;
- unsigned long nr_mapped;
- unsigned long nr_writeback;
-};
-
-static void get_writeback_state(struct writeback_state *wbs)
-{
- wbs->nr_dirty = read_page_state(nr_dirty);
- wbs->nr_unstable = read_page_state(nr_unstable);
- wbs->nr_mapped = read_page_state(nr_mapped);
- wbs->nr_writeback = read_page_state(nr_writeback);
-}
-
/*
* Work out the current dirty-memory clamping and background writeout
* thresholds.
* clamping level.
*/
static void
-get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty,
- struct address_space *mapping)
+get_dirty_limits(long *pbackground, long *pdirty,
+ struct address_space *mapping)
{
int background_ratio; /* Percentages */
int dirty_ratio;
unsigned long available_memory = total_pages;
struct task_struct *tsk;
- get_writeback_state(wbs);
-
#ifdef CONFIG_HIGHMEM
/*
* If this mapping can only allocate from low memory,
#endif
- unmapped_ratio = 100 - (wbs->nr_mapped * 100) / total_pages;
+ unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) +
+ global_page_state(NR_ANON_PAGES)) * 100) /
+ total_pages;
dirty_ratio = vm_dirty_ratio;
if (dirty_ratio > unmapped_ratio / 2)
*/
static void balance_dirty_pages(struct address_space *mapping)
{
- struct writeback_state wbs;
long nr_reclaimable;
long background_thresh;
long dirty_thresh;
.sync_mode = WB_SYNC_NONE,
.older_than_this = NULL,
.nr_to_write = write_chunk,
+ .range_cyclic = 1,
};
- get_dirty_limits(&wbs, &background_thresh,
- &dirty_thresh, mapping);
- nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable;
- if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
- break;
+ get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
+ nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS);
+ if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
+ dirty_thresh)
+ break;
- dirty_exceeded = 1;
+ if (!dirty_exceeded)
+ dirty_exceeded = 1;
/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
* Unstable writes are a feature of certain networked
*/
if (nr_reclaimable) {
writeback_inodes(&wbc);
- get_dirty_limits(&wbs, &background_thresh,
- &dirty_thresh, mapping);
- nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable;
- if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
- break;
+ get_dirty_limits(&background_thresh,
+ &dirty_thresh, mapping);
+ nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS);
+ if (nr_reclaimable +
+ global_page_state(NR_WRITEBACK)
+ <= dirty_thresh)
+ break;
pages_written += write_chunk - wbc.nr_to_write;
if (pages_written >= write_chunk)
break; /* We've done our duty */
blk_congestion_wait(WRITE, HZ/10);
}
- if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
- dirty_exceeded = 0;
+ if (nr_reclaimable + global_page_state(NR_WRITEBACK)
+ <= dirty_thresh && dirty_exceeded)
+ dirty_exceeded = 0;
if (writeback_in_progress(bdi))
return; /* pdflush is already working this queue */
pdflush_operation(background_writeout, 0);
}
+void set_page_dirty_balance(struct page *page)
+{ /*
+ * Dirty @page with set_page_dirty() and, when that call returns nonzero
+ * and the page has a mapping, run the ratelimited dirty-memory balance
+ * check against that mapping.
+ */
+ if (set_page_dirty(page)) {
+ struct address_space *mapping = page_mapping(page);
+ /* pages for which page_mapping() returned NULL are not balanced */
+ if (mapping)
+ balance_dirty_pages_ratelimited(mapping);
+ }
+}
+
/**
- * balance_dirty_pages_ratelimited - balance dirty memory state
- * @mapping - address_space which was dirtied
+ * balance_dirty_pages_ratelimited_nr - balance dirty memory state
+ * @mapping: address_space which was dirtied
+ * @nr_pages_dirtied: number of pages which the caller has just dirtied
*
* Processes which are dirtying memory should call in here once for each page
* which was newly dirtied. The function will periodically check the system's
* limit we decrease the ratelimiting by a lot, to prevent individual processes
* from overshooting the limit by (ratelimit_pages) each.
*/
-void balance_dirty_pages_ratelimited(struct address_space *mapping)
+void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
+ unsigned long nr_pages_dirtied)
{
- static DEFINE_PER_CPU(int, ratelimits) = 0;
- long ratelimit;
+ static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
+ unsigned long ratelimit;
+ unsigned long *p;
ratelimit = ratelimit_pages;
if (dirty_exceeded)
* Check the rate limiting. Also, we do not want to throttle real-time
* tasks in balance_dirty_pages(). Period.
*/
- if (get_cpu_var(ratelimits)++ >= ratelimit) {
- __get_cpu_var(ratelimits) = 0;
- put_cpu_var(ratelimits);
+ preempt_disable();
+ p = &__get_cpu_var(ratelimits);
+ *p += nr_pages_dirtied;
+ if (unlikely(*p >= ratelimit)) {
+ *p = 0;
+ preempt_enable();
balance_dirty_pages(mapping);
return;
}
- put_cpu_var(ratelimits);
+ preempt_enable();
+}
+EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
+
+void throttle_vm_writeout(void)
+{
+ long background_thresh; /* filled in by get_dirty_limits(); not read in this function */
+ long dirty_thresh;
+ /*
+ * Loop until unstable-NFS pages plus pages under writeback total no
+ * more than the dirty threshold plus one tenth of that threshold.
+ * NR_FILE_DIRTY is not part of the sum tested here.
+ */
+ for ( ; ; ) {
+ get_dirty_limits(&background_thresh, &dirty_thresh, NULL); /* limits are recomputed on every pass */
+
+ /*
+ * Boost the allowable dirty threshold a bit for page
+ * allocators so they don't get DoS'ed by heavy writers
+ */
+ dirty_thresh += dirty_thresh / 10; /* wheeee... */
+
+ if (global_page_state(NR_UNSTABLE_NFS) +
+ global_page_state(NR_WRITEBACK) <= dirty_thresh)
+ break;
+ blk_congestion_wait(WRITE, HZ/10); /* still over: wait (presumably up to HZ/10 jiffies - confirm), then retry */
+ }
}
-EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
+
/*
* writeback at least _min_pages, and keep writing until the amount of dirty
.older_than_this = NULL,
.nr_to_write = 0,
.nonblocking = 1,
+ .range_cyclic = 1,
};
for ( ; ; ) {
- struct writeback_state wbs;
long background_thresh;
long dirty_thresh;
- get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL);
- if (wbs.nr_dirty + wbs.nr_unstable < background_thresh
+ get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+ if (global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS) < background_thresh
&& min_pages <= 0)
break;
wbc.encountered_congestion = 0;
* the whole world. Returns 0 if a pdflush thread was dispatched. Returns
* -1 if all pdflush threads were busy.
*/
-int wakeup_bdflush(long nr_pages)
+int wakeup_pdflush(long nr_pages) /* hands background_writeout(nr_pages) to pdflush_operation() and returns its result */
{
- if (nr_pages == 0) {
- struct writeback_state wbs;
-
- get_writeback_state(&wbs);
- nr_pages = wbs.nr_dirty + wbs.nr_unstable;
- }
+ if (nr_pages == 0) /* 0: substitute the current dirty-file plus unstable-NFS page count */
+ nr_pages = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS);
return pdflush_operation(background_writeout, nr_pages);
}
static void wb_timer_fn(unsigned long unused);
static void laptop_timer_fn(unsigned long unused);
-static struct timer_list wb_timer =
- TIMER_INITIALIZER(wb_timer_fn, 0, 0);
-static struct timer_list laptop_mode_wb_timer =
- TIMER_INITIALIZER(laptop_timer_fn, 0, 0);
+static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
+static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
/*
* Periodic writeback of "old" data.
* just walks the superblock inode list, writing back any inodes which are
* older than a specific point in time.
*
- * Try to run once per dirty_writeback_centisecs. But if a writeback event
- * takes longer than a dirty_writeback_centisecs interval, then leave a
+ * Try to run once per dirty_writeback_interval. But if a writeback event
+ * takes longer than one dirty_writeback_interval, then leave a
* one-second gap.
*
* older_than_this takes precedence over nr_to_write. So we'll only write back
unsigned long start_jif;
unsigned long next_jif;
long nr_to_write;
- struct writeback_state wbs;
struct writeback_control wbc = {
.bdi = NULL,
.sync_mode = WB_SYNC_NONE,
.nr_to_write = 0,
.nonblocking = 1,
.for_kupdate = 1,
+ .range_cyclic = 1,
};
sync_supers();
- get_writeback_state(&wbs);
- oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
+ oldest_jif = jiffies - dirty_expire_interval;
start_jif = jiffies;
- next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
- nr_to_write = wbs.nr_dirty + wbs.nr_unstable +
+ next_jif = start_jif + dirty_writeback_interval;
+ nr_to_write = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS) +
(inodes_stat.nr_inodes - inodes_stat.nr_unused);
while (nr_to_write > 0) {
wbc.encountered_congestion = 0;
}
if (time_before(next_jif, jiffies + HZ))
next_jif = jiffies + HZ;
- if (dirty_writeback_centisecs)
+ if (dirty_writeback_interval)
mod_timer(&wb_timer, next_jif);
}
int dirty_writeback_centisecs_handler(ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
- proc_dointvec(table, write, file, buffer, length, ppos);
- if (dirty_writeback_centisecs) {
+ proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
+ if (dirty_writeback_interval) {
mod_timer(&wb_timer,
- jiffies + (dirty_writeback_centisecs * HZ) / 100);
- } else {
+ jiffies + dirty_writeback_interval);
+ } else {
del_timer(&wb_timer);
}
return 0;
*/
void laptop_io_completion(void)
{
- mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ);
+ mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode); /* laptop_mode is already a jiffies count, so it is added without HZ scaling */
}
/*
ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
}
-static int
+static int __cpuinit /* notifier callback: calls set_ratelimit() whatever the event; self, u and v are unused; always returns 0 */
ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
{
set_ratelimit();
return 0;
}
-static struct notifier_block ratelimit_nb = {
+static struct notifier_block __cpuinitdata ratelimit_nb = { /* notifier block whose callback is ratelimit_handler; passed to register_cpu_notifier() in this file */
.notifier_call = ratelimit_handler,
.next = NULL,
};
if (vm_dirty_ratio <= 0)
vm_dirty_ratio = 1;
}
- mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100);
+ mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
}
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
+ int ret; /* result of whichever writepages implementation runs */
+ /*
+ * Write back pages of @mapping under the control of @wbc.
+ * Returns 0 without doing anything when wbc->nr_to_write <= 0;
+ * otherwise returns the value produced by the mapping's own
+ * ->writepages, or by generic_writepages() when none is set.
+ */
if (wbc->nr_to_write <= 0)
return 0;
+ wbc->for_writepages = 1; /* flag is set only for the duration of the writepages call */
if (mapping->a_ops->writepages)
- return mapping->a_ops->writepages(mapping, wbc);
- return generic_writepages(mapping, wbc);
+ ret = mapping->a_ops->writepages(mapping, wbc); /* mapping supplies its own implementation */
+ else
+ ret = generic_writepages(mapping, wbc); /* no a_ops->writepages: use the generic one */
+ wbc->for_writepages = 0; /* cleared again before returning, on both branches */
+ return ret;
}
/**
* write_one_page - write out a single page and optionally wait on I/O
*
- * @page - the page to write
- * @wait - if true, wait on writeout
+ * @page: the page to write
+ * @wait: if true, wait on writeout
*
* The page must be locked by the caller and will be unlocked upon return.
*
*/
int __set_page_dirty_nobuffers(struct page *page)
{
- int ret = 0;
-
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page_mapping(page);
struct address_space *mapping2;
if (mapping) {
- spin_lock_irq(&mapping->tree_lock);
+ write_lock_irq(&mapping->tree_lock); /* held as a writer while the page_tree dirty tag is set */
mapping2 = page_mapping(page);
if (mapping2) { /* Race with truncate? */
BUG_ON(mapping2 != mapping);
- if (!mapping->backing_dev_info->memory_backed)
- inc_page_state(nr_dirty);
+ if (mapping_cap_account_dirty(mapping)) /* presumably true only for mappings whose dirty pages are accounted - confirm */
+ __inc_zone_page_state(page,
+ NR_FILE_DIRTY);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
- spin_unlock_irq(&mapping->tree_lock);
+ write_unlock_irq(&mapping->tree_lock); /* dropped before the inode is marked dirty */
if (mapping->host) {
/* !PageAnon && !swapper_space */
__mark_inode_dirty(mapping->host,
I_DIRTY_PAGES);
}
}
+ return 1; /* TestSetPageDirty() returned 0 above, i.e. this call set the dirty flag (with or without a mapping) */
}
- return ret;
+ return 0; /* TestSetPageDirty() returned nonzero: the page was presumably already dirty */
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
return (*spd)(page);
return __set_page_dirty_buffers(page);
}
- if (!PageDirty(page))
- SetPageDirty(page);
+ if (!PageDirty(page)) {
+ if (!TestSetPageDirty(page))
+ return 1;
+ }
return 0;
}
EXPORT_SYMBOL(set_page_dirty);
struct address_space *mapping = page_mapping(page);
unsigned long flags;
+ WARN_ON_ONCE(!PageLocked(page));
if (mapping) {
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ write_lock_irqsave(&mapping->tree_lock, flags);
if (TestClearPageDirty(page)) {
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_DIRTY);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
- if (!mapping->backing_dev_info->memory_backed)
- dec_page_state(nr_dirty);
+ write_unlock_irqrestore(&mapping->tree_lock, flags);
+ /*
+ * We can continue to use `mapping' here because the
+ * page is locked, which pins the address_space
+ */
+ if (mapping_cap_account_dirty(mapping)) {
+ page_mkclean(page);
+ dec_zone_page_state(page, NR_FILE_DIRTY);
+ }
return 1;
}
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ write_unlock_irqrestore(&mapping->tree_lock, flags);
return 0;
}
return TestClearPageDirty(page);
{
struct address_space *mapping = page_mapping(page);
+ WARN_ON_ONCE(!PageLocked(page));
if (mapping) {
if (TestClearPageDirty(page)) {
- if (!mapping->backing_dev_info->memory_backed)
- dec_page_state(nr_dirty);
+ if (mapping_cap_account_dirty(mapping)) {
+ page_mkclean(page);
+ dec_zone_page_state(page, NR_FILE_DIRTY);
+ }
return 1;
}
return 0;
}
EXPORT_SYMBOL(clear_page_dirty_for_io);
-/*
- * Clear a page's dirty flag while ignoring dirty memory accounting
- */
-int __clear_page_dirty(struct page *page)
-{
- struct address_space *mapping = page_mapping(page);
-
- if (mapping) {
- unsigned long flags;
-
- spin_lock_irqsave(&mapping->tree_lock, flags);
- if (TestClearPageDirty(page)) {
- radix_tree_tag_clear(&mapping->page_tree,
- page_index(page),
- PAGECACHE_TAG_DIRTY);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
- return 1;
- }
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
- return 0;
- }
- return TestClearPageDirty(page);
-}
-
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
if (mapping) {
unsigned long flags;
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ write_lock_irqsave(&mapping->tree_lock, flags);
ret = TestClearPageWriteback(page);
if (ret)
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_WRITEBACK);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ write_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestClearPageWriteback(page);
}
if (mapping) {
unsigned long flags;
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ write_lock_irqsave(&mapping->tree_lock, flags);
ret = TestSetPageWriteback(page);
if (!ret)
radix_tree_tag_set(&mapping->page_tree,
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_DIRTY);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ write_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestSetPageWriteback(page);
}
unsigned long flags;
int ret;
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ read_lock_irqsave(&mapping->tree_lock, flags);
ret = radix_tree_tagged(&mapping->page_tree, tag);
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ read_unlock_irqrestore(&mapping->tree_lock, flags);
return ret;
}
EXPORT_SYMBOL(mapping_tagged);