#include <asm/div64.h>
#include <linux/swapops.h>
-#include <linux/ckrm_mem.h>
-
-#ifndef AT_LIMIT_SUPPORT
-#warning "ckrm_at_limit disabled due to problems with memory hog tests -- seting ckrm_shrink_list_empty to true"
-#undef ckrm_shrink_list_empty
-#define ckrm_shrink_list_empty() (1)
-#endif
-
-/* possible outcome of pageout() */
-typedef enum {
- /* failed to write page out, page is locked */
- PAGE_KEEP,
- /* move page to the active list, page is locked */
- PAGE_ACTIVATE,
- /* page has been sent to the disk successfully, page is unlocked */
- PAGE_SUCCESS,
- /* page is clean and locked */
- PAGE_CLEAN,
-} pageout_t;
-
-struct scan_control {
- /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */
- unsigned long nr_to_scan;
-
- /* Incremented by the number of inactive pages that were scanned */
- unsigned long nr_scanned;
-
- /* Incremented by the number of pages reclaimed */
- unsigned long nr_reclaimed;
-
- unsigned long nr_mapped; /* From page_state */
-
- /* How many pages shrink_cache() should reclaim */
- int nr_to_reclaim;
-
- /* Ask shrink_caches, or shrink_zone to scan at this priority */
- unsigned int priority;
-
- /* This context's GFP mask */
- unsigned int gfp_mask;
-
- /* Flag used by CKRM */
- unsigned int ckrm_flags;
-
- int may_writepage;
-};
/*
- * The list of shrinker callbacks used by to apply pressure to
- * ageable caches.
+ * From 0 .. 100. Higher means more swappy.
*/
-struct shrinker {
- shrinker_t shrinker;
- struct list_head list;
- int seeks; /* seeks to recreate an obj */
- long nr; /* objs pending delete */
-};
+int vm_swappiness = 60;
+static long total_memory;
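+
+/*
+ * Illustrative sketch of how vm_swappiness is consumed (the formula lives
+ * in refill_inactive_zone(); the numbers below are made up).  With
+ * prev_priority == DEF_PRIORITY and ~70% of memory mapped:
+ *
+ *	distress      = 100 >> prev_priority                      =  0
+ *	mapped_ratio  = nr_mapped * 100 / total_memory            = 70
+ *	swap_tendency = mapped_ratio/2 + distress + vm_swappiness = 95
+ *
+ * swap_tendency stays below 100, so mapped pages are left alone this pass.
+ */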
if ((_page)->lru.prev != _base) { \
struct page *prev; \
\
- prev = lru_to_page(&(_page->lru)); \
+ prev = lru_to_page(&(_page->lru)); \
prefetchw(&prev->_field); \
} \
} while (0)
#endif
/*
- * From 0 .. 100. Higher means more swappy.
+ * The list of shrinker callbacks used to apply pressure to
+ * ageable caches.
*/
-int vm_swappiness = 60;
-static long total_memory;
+struct shrinker {
+ shrinker_t shrinker;
+ struct list_head list;
+ int seeks; /* seeks to recreate an obj */
+ long nr; /* objs pending delete */
+};
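+
+/*
+ * Example usage (a sketch: my_cache_* are hypothetical; set_shrinker(),
+ * remove_shrinker() and the shrinker_t signature are the real interface).
+ * A shrinker frees up to nr_to_scan objects and returns how many remain;
+ * shrink_slab() also calls it with nr_to_scan == 0 just to read the count.
+ *
+ *	static int my_cache_shrink(int nr_to_scan, unsigned int gfp_mask)
+ *	{
+ *		while (nr_to_scan-- > 0)
+ *			my_cache_free_one_object();
+ *		return my_cache_count_objects();
+ *	}
+ *
+ *	struct shrinker *s = set_shrinker(DEFAULT_SEEKS, my_cache_shrink);
+ *	...
+ *	remove_shrinker(s);
+ */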
static LIST_HEAD(shrinker_list);
static DECLARE_MUTEX(shrinker_sem);
}
return shrinker;
}
+
EXPORT_SYMBOL(set_shrinker);
/*
up(&shrinker_sem);
kfree(shrinker);
}
+
EXPORT_SYMBOL(remove_shrinker);
#define SHRINK_BATCH 128
* slab to avoid swapping.
*
* We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
- *
- * `lru_pages' represents the number of on-LRU pages in all the zones which
- * are eligible for the caller's allocation attempt. It is used for balancing
- * slab reclaim versus page reclaim.
*/
-static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
- unsigned long lru_pages)
+static int shrink_slab(unsigned long scanned, unsigned int gfp_mask)
{
struct shrinker *shrinker;
+ long pages;
if (down_trylock(&shrinker_sem))
return 0;
+ pages = nr_used_zone_pages();
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
delta = (4 * scanned) / shrinker->seeks;
delta *= (*shrinker->shrinker)(0, gfp_mask);
- do_div(delta, lru_pages + 1);
+ do_div(delta, pages + 1);
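+		/*
+		 * Worked example (illustrative numbers): with scanned = 128,
+		 * seeks = 2, 10000 objects in the cache and pages = 100000,
+		 * delta = (4*128/2) * 10000 / 100001 ~= 25, so about 25 slab
+		 * objects come under pressure per 128 LRU pages scanned.
+		 */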
shrinker->nr += delta;
if (shrinker->nr < 0)
shrinker->nr = LONG_MAX; /* It wrapped! */
unlock_page(page);
}
+/* possible outcome of pageout() */
+typedef enum {
+ /* failed to write page out, page is locked */
+ PAGE_KEEP,
+ /* move page to the active list, page is locked */
+ PAGE_ACTIVATE,
+ /* page has been sent to the disk successfully, page is unlocked */
+ PAGE_SUCCESS,
+ /* page is clean and locked */
+ PAGE_CLEAN,
+} pageout_t;
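+
+/*
+ * Sketch of how a caller dispatches on these values (shrink_list() does
+ * something along these lines; this is illustrative, not its exact code):
+ *
+ *	switch (pageout(page, mapping)) {
+ *	case PAGE_KEEP:		goto keep_locked;
+ *	case PAGE_ACTIVATE:	goto activate_locked;
+ *	case PAGE_SUCCESS:	... writeback was started, page unlocked ...
+ *	case PAGE_CLEAN:	... page is clean and locked, free it ...
+ *	}
+ */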
+
/*
* pageout is called by shrink_list() for each dirty page. Calls ->writepage().
*/
return PAGE_CLEAN;
}
+struct scan_control {
+ /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */
+ unsigned long nr_to_scan;
+
+ /* Incremented by the number of inactive pages that were scanned */
+ unsigned long nr_scanned;
+
+ /* Incremented by the number of pages reclaimed */
+ unsigned long nr_reclaimed;
+
+ unsigned long nr_mapped; /* From page_state */
+
+ /* Ask shrink_caches, or shrink_zone to scan at this priority */
+ unsigned int priority;
+
+ /* This context's GFP mask */
+ unsigned int gfp_mask;
+
+ int may_writepage;
+};
+
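+/*
+ * Illustrative setup, the way the direct reclaim path fills this in (see
+ * try_to_free_pages() below; the loop shape is real, trimmed for brevity):
+ *
+ *	struct scan_control sc;
+ *
+ *	sc.gfp_mask = gfp_mask;
+ *	sc.may_writepage = 0;
+ *	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+ *		sc.nr_mapped = read_page_state(nr_mapped);
+ *		sc.nr_scanned = 0;
+ *		sc.nr_reclaimed = 0;
+ *		sc.priority = priority;
+ *		shrink_caches(zones, &sc);
+ *		...
+ *	}
+ */
+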
/*
* shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
*/
int may_enter_fs;
int referenced;
- cond_resched();
-
page = lru_to_page(page_list);
list_del(&page->lru);
{
LIST_HEAD(page_list);
struct pagevec pvec;
- int max_scan = sc->nr_to_scan, nr_pass;
- unsigned int ckrm_flags = sc->ckrm_flags, bit_flag;
+ int max_scan = sc->nr_to_scan;
pagevec_init(&pvec, 1);
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
-redo:
- ckrm_get_reclaim_bits(&ckrm_flags, &bit_flag);
- nr_pass = zone->nr_inactive;
while (max_scan > 0) {
struct page *page;
int nr_taken = 0;
int nr_scan = 0;
int nr_freed;
- while (nr_pass-- && nr_scan++ < SWAP_CLUSTER_MAX &&
+ while (nr_scan++ < SWAP_CLUSTER_MAX &&
!list_empty(&zone->inactive_list)) {
page = lru_to_page(&zone->inactive_list);
SetPageLRU(page);
list_add(&page->lru, &zone->inactive_list);
continue;
- } else if (bit_flag && !ckrm_kick_page(page, bit_flag)) {
- __put_page(page);
- SetPageLRU(page);
-#ifdef CONFIG_CKRM_MEM_LRUORDER_CHANGE
- list_add_tail(&page->lru, &zone->inactive_list);
-#else
- list_add(&page->lru, &zone->inactive_list);
-#endif
- continue;
}
list_add(&page->lru, &page_list);
- ckrm_mem_dec_inactive(page);
nr_taken++;
}
zone->nr_inactive -= nr_taken;
zone->pages_scanned += nr_taken;
spin_unlock_irq(&zone->lru_lock);
- if ((bit_flag == 0) && (nr_taken == 0))
+ if (nr_taken == 0)
goto done;
max_scan -= nr_scan;
if (current_is_kswapd())
mod_page_state(kswapd_steal, nr_freed);
mod_page_state_zone(zone, pgsteal, nr_freed);
- sc->nr_to_reclaim -= nr_freed;
spin_lock_irq(&zone->lru_lock);
/*
spin_lock_irq(&zone->lru_lock);
}
}
- if (ckrm_flags && (nr_pass <= 0)) {
- goto redo;
- }
}
spin_unlock_irq(&zone->lru_lock);
done:
long mapped_ratio;
long distress;
long swap_tendency;
- unsigned int ckrm_flags = sc->ckrm_flags, bit_flag;
- int nr_pass;
lru_add_drain();
pgmoved = 0;
spin_lock_irq(&zone->lru_lock);
-redo:
- ckrm_get_reclaim_bits(&ckrm_flags, &bit_flag);
- nr_pass = zone->nr_active;
- while (pgscanned < nr_pages && !list_empty(&zone->active_list) &&
- nr_pass) {
+ while (pgscanned < nr_pages && !list_empty(&zone->active_list)) {
page = lru_to_page(&zone->active_list);
prefetchw_prev_lru_page(page, &zone->active_list, flags);
if (!TestClearPageLRU(page))
__put_page(page);
SetPageLRU(page);
list_add(&page->lru, &zone->active_list);
- pgscanned++;
- } else if (bit_flag && !ckrm_kick_page(page, bit_flag)) {
- __put_page(page);
- SetPageLRU(page);
-#ifdef CONFIG_CKRM_MEM_LRUORDER_CHANGE
- list_add_tail(&page->lru, &zone->active_list);
-#else
- list_add(&page->lru, &zone->active_list);
-#endif
} else {
list_add(&page->lru, &l_hold);
- ckrm_mem_dec_active(page);
pgmoved++;
- pgscanned++;
- }
- if (!--nr_pass && ckrm_flags) {
- goto redo;
}
+ pgscanned++;
}
zone->nr_active -= pgmoved;
spin_unlock_irq(&zone->lru_lock);
reclaim_mapped = 1;
while (!list_empty(&l_hold)) {
- cond_resched();
page = lru_to_page(&l_hold);
list_del(&page->lru);
if (page_mapped(page)) {
if (!TestClearPageActive(page))
BUG();
list_move(&page->lru, &zone->inactive_list);
- ckrm_mem_inc_inactive(page);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
zone->nr_inactive += pgmoved;
BUG();
BUG_ON(!PageActive(page));
list_move(&page->lru, &zone->active_list);
- ckrm_mem_inc_active(page);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
zone->nr_active += pgmoved;
}
/*
+ * Scan a chunk of this zone at sc->priority; the number of pages
+ * reclaimed is accumulated in sc->nr_reclaimed.
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
static void
shrink_zone(struct zone *zone, struct scan_control *sc)
{
- unsigned long nr_active;
- unsigned long nr_inactive;
+ unsigned long scan_active, scan_inactive;
+ int count;
+
+ scan_inactive = (zone->nr_active + zone->nr_inactive) >> sc->priority;
/*
- * Add one to `nr_to_scan' just to make sure that the kernel will
- * slowly sift through the active list.
+ * Try to keep the active list 2/3 of the size of the cache, and
+ * make sure that refill_inactive is given a decent number of pages.
+ *
+ * The "scan_active + 1" here is important.  With pagecache-intensive
+ * workloads the inactive list is huge, and `scan_active' evaluates to
+ * zero all the time, which pins the active list in memory.  So we add
+ * one to `scan_active' just to make sure that the kernel will slowly
+ * sift through the active list.
*/
- zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1;
- nr_active = zone->nr_scan_active;
- if (nr_active >= SWAP_CLUSTER_MAX)
- zone->nr_scan_active = 0;
- else
- nr_active = 0;
-
- zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1;
- nr_inactive = zone->nr_scan_inactive;
- if (nr_inactive >= SWAP_CLUSTER_MAX)
- zone->nr_scan_inactive = 0;
- else
- nr_inactive = 0;
-
- sc->nr_to_reclaim = SWAP_CLUSTER_MAX;
-
- while (nr_active || nr_inactive) {
- sc->ckrm_flags = ckrm_setup_reclamation();
- if (nr_active) {
- sc->nr_to_scan = min(nr_active,
- (unsigned long)SWAP_CLUSTER_MAX);
- nr_active -= sc->nr_to_scan;
- refill_inactive_zone(zone, sc);
- }
+ if (zone->nr_active >= 4*(zone->nr_inactive*2 + 1)) {
+ /* Don't scan more than 4 times the inactive list scan size */
+ scan_active = 4*scan_inactive;
+ } else {
+ unsigned long long tmp;
- if (nr_inactive) {
- sc->nr_to_scan = min(nr_inactive,
- (unsigned long)SWAP_CLUSTER_MAX);
- nr_inactive -= sc->nr_to_scan;
- shrink_cache(zone, sc);
- if (sc->nr_to_reclaim <= 0)
- break;
- }
- ckrm_teardown_reclamation();
- }
-}
-
-#if defined(CONFIG_CKRM_RES_MEM) && defined(AT_LIMIT_SUPPORT)
-// This function needs to be given more thought.
-// Shrink the class to be at 90% of its limit
-static void
-ckrm_shrink_class(ckrm_mem_res_t *cls)
-{
- struct scan_control sc;
- struct zone *zone;
- int zindex = 0, active_credit = 0, inactive_credit = 0;
+ /* Cast to long long so the multiply doesn't overflow */
- if (ckrm_test_set_shrink(cls)) { // set the SHRINK bit atomically
- // if it is already set somebody is working on it. so... leave
- return;
+ tmp = (unsigned long long)scan_inactive * zone->nr_active;
+ do_div(tmp, zone->nr_inactive*2 + 1);
+ scan_active = (unsigned long)tmp;
}
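+
+	/*
+	 * Worked example (illustrative numbers): nr_active = 30000,
+	 * nr_inactive = 10000 at priority 12 gives scan_inactive =
+	 * 40000 >> 12 = 9; 30000 < 4*(10000*2 + 1), so scan_active =
+	 * 9 * 30000 / 20001 = 13.
+	 */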
- sc.nr_mapped = read_page_state(nr_mapped);
- sc.nr_scanned = 0;
- sc.ckrm_flags = ckrm_get_reclaim_flags(cls);
- sc.nr_reclaimed = 0;
- sc.priority = 0; // always very high priority
-
- for_each_zone(zone) {
- int zone_total, zone_limit, active_limit, inactive_limit;
- int active_over, inactive_over;
- unsigned long nr_active, nr_inactive;
- u64 temp;
-
- zone->temp_priority = zone->prev_priority;
- zone->prev_priority = sc.priority;
-
- zone_total = zone->nr_active + zone->nr_inactive + zone->free_pages;
-
- temp = (u64) cls->pg_limit * zone_total;
- do_div(temp, ckrm_tot_lru_pages);
- zone_limit = (int) temp;
- active_limit = (6 * zone_limit) / 10; // 2/3rd in active list
- inactive_limit = (3 * zone_limit) / 10; // 1/3rd in inactive list
-
- active_over = cls->nr_active[zindex] - active_limit + active_credit;
- inactive_over = active_over +
- (cls->nr_inactive[zindex] - inactive_limit) + inactive_credit;
-
- if (active_over > 0) {
- zone->nr_scan_active += active_over + 1;
- nr_active = zone->nr_scan_active;
- active_credit = 0;
- } else {
- active_credit += active_over;
- nr_active = 0;
- }
- if (inactive_over > 0) {
- zone->nr_scan_inactive += inactive_over;
- nr_inactive = zone->nr_scan_inactive;
- inactive_credit = 0;
- } else {
- inactive_credit += inactive_over;
- nr_inactive = 0;
- }
- while (nr_active || nr_inactive) {
- if (nr_active) {
- sc.nr_to_scan = min(nr_active,
- (unsigned long)SWAP_CLUSTER_MAX);
- nr_active -= sc.nr_to_scan;
- refill_inactive_zone(zone, &sc);
- }
-
- if (nr_inactive) {
- sc.nr_to_scan = min(nr_inactive,
- (unsigned long)SWAP_CLUSTER_MAX);
- nr_inactive -= sc.nr_to_scan;
- shrink_cache(zone, &sc);
- if (sc.nr_to_reclaim <= 0)
- break;
- }
- }
- zone->prev_priority = zone->temp_priority;
- zindex++;
+ atomic_add(scan_active + 1, &zone->nr_scan_active);
+ count = atomic_read(&zone->nr_scan_active);
+ if (count >= SWAP_CLUSTER_MAX) {
+ atomic_set(&zone->nr_scan_active, 0);
+ sc->nr_to_scan = count;
+ refill_inactive_zone(zone, sc);
}
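+
+	/*
+	 * The atomic_add/atomic_read pairs above and below batch the scan
+	 * debt: it accumulates per-zone until at least SWAP_CLUSTER_MAX
+	 * pages are owed, then gets handed to the scanner in one chunk.
+	 */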
- ckrm_clear_shrink(cls);
-}
-static void
-ckrm_shrink_classes(void)
-{
- ckrm_mem_res_t *cls;
-
- spin_lock(&ckrm_mem_lock);
- while (!ckrm_shrink_list_empty()) {
- cls = list_entry(ckrm_shrink_list.next, ckrm_mem_res_t,
- shrink_list);
- spin_unlock(&ckrm_mem_lock);
- ckrm_shrink_class(cls);
- spin_lock(&ckrm_mem_lock);
- list_del(&cls->shrink_list);
- cls->flags &= ~MEM_AT_LIMIT;
+ atomic_add(scan_inactive, &zone->nr_scan_inactive);
+ count = atomic_read(&zone->nr_scan_inactive);
+ if (count >= SWAP_CLUSTER_MAX) {
+ atomic_set(&zone->nr_scan_inactive, 0);
+ sc->nr_to_scan = count;
+ shrink_cache(zone, sc);
}
- spin_unlock(&ckrm_mem_lock);
}
-#else
-
-#if defined(CONFIG_CKRM_RES_MEM) && !defined(AT_LIMIT_SUPPORT)
-#warning "disabling ckrm_at_limit -- setting ckrm_shrink_classes to noop "
-#endif
-
-#define ckrm_shrink_classes() do { } while(0)
-#endif
-
/*
* This is the direct reclaim path, for page-allocating processes. We only
* try to reclaim pages from zones which will satisfy the caller's allocation
int total_scanned = 0, total_reclaimed = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct scan_control sc;
- unsigned long lru_pages = 0;
int i;
sc.gfp_mask = gfp_mask;
inc_page_state(allocstall);
- for (i = 0; zones[i] != NULL; i++) {
- struct zone *zone = zones[i];
-
- zone->temp_priority = DEF_PRIORITY;
- lru_pages += zone->nr_active + zone->nr_inactive;
- }
+ for (i = 0; zones[i] != 0; i++)
+ zones[i]->temp_priority = DEF_PRIORITY;
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
sc.nr_mapped = read_page_state(nr_mapped);
sc.nr_reclaimed = 0;
sc.priority = priority;
shrink_caches(zones, &sc);
- shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
+ shrink_slab(sc.nr_scanned, gfp_mask);
if (reclaim_state) {
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
blk_congestion_wait(WRITE, HZ/10);
}
if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
- out_of_memory(gfp_mask);
+ out_of_memory();
out:
for (i = 0; zones[i] != 0; i++)
zones[i]->prev_priority = zones[i]->temp_priority;
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
int all_zones_ok = 1;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
- unsigned long lru_pages = 0;
+
if (nr_pages == 0) {
/*
end_zone = pgdat->nr_zones - 1;
}
scan:
- for (i = 0; i <= end_zone; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- lru_pages += zone->nr_active + zone->nr_inactive;
- }
-
/*
* Now scan the zone in the dma->highmem direction, stopping
* at the last zone which needs scanning.
sc.priority = priority;
shrink_zone(zone, &sc);
reclaim_state->reclaimed_slab = 0;
- shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages);
+ shrink_slab(sc.nr_scanned, GFP_KERNEL);
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
total_reclaimed += sc.nr_reclaimed;
if (zone->all_unreclaimable)
* If there are applications that are active memory-allocators
* (most normal use), this basically shouldn't matter.
*/
static int kswapd(void *p)
{
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
finish_wait(&pgdat->kswapd_wait, &wait);
try_to_clip_inodes();
- if (!ckrm_shrink_list_empty())
- ckrm_shrink_classes();
- else
balance_pgdat(pgdat, 0);
}
	return 0;
}
/*
*/
void wakeup_kswapd(struct zone *zone)
{
- if ((zone->free_pages > zone->pages_low) && ckrm_shrink_list_empty())
+ if (zone->free_pages > zone->pages_low)
return;
if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
return;