#include <linux/rmap.h>
#include <linux/ckrm.h>
#include <linux/vs_memory.h>
+#include <linux/ckrm_mem.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
activate_mm(active_mm, mm);
task_unlock(tsk);
arch_pick_mmap_layout(mm);
+#ifdef CONFIG_CKRM_RES_MEM
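+	/*
+	 * exec: move this task from the old mm's peer list to the new
+	 * mm's, re-evaluating each mm's memory class afterwards.
+	 */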
+ if (old_mm) {
+ spin_lock(&old_mm->peertask_lock);
+ list_del(&tsk->mm_peers);
+ ckrm_mem_evaluate_mm(old_mm);
+ spin_unlock(&old_mm->peertask_lock);
+ }
+ spin_lock(&mm->peertask_lock);
+ list_add_tail(&tsk->mm_peers, &mm->tasklist);
+ ckrm_mem_evaluate_mm(mm);
+ spin_unlock(&mm->peertask_lock);
+#endif
if (old_mm) {
if (active_mm != old_mm) BUG();
mmput(old_mm);
return -(b != NULL);
if (b == NULL)
return 0;
+ if (a->pg_guar == CKRM_SHARE_DONTCARE)
+ return 1;
+ if (b->pg_guar == CKRM_SHARE_DONTCARE)
+ return -1;
return (a->pg_unused - b->pg_unused);
}
static inline void
mem_class_put(ckrm_mem_res_t *cls)
{
+
if (cls && atomic_dec_and_test(&(cls->nr_users)) ) {
printk("freeing memclass %p of <core:%s>\n", cls, cls->core->name);
+ BUG_ON(ckrm_memclass_valid(cls));
//kfree(cls);
}
}
-static inline int
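+/*
+ * Charge one page to 'cls'.  A class with no guarantee, or one whose
+ * page total exceeds its unused guarantee, borrows the page from its
+ * parent class (recursing up the hierarchy); otherwise the page counts
+ * against the global ckrm_mem_real_count.  A class that reaches its
+ * limit is handed to ckrm_at_limit() unless MEM_AT_LIMIT is already set.
+ */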
+static inline void
incr_use_count(ckrm_mem_res_t *cls, int borrow)
{
- int over_limit;
-
atomic_inc(&cls->pg_total);
- over_limit = (atomic_read(&cls->pg_total) > ((9 * cls->pg_limit) / 10));
if (borrow)
cls->pg_lent++;
- if ((cls->pg_guar != CKRM_SHARE_DONTCARE) &&
+ if ((cls->pg_guar == CKRM_SHARE_DONTCARE) ||
(atomic_read(&cls->pg_total) > cls->pg_unused)) {
ckrm_mem_res_t *parcls = ckrm_get_res_class(cls->parent,
mem_rcbs.resid, ckrm_mem_res_t);
if (parcls) {
- over_limit |= incr_use_count(parcls, 1);
+ incr_use_count(parcls, 1);
cls->pg_borrowed++;
- return over_limit;
}
+ } else {
+ atomic_inc(&ckrm_mem_real_count);
}
- atomic_inc(&ckrm_mem_real_count);
- return over_limit;
+ if ((cls->pg_limit != CKRM_SHARE_DONTCARE) &&
+ (atomic_read(&cls->pg_total) >= cls->pg_limit) &&
+ ((cls->flags & MEM_AT_LIMIT) != MEM_AT_LIMIT)) {
+ ckrm_at_limit(cls);
+ }
+ return;
}
static inline void
}
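+/*
+ * Recharge a page from its current class to 'newcls'.  If the page is
+ * already accounted (PG_ckrm_account), the use counts and the per-zone
+ * active/inactive counters move with it.
+ */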
static inline void
-ckrm_change_page_class(struct page *page, ckrm_mem_res_t *cls)
+ckrm_change_page_class(struct page *page, ckrm_mem_res_t *newcls)
{
+ ckrm_mem_res_t *oldcls = page_class(page);
+
+ if (!newcls || oldcls == newcls)
+ return;
+
ckrm_clear_page_class(page);
- ckrm_set_page_class(page, cls);
+ ckrm_set_page_class(page, newcls);
+ if (test_bit(PG_ckrm_account, &page->flags)) {
+ decr_use_count(oldcls, 0);
+ incr_use_count(newcls, 0);
+ if (PageActive(page)) {
+ oldcls->nr_active[page_zonenum(page)]--;
+ newcls->nr_active[page_zonenum(page)]++;
+ } else {
+ oldcls->nr_inactive[page_zonenum(page)]--;
+ newcls->nr_inactive[page_zonenum(page)]++;
+ }
+ }
}
static inline void
static inline void
ckrm_mem_inc_active(struct page *page)
{
- ckrm_mem_res_t *cls = page_class(page);
- BUG_ON(cls == NULL);
- cls->nr_active[page_zonenum(page)]++;
- if (incr_use_count(cls, 0)) {
- ckrm_near_limit(cls);
+ ckrm_mem_res_t *cls = page_class(page), *curcls;
+ if (likely(cls != NULL)) {
+ BUG_ON(test_bit(PG_ckrm_account, &page->flags));
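+		/* recharge the page to the class of the task touching it */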
+ if (unlikely(cls != (curcls = GET_MEM_CLASS(current)))) {
+ cls = curcls;
+ ckrm_change_page_class(page, cls);
+ }
+ cls->nr_active[page_zonenum(page)]++;
+ incr_use_count(cls, 0);
+ set_bit(PG_ckrm_account, &page->flags);
}
}
ckrm_mem_dec_active(struct page *page)
{
ckrm_mem_res_t *cls = page_class(page);
- BUG_ON(cls == NULL);
- cls->nr_active[page_zonenum(page)]--;
- decr_use_count(cls, 0);
+ if (likely(cls != NULL)) {
+ BUG_ON(!test_bit(PG_ckrm_account, &page->flags));
+ cls->nr_active[page_zonenum(page)]--;
+ decr_use_count(cls, 0);
+ clear_bit(PG_ckrm_account, &page->flags);
+ }
}
static inline void
ckrm_mem_inc_inactive(struct page *page)
{
- ckrm_mem_res_t *cls = page_class(page);
- BUG_ON(cls == NULL);
- cls->nr_inactive[page_zonenum(page)]++;
- if (incr_use_count(cls, 0) &&
- ((cls->flags & MEM_NEAR_LIMIT) != MEM_NEAR_LIMIT)) {
- ckrm_near_limit(cls);
+ ckrm_mem_res_t *cls = page_class(page), *curcls;
+ if (likely(cls != NULL)) {
+ BUG_ON(test_bit(PG_ckrm_account, &page->flags));
+ if (unlikely(cls != (curcls = GET_MEM_CLASS(current)))) {
+ cls = curcls;
+ ckrm_change_page_class(page, cls);
+ }
+ cls->nr_inactive[page_zonenum(page)]++;
+ incr_use_count(cls, 0);
+ set_bit(PG_ckrm_account, &page->flags);
}
}
ckrm_mem_dec_inactive(struct page *page)
{
ckrm_mem_res_t *cls = page_class(page);
- BUG_ON(cls == NULL);
- cls->nr_inactive[page_zonenum(page)]--;
- decr_use_count(cls, 0);
+ if (likely(cls != NULL)) {
+ BUG_ON(!test_bit(PG_ckrm_account, &page->flags));
+ cls->nr_inactive[page_zonenum(page)]--;
+ decr_use_count(cls, 0);
+ clear_bit(PG_ckrm_account, &page->flags);
+ }
}
static inline int
if ((mem_rcbs.resid == -1) || !cls) {
return 1;
}
- return (atomic_read(&cls->pg_total) <= (11 * cls->pg_limit) / 10);
+ if (cls->pg_limit == CKRM_SHARE_DONTCARE) {
+ ckrm_mem_res_t *parcls = ckrm_get_res_class(cls->parent,
+ mem_rcbs.resid, ckrm_mem_res_t);
+		return (parcls ? ckrm_class_limit_ok(parcls) : 1);
+ } else {
+ return (atomic_read(&cls->pg_total) <= (11 * cls->pg_limit) / 10);
+ }
}
#else // !CONFIG_CKRM_RES_MEM
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
+#ifdef CONFIG_CKRM_RES_MEM
+ void *memclass;
+#endif // CONFIG_CKRM_RES_MEM
};
/*
+#include <linux/ckrm_mem_inline.h>
static inline void
add_page_to_active_list(struct zone *zone, struct page *page)
{
list_add(&page->lru, &zone->active_list);
zone->nr_active++;
+ ckrm_mem_inc_active(page);
}
static inline void
{
list_add(&page->lru, &zone->inactive_list);
zone->nr_inactive++;
+ ckrm_mem_inc_inactive(page);
}
static inline void
{
list_del(&page->lru);
zone->nr_active--;
+ ckrm_mem_dec_active(page);
}
static inline void
{
list_del(&page->lru);
zone->nr_inactive--;
+ ckrm_mem_dec_inactive(page);
}
static inline void
if (PageActive(page)) {
ClearPageActive(page);
zone->nr_active--;
+ ckrm_mem_dec_active(page);
} else {
zone->nr_inactive--;
+ ckrm_mem_dec_inactive(page);
}
}
#define PG_compound 19 /* Part of a compound page */
#define PG_anon 20 /* Anonymous: anon_vma in mapping */
+#define PG_ckrm_account 21 /* This page is accounted by CKRM */
/*
struct kioctx *ioctx_list;
struct kioctx default_kioctx;
+#ifdef CONFIG_CKRM_RES_MEM
+ struct ckrm_mem_res *memclass;
+ struct list_head tasklist; /* list of all tasks sharing this address space */
+ spinlock_t peertask_lock; /* protect above tasklist */
+#endif
};
extern int mmlist_nr;
struct ckrm_cpu_class *cpu_class;
#endif
#endif // CONFIG_CKRM_TYPE_TASKCLASS
+#ifdef CONFIG_CKRM_RES_MEM
+ struct list_head mm_peers; // list of tasks using same mm_struct
+#endif // CONFIG_CKRM_RES_MEM
#endif // CONFIG_CKRM
-
struct task_delay_info delays;
};
Say N if unsure, Y to use the feature.
+config CKRM_RES_MEM
+ bool "Class based physical memory controller"
+ default y
+ depends on CKRM
+ help
+	  Provide basic support for collecting physical memory usage information
+	  among classes. Say Y if you want to know the memory usage of each class.
+
+config CKRM_MEM_LRUORDER_CHANGE
+ bool "Change the LRU ordering of scanned pages"
+ default n
+ depends on CKRM_RES_MEM
+ help
+	  While trying to free pages, by default (n), scanned pages are left where
+	  they are found if they belong to a relatively under-used class. In this
+	  case the LRU ordering of the memory subsystem is left intact. If this
+	  option is chosen, the scanned pages are instead moved to the tail of the
+	  list (active or inactive). Saying Y reduces the checking overhead but
+	  violates the approximate LRU order that is maintained by the paging
+	  subsystem.
+
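For reference, a minimal sketch of the toggle this option compiles in
(mirroring the shrink_cache() hunk later in this patch; not a standalone
build, the identifiers are the patch's own):

#ifdef CONFIG_CKRM_MEM_LRUORDER_CHANGE
	/* option on: a skipped page goes to the tail of the list */
	list_add_tail(&page->lru, &zone->inactive_list);
#else
	/* option off (default): the skipped page is re-added the same way
	 * shrink_cache() re-adds other deferred pages, keeping the
	 * approximate LRU order intact */
	list_add(&page->lru, &zone->inactive_list);
#endif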
config CKRM_TYPE_SOCKETCLASS
bool "Class Manager for socket groups"
depends on CKRM
obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_laq.o
obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o
obj-$(CONFIG_CKRM_CPU_MONITOR) += ckrm_cpu_monitor.o
+ obj-$(CONFIG_CKRM_RES_MEM) += ckrm_mem.o
#include <linux/ckrm.h>
#include <linux/ckrm_tsk.h>
#include <linux/vs_limit.h>
+#include <linux/ckrm_mem.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
task_lock(tsk);
tsk->mm = NULL;
up_read(&mm->mmap_sem);
+#ifdef CONFIG_CKRM_RES_MEM
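+	/* exiting: unhook this task from the mm's peer list and let CKRM
+	 * recompute the mm's class from its remaining users */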
+ spin_lock(&mm->peertask_lock);
+ list_del_init(&tsk->mm_peers);
+ ckrm_mem_evaluate_mm(mm);
+ spin_unlock(&mm->peertask_lock);
+#endif
enter_lazy_tlb(mm, current);
task_unlock(tsk);
mmput(mm);
#include <linux/vs_memory.h>
#include <linux/ckrm.h>
#include <linux/ckrm_tsk.h>
+#include <linux/ckrm_mem_inline.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
ckrm_cb_newtask(tsk);
/* One for us, one for whoever does the "release_task()" (usually parent) */
atomic_set(&tsk->usage,2);
+#ifdef CONFIG_CKRM_RES_MEM
+ INIT_LIST_HEAD(&tsk->mm_peers);
+#endif
return tsk;
}
mm->ioctx_list = NULL;
mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
mm->free_area_cache = TASK_UNMAPPED_BASE;
+#ifdef CONFIG_CKRM_RES_MEM
+ INIT_LIST_HEAD(&mm->tasklist);
+	spin_lock_init(&mm->peertask_lock);
+#endif
if (likely(!mm_alloc_pgd(mm))) {
mm->def_flags = 0;
if (mm) {
memset(mm, 0, sizeof(*mm));
mm = mm_init(mm);
+#ifdef CONFIG_CKRM_RES_MEM
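+		/* a fresh mm is charged to the allocating task's class */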
+ mm->memclass = GET_MEM_CLASS(current);
+ mem_class_get(mm->memclass);
+#endif
}
return mm;
}
mm_free_pgd(mm);
destroy_context(mm);
clr_vx_info(&mm->mm_vx_info);
+#ifdef CONFIG_CKRM_RES_MEM
+ /* class can be null and mm's tasklist can be empty here */
+ if (mm->memclass) {
+ mem_class_put(mm->memclass);
+ mm->memclass = NULL;
+ }
+#endif
free_mm(mm);
}
good_mm:
tsk->mm = mm;
tsk->active_mm = mm;
+ ckrm_init_mm_to_task(mm, tsk);
return 0;
free_pt:
#include <linux/cpu.h>
#include <linux/vs_base.h>
#include <linux/vs_limit.h>
+#include <linux/ckrm_mem_inline.h>
#include <asm/tlbflush.h>
/* have to delete it as __free_pages_bulk list manipulates */
list_del(&page->lru);
__free_pages_bulk(page, base, zone, area, order);
+ ckrm_clear_page_class(page);
ret++;
}
spin_unlock_irqrestore(&zone->lock, flags);
might_sleep_if(wait);
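+	/* deny the allocation up front if the caller's memory class is
+	 * over its page limit */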
+	if (!ckrm_class_limit_ok(GET_MEM_CLASS(current))) {
+ return NULL;
+ }
+
zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
if (zones[0] == NULL) /* no zones in the zonelist */
return NULL;
return NULL;
got_pg:
kernel_map_pages(page, 1 << order, 1);
+ ckrm_set_pages_class(page, 1 << order, GET_MEM_CLASS(current));
return page;
}
#include <asm/div64.h>
#include <linux/swapops.h>
+#include <linux/ckrm_mem.h>
/* possible outcome of pageout() */
typedef enum {
/* This context's GFP mask */
unsigned int gfp_mask;
+ /* Flag used by CKRM */
+ unsigned int ckrm_flags;
+
int may_writepage;
};
{
LIST_HEAD(page_list);
struct pagevec pvec;
- int max_scan = sc->nr_to_scan;
+ int max_scan = sc->nr_to_scan, nr_pass;
+ unsigned int ckrm_flags = sc->ckrm_flags, bit_flag;
pagevec_init(&pvec, 1);
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
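+	/*
+	 * One sweep of the inactive list per set of CKRM reclaim bits:
+	 * nr_pass bounds the sweep, and a page that ckrm_kick_page()
+	 * declines is put back on the list instead of being isolated.
+	 */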
+redo:
+ ckrm_get_reclaim_bits(&ckrm_flags, &bit_flag);
+ nr_pass = zone->nr_inactive;
while (max_scan > 0) {
struct page *page;
int nr_taken = 0;
int nr_scan = 0;
int nr_freed;
- while (nr_scan++ < SWAP_CLUSTER_MAX &&
+ while (nr_pass-- && nr_scan++ < SWAP_CLUSTER_MAX &&
!list_empty(&zone->inactive_list)) {
page = lru_to_page(&zone->inactive_list);
SetPageLRU(page);
list_add(&page->lru, &zone->inactive_list);
continue;
+ } else if (bit_flag && !ckrm_kick_page(page, bit_flag)) {
+ __put_page(page);
+ SetPageLRU(page);
+#ifdef CONFIG_CKRM_MEM_LRUORDER_CHANGE
+ list_add_tail(&page->lru, &zone->inactive_list);
+#else
+ list_add(&page->lru, &zone->inactive_list);
+#endif
+ continue;
}
list_add(&page->lru, &page_list);
+ ckrm_mem_dec_inactive(page);
nr_taken++;
}
zone->nr_inactive -= nr_taken;
zone->pages_scanned += nr_taken;
spin_unlock_irq(&zone->lru_lock);
- if (nr_taken == 0)
+ if ((bit_flag == 0) && (nr_taken == 0))
goto done;
max_scan -= nr_scan;
spin_lock_irq(&zone->lru_lock);
}
}
+ if (ckrm_flags && (nr_pass <= 0)) {
+ goto redo;
+ }
}
spin_unlock_irq(&zone->lru_lock);
done:
long mapped_ratio;
long distress;
long swap_tendency;
+ unsigned int ckrm_flags = sc->ckrm_flags, bit_flag;
+ int nr_pass;
lru_add_drain();
pgmoved = 0;
spin_lock_irq(&zone->lru_lock);
- while (pgscanned < nr_pages && !list_empty(&zone->active_list)) {
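+	/* as in shrink_cache(): one sweep per set of CKRM reclaim bits */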
+redo:
+ ckrm_get_reclaim_bits(&ckrm_flags, &bit_flag);
+ nr_pass = zone->nr_active;
+ while (pgscanned < nr_pages && !list_empty(&zone->active_list) &&
+ nr_pass) {
page = lru_to_page(&zone->active_list);
prefetchw_prev_lru_page(page, &zone->active_list, flags);
if (!TestClearPageLRU(page))
__put_page(page);
SetPageLRU(page);
list_add(&page->lru, &zone->active_list);
+ pgscanned++;
+ } else if (bit_flag && !ckrm_kick_page(page, bit_flag)) {
+ __put_page(page);
+ SetPageLRU(page);
+#ifdef CONFIG_CKRM_MEM_LRUORDER_CHANGE
+ list_add_tail(&page->lru, &zone->active_list);
+#else
+ list_add(&page->lru, &zone->active_list);
+#endif
} else {
list_add(&page->lru, &l_hold);
+ ckrm_mem_dec_active(page);
pgmoved++;
- }
pgscanned++;
+ }
+ if (!--nr_pass && ckrm_flags) {
+ goto redo;
+ }
}
zone->nr_active -= pgmoved;
spin_unlock_irq(&zone->lru_lock);
if (!TestClearPageActive(page))
BUG();
list_move(&page->lru, &zone->inactive_list);
+ ckrm_mem_inc_inactive(page);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
zone->nr_inactive += pgmoved;
BUG();
BUG_ON(!PageActive(page));
list_move(&page->lru, &zone->active_list);
+ ckrm_mem_inc_active(page);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
zone->nr_active += pgmoved;
sc->nr_to_reclaim = SWAP_CLUSTER_MAX;
while (nr_active || nr_inactive) {
+ sc->ckrm_flags = ckrm_setup_reclamation();
if (nr_active) {
sc->nr_to_scan = min(nr_active,
(unsigned long)SWAP_CLUSTER_MAX);
if (sc->nr_to_reclaim <= 0)
break;
}
+ ckrm_teardown_reclamation();
+ }
+}
+
+#ifdef CONFIG_CKRM_RES_MEM
+// This function needs to be given more thought.
+// Shrink the class to be at 90% of its limit
+static void
+ckrm_shrink_class(ckrm_mem_res_t *cls)
+{
+ struct scan_control sc;
+ struct zone *zone;
+ int zindex = 0, active_credit = 0, inactive_credit = 0;
+
+ if (ckrm_test_set_shrink(cls)) { // set the SHRINK bit atomically
+		// if it was already set, someone else is shrinking this class
+ return;
+ }
+ sc.nr_mapped = read_page_state(nr_mapped);
+ sc.nr_scanned = 0;
+ sc.ckrm_flags = ckrm_get_reclaim_flags(cls);
+ sc.nr_reclaimed = 0;
+ sc.priority = 0; // always very high priority
+
+ for_each_zone(zone) {
+ int zone_total, zone_limit, active_limit, inactive_limit;
+ int active_over, inactive_over;
+ unsigned long nr_active, nr_inactive;
+ u64 temp;
+
+ zone->temp_priority = zone->prev_priority;
+ zone->prev_priority = sc.priority;
+
+ zone_total = zone->nr_active + zone->nr_inactive + zone->free_pages;
+
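+		/*
+		 * Scale the class limit by this zone's size relative to
+		 * ckrm_tot_lru_pages to get a per-zone limit for the class.
+		 */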
+ temp = (u64) cls->pg_limit * zone_total;
+ do_div(temp, ckrm_tot_lru_pages);
+ zone_limit = (int) temp;
+		active_limit = (6 * zone_limit) / 10; // 60% of limit for the active list
+		inactive_limit = (3 * zone_limit) / 10; // 30% of limit for the inactive list
+
+ active_over = cls->nr_active[zindex] - active_limit + active_credit;
+ inactive_over = active_over +
+ (cls->nr_inactive[zindex] - inactive_limit) + inactive_credit;
+
+ if (active_over > 0) {
+ zone->nr_scan_active += active_over + 1;
+ nr_active = zone->nr_scan_active;
+ active_credit = 0;
+ } else {
+ active_credit += active_over;
+ nr_active = 0;
+ }
+
+ if (inactive_over > 0) {
+ zone->nr_scan_inactive += inactive_over;
+ nr_inactive = zone->nr_scan_inactive;
+ inactive_credit = 0;
+ } else {
+ inactive_credit += inactive_over;
+ nr_inactive = 0;
+ }
+ while (nr_active || nr_inactive) {
+ if (nr_active) {
+ sc.nr_to_scan = min(nr_active,
+ (unsigned long)SWAP_CLUSTER_MAX);
+ nr_active -= sc.nr_to_scan;
+ refill_inactive_zone(zone, &sc);
+ }
+
+ if (nr_inactive) {
+ sc.nr_to_scan = min(nr_inactive,
+ (unsigned long)SWAP_CLUSTER_MAX);
+ nr_inactive -= sc.nr_to_scan;
+ shrink_cache(zone, &sc);
+ if (sc.nr_to_reclaim <= 0)
+ break;
+ }
+ }
+ zone->prev_priority = zone->temp_priority;
+ zindex++;
}
+ ckrm_clear_shrink(cls);
}
+static void
+ckrm_shrink_classes(void)
+{
+ ckrm_mem_res_t *cls;
+
+ spin_lock(&ckrm_mem_lock);
+ while (!ckrm_shrink_list_empty()) {
+ cls = list_entry(ckrm_shrink_list.next, ckrm_mem_res_t,
+ shrink_list);
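+		/* drop the lock while shrinking; the class is unlinked (and
+		 * MEM_AT_LIMIT cleared) only after its shrink completes */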
+ spin_unlock(&ckrm_mem_lock);
+ ckrm_shrink_class(cls);
+ spin_lock(&ckrm_mem_lock);
+ list_del(&cls->shrink_list);
+ cls->flags &= ~MEM_AT_LIMIT;
+ }
+ spin_unlock(&ckrm_mem_lock);
+}
+
+#else
+#define ckrm_shrink_classes() do { } while(0)
+#endif
+
/*
* This is the direct reclaim path, for page-allocating processes. We only
* try to reclaim pages from zones which will satisfy the caller's allocation
finish_wait(&pgdat->kswapd_wait, &wait);
try_to_clip_inodes();
+ if (!ckrm_shrink_list_empty())
+ ckrm_shrink_classes();
+ else
balance_pgdat(pgdat, 0);
}
return 0;
*/
void wakeup_kswapd(struct zone *zone)
{
- if (zone->free_pages > zone->pages_low)
+ if ((zone->free_pages > zone->pages_low) && ckrm_shrink_list_empty())
return;
if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
return;