/* ckrm_memctlr.c - Basic routines for the CKRM memory controller
 *
 * Copyright (C) Jiantao Kong, IBM Corp. 2003
 *           (C) Chandra Seetharaman, IBM Corp. 2004
 *
 * Provides a Memory Resource controller for CKRM
 *
 * Latest version, more details at http://ckrm.sf.net
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/ckrm_mem_inline.h>

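/*
 * Decide which class an anonymous page should be charged to: walk every
 * vma on the page's anon_vma list and pick the mm whose memory class has
 * the largest share.  Returns nonzero if the page was moved to a new class.
 */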
static int
ckrm_mem_evaluate_page_anon(struct page* page)
{
        struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls;
        struct ckrm_mem_res* maxshareclass = NULL;
        struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
        struct vm_area_struct *vma;
        struct mm_struct* mm;
        int ret = 0;

        if (!spin_trylock(&anon_vma->lock))
                return 0;
        BUG_ON(list_empty(&anon_vma->head));
        list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
                mm = vma->vm_mm;
                if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,
                                mm->memclass) < 0)
                        maxshareclass = mm->memclass;
        }
        spin_unlock(&anon_vma->lock);

        if (!maxshareclass)
                maxshareclass = ckrm_mem_root_class;
        if (pgcls != maxshareclass) {
                ckrm_change_page_class(page, maxshareclass);
                ret = 1;
        }
        return ret;
}

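/*
 * Same idea for pagecache pages: walk the vmas that map this file offset
 * through the address_space prio tree and pick the class with the largest
 * share.  Returns nonzero if the page was moved to a new class.
 */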
static int
ckrm_mem_evaluate_page_file(struct page* page)
{
        struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls;
        struct ckrm_mem_res* maxshareclass = NULL;
        struct address_space *mapping = page->mapping;
        struct vm_area_struct *vma = NULL;
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct prio_tree_iter iter;
        struct mm_struct* mm;
        int ret = 0;

        if (!mapping)
                return 0;
        if (!spin_trylock(&mapping->i_mmap_lock))
                return 0;
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                mm = vma->vm_mm;
                if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,
                                mm->memclass) < 0)
                        maxshareclass = mm->memclass;
        }
        spin_unlock(&mapping->i_mmap_lock);

        if (!maxshareclass)
                maxshareclass = ckrm_mem_root_class;
        if (pgcls != maxshareclass) {
                ckrm_change_page_class(page, maxshareclass);
                ret = 1;
        }
        return ret;
}

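/*
 * Re-evaluate the class a page is charged to, dispatching on whether the
 * page is anonymous or file backed.
 */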
static int
ckrm_mem_evaluate_page(struct page* page)
{
        int ret = 0;

        if (PageAnon(page))
                ret = ckrm_mem_evaluate_page_anon(page);
        else
                ret = ckrm_mem_evaluate_page_file(page);
        return ret;
}

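/*
 * Walk the active and inactive LRU lists of every ckrm_zone owned by the
 * class 'from', re-evaluate each page, and fall back to the class 'def'
 * where re-evaluation asks for it.
 */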
void
ckrm_mem_migrate_all_pages(struct ckrm_mem_res* from, struct ckrm_mem_res* def)
{
        int i;
        struct page *page;
        struct zone *zone;
        struct list_head *pos, *next;
        struct ckrm_zone *ckrm_zone;

        for (i = 0; i < MAX_NR_ZONES; i++) {
                ckrm_zone = &from->ckrm_zone[i];
                zone = ckrm_zone->zone;
                spin_lock_irq(&zone->lru_lock);
                pos = ckrm_zone->inactive_list.next;
                while (pos != &ckrm_zone->inactive_list) {
                        next = pos->next;
                        page = list_entry(pos, struct page, lru);
                        if (ckrm_mem_evaluate_page(page))
                                ckrm_change_page_class(page, def);
                        pos = next;
                }
                pos = ckrm_zone->active_list.next;
                while (pos != &ckrm_zone->active_list) {
                        next = pos->next;
                        page = list_entry(pos, struct page, lru);
                        if (ckrm_mem_evaluate_page(page))
                                ckrm_change_page_class(page, def);
                        pos = next;
                }
                spin_unlock_irq(&zone->lru_lock);
        }
        return;
}

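/*
 * Page table walkers: for every present pte in the given range, charge the
 * mapped page to the mm's current memory class.
 */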
static void
class_migrate_pmd(struct mm_struct* mm, struct vm_area_struct* vma,
                pmd_t* pmdir, unsigned long address, unsigned long end)
{
        pte_t *pte;
        unsigned long pmd_end;

        if (pmd_none(*pmdir))
                return;
        BUG_ON(pmd_bad(*pmdir));

        pmd_end = (address + PMD_SIZE) & PMD_MASK;
        if (pmd_end && (end > pmd_end))
                end = pmd_end;

        do {
                pte = pte_offset_map(pmdir, address);
                if (pte_present(*pte)) {
                        struct page *page = pte_page(*pte);
                        struct ckrm_zone *czone = page_ckrmzone(page);
                        if (page->mapping && czone) {
                                struct zone *zone = czone->zone;
                                spin_lock_irq(&zone->lru_lock);
                                ckrm_change_page_class(page, mm->memclass);
                                spin_unlock_irq(&zone->lru_lock);
                        }
                }
                address += PAGE_SIZE;
                pte_unmap(pte);
        } while (address && (address < end));
        return;
}

static void
class_migrate_pgd(struct mm_struct* mm, struct vm_area_struct* vma,
                pgd_t* pgdir, unsigned long address, unsigned long end)
{
        pmd_t *pmd;
        unsigned long pgd_end;

        if (pgd_none(*pgdir))
                return;
        BUG_ON(pgd_bad(*pgdir));

        pmd = pmd_offset(pgdir, address);
        pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;

        if (pgd_end && (end > pgd_end))
                end = pgd_end;

        do {
                class_migrate_pmd(mm, vma, pmd, address, end);
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address && (address < end));
        return;
}

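/*
 * Charge every page mapped by a vma to the owning mm's memory class by
 * walking the vma's page tables pgd by pgd.
 */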
static void
class_migrate_vma(struct mm_struct* mm, struct vm_area_struct* vma)
{
        pgd_t *pgdir;
        unsigned long address, end;

        address = vma->vm_start;
        end = vma->vm_end;

        pgdir = pgd_offset(vma->vm_mm, address);
        do {
                class_migrate_pgd(mm, vma, pgdir, address, end);
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                pgdir++;
        } while (address && (address < end));
        return;
}

/* this function is called with mm->peertask_lock held */
void
ckrm_mem_migrate_mm(struct mm_struct* mm, struct ckrm_mem_res *def)
{
        struct task_struct *task;
        struct vm_area_struct *vma;
        struct ckrm_mem_res *maxshareclass = def;

        if (list_empty(&mm->tasklist)) {
                /* We leave mm->memclass untouched since we believe that an
                 * mm with no task associated will be deleted soon or will
                 * be attached to another task later.
                 */
                return;
        }

        list_for_each_entry(task, &mm->tasklist, mm_peers) {
                struct ckrm_mem_res* cls = ckrm_get_mem_class(task);
                if (!cls)
                        continue;
                if (!maxshareclass ||
                                ckrm_mem_share_compare(maxshareclass, cls) < 0)
                        maxshareclass = cls;
        }

        if (maxshareclass && (mm->memclass != maxshareclass)) {
                if (mm->memclass)
                        kref_put(&mm->memclass->nr_users, memclass_release);
                mm->memclass = maxshareclass;
                kref_get(&maxshareclass->nr_users);

                /* Go through all VMAs to migrate pages */
                down_read(&mm->mmap_sem);
                vma = mm->mmap;
                while (vma) {
                        class_migrate_vma(mm, vma);
                        vma = vma->vm_next;
                }
                up_read(&mm->mmap_sem);
        }
        return;
}

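/*
 * Compute how many pages a class is using in this zone beyond its share.
 * The class guarantee is scaled to the zone, then compared against the
 * class's active/inactive page counts; active_over and inactive_over are
 * filled in for the shrink path below.
 */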
static int
shrink_weight(struct ckrm_zone *czone)
{
        u64 temp;
        struct zone *zone = czone->zone;
        struct ckrm_mem_res *cls = czone->memcls;
        int zone_usage, zone_guar, zone_total, guar, ret, cnt;

        zone_usage = czone->nr_active + czone->nr_inactive;
        czone->active_over = czone->inactive_over = 0;

        if (zone_usage < SWAP_CLUSTER_MAX * 4)
                return 0;

        if (cls->pg_guar == CKRM_SHARE_DONTCARE) {
                // no guarantee for this class. use implicit guarantee
                guar = cls->impl_guar / cls->nr_dontcare;
        } else {
                guar = cls->pg_unused / cls->nr_dontcare;
        }
        zone_total = zone->nr_active + zone->nr_inactive + zone->free_pages;
        temp = (u64) guar * zone_total;
        do_div(temp, ckrm_tot_lru_pages);
        zone_guar = (int) temp;

        ret = ((zone_usage - zone_guar) > SWAP_CLUSTER_MAX) ?
                (zone_usage - zone_guar) : 0;

        cnt = czone->nr_active - (2 * zone_guar / 3);
        if (cnt > 0)
                czone->active_over = cnt;
        cnt = czone->active_over + czone->nr_inactive
                                        - zone_guar / 3;
        if (cnt > 0)
                czone->inactive_over = cnt;

        return ret;
}

/* insert an entry into the list, keeping it sorted in descending order
 * of shrink_weight */
static void
list_add_sort(struct list_head *entry, struct list_head *head)
{
        struct ckrm_zone *czone, *new =
                list_entry(entry, struct ckrm_zone, victim_list);
        struct list_head* pos = head->next;

        while (pos != head) {
                czone = list_entry(pos, struct ckrm_zone, victim_list);
                if (new->shrink_weight > czone->shrink_weight) {
                        __list_add(entry, pos->prev, pos);
                        return;
                }
                pos = pos->next;
        }
        list_add_tail(entry, head);
}

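/*
 * Spread the requested number of active and inactive pages to shrink over
 * the candidate ckrm_zones, heaviest first, and drop candidates that end
 * up with nothing to shrink.
 */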
static void
shrink_choose_victims(struct list_head *victims,
                unsigned long nr_active, unsigned long nr_inactive)
{
        unsigned long nr;
        struct ckrm_zone* czone;
        struct list_head *pos, *next;

        pos = victims->next;
        while ((pos != victims) && (nr_active || nr_inactive)) {
                czone = list_entry(pos, struct ckrm_zone, victim_list);
                if (nr_active && czone->active_over) {
                        nr = min(nr_active, czone->active_over);
                        czone->shrink_active += nr;
                        czone->active_over -= nr;
                        nr_active -= nr;
                }
                if (nr_inactive && czone->inactive_over) {
                        nr = min(nr_inactive, czone->inactive_over);
                        czone->shrink_inactive += nr;
                        czone->inactive_over -= nr;
                        nr_inactive -= nr;
                }
                pos = pos->next;
        }

        pos = victims->next;
        while (pos != victims) {
                czone = list_entry(pos, struct ckrm_zone, victim_list);
                next = pos->next;
                if (czone->shrink_active == 0 && czone->shrink_inactive == 0) {
                        list_del_init(pos);
                        ckrm_clear_shrink(czone);
                }
                pos = next;
        }
        return;
}

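/*
 * Build the per-zone victim list for memory reclaim: weigh every class's
 * ckrm_zone for this zone, sort the over-guarantee ones by weight, and let
 * shrink_choose_victims() split the reclaim targets among them.
 */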
static void
shrink_get_victims(struct zone *zone, unsigned long nr_active,
                unsigned long nr_inactive, struct list_head *victims)
{
        struct list_head *pos;
        struct ckrm_mem_res *cls;
        struct ckrm_zone *czone;
        int zoneindex = zone_idx(zone);

        if (ckrm_nr_mem_classes <= 1) {
                if (ckrm_mem_root_class) {
                        czone = ckrm_mem_root_class->ckrm_zone + zoneindex;
                        if (!ckrm_test_set_shrink(czone)) {
                                list_add(&czone->victim_list, victims);
                                czone->shrink_active = nr_active;
                                czone->shrink_inactive = nr_inactive;
                        }
                }
                return;
        }
        spin_lock(&ckrm_mem_lock);
        list_for_each_entry(cls, &ckrm_memclass_list, mcls_list) {
                czone = cls->ckrm_zone + zoneindex;
                if (ckrm_test_set_shrink(czone))
                        continue;

                czone->shrink_active = 0;
                czone->shrink_inactive = 0;
                czone->shrink_weight = shrink_weight(czone);
                if (czone->shrink_weight) {
                        list_add_sort(&czone->victim_list, victims);
                } else {
                        ckrm_clear_shrink(czone);
                }
        }
        pos = victims->next;
        while (pos != victims) {
                czone = list_entry(pos, struct ckrm_zone, victim_list);
                pos = pos->next;
        }
        shrink_choose_victims(victims, nr_active, nr_inactive);
        spin_unlock(&ckrm_mem_lock);
        pos = victims->next;
        while (pos != victims) {
                czone = list_entry(pos, struct ckrm_zone, victim_list);
                pos = pos->next;
        }
        return;
}

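/*
 * When a class hits its page limit it is queued on ckrm_shrink_list (rate
 * limited by ckrm_mem_shrink_interval/ckrm_mem_shrink_count) and kswapd is
 * nudged to bring it back down toward ckrm_mem_shrink_to percent of the
 * limit.
 */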
LIST_HEAD(ckrm_shrink_list);

void
ckrm_shrink_atlimit(struct ckrm_mem_res *cls)
{
        struct zone *zone;
        unsigned long now = jiffies;
        int order;

        if (!cls || (cls->pg_limit == CKRM_SHARE_DONTCARE) ||
                        ((cls->flags & CLS_AT_LIMIT) == CLS_AT_LIMIT)) {
                return;
        }
        if ((cls->last_shrink > now) /* jiffies wrapped around */ ||
                        (cls->last_shrink + (ckrm_mem_shrink_interval * HZ)) < now) {
                cls->last_shrink = now;
                cls->shrink_count = 0;
        }
        cls->shrink_count++;
        if (cls->shrink_count > ckrm_mem_shrink_count) {
                return;
        }

        spin_lock(&ckrm_mem_lock);
        list_add(&cls->shrink_list, &ckrm_shrink_list);
        spin_unlock(&ckrm_mem_lock);
        cls->flags |= CLS_AT_LIMIT;
        for_each_zone(zone) {
                /* This is just a number to get kswapd woken up */
                order = atomic_read(&cls->pg_total) -
                        ((ckrm_mem_shrink_to * cls->pg_limit) / 100);
                wakeup_kswapd(zone, order);
                break; // only once is enough
        }
}