/* ckrm_memctlr.c - Basic routines for the CKRM memory controller
 *
 * Copyright (C) Jiantao Kong, IBM Corp. 2003
 *           (C) Chandra Seetharaman, IBM Corp. 2004
 *
 * Provides a Memory Resource controller for CKRM
 *
 * Latest version, more details at http://ckrm.sf.net
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/ckrm_mem_inline.h>

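/*
 * Re-evaluate the class of an anonymous page: walk the vmas on its
 * anon_vma chain and pick the owning mm whose memory class has the
 * largest share.  Returns 1 if the page was moved to a new class,
 * 0 otherwise (including the case where the anon_vma lock is
 * contended and we give up).
 */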
static int
ckrm_mem_evaluate_page_anon(struct page *page)
{
        struct ckrm_mem_res *pgcls = page_ckrmzone(page)->memcls;
        struct ckrm_mem_res *maxshareclass = NULL;
        /* page->mapping of an anonymous page encodes the anon_vma with
         * PAGE_MAPPING_ANON set in the low bit; strip it before use. */
        struct anon_vma *anon_vma = (struct anon_vma *)
                        ((unsigned long) page->mapping - PAGE_MAPPING_ANON);
        struct vm_area_struct *vma;
        struct mm_struct *mm;
        int ret = 0;

        if (!spin_trylock(&anon_vma->lock))
                return 0;
        BUG_ON(list_empty(&anon_vma->head));
        list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
                mm = vma->vm_mm;
                if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,
                                mm->memclass) < 0) {
                        maxshareclass = mm->memclass;
                }
        }
        spin_unlock(&anon_vma->lock);

        if (!maxshareclass) {
                maxshareclass = ckrm_mem_root_class;
        }
        if (pgcls != maxshareclass) {
                ckrm_change_page_class(page, maxshareclass);
                ret = 1;
        }
        return ret;
}

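/*
 * Re-evaluate the class of a file-backed page: walk the vmas in the
 * i_mmap prio tree of its address_space and pick the mapping mm whose
 * memory class has the largest share.  Returns 1 if the page was
 * moved to a new class, 0 otherwise (including the case where
 * i_mmap_lock is contended and we give up).
 */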
static int
ckrm_mem_evaluate_page_file(struct page *page)
{
        struct ckrm_mem_res *pgcls = page_ckrmzone(page)->memcls;
        struct ckrm_mem_res *maxshareclass = NULL;
        struct address_space *mapping = page->mapping;
        struct vm_area_struct *vma = NULL;
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct prio_tree_iter iter;
        struct mm_struct *mm;
        int ret = 0;

        if (!mapping)
                return 0;

        if (!spin_trylock(&mapping->i_mmap_lock))
                return 0;

        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap,
                                        pgoff, pgoff) {
                mm = vma->vm_mm;
                if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,
                                mm->memclass) < 0)
                        maxshareclass = mm->memclass;
        }
        spin_unlock(&mapping->i_mmap_lock);

        if (!maxshareclass) {
                maxshareclass = ckrm_mem_root_class;
        }
        if (pgcls != maxshareclass) {
                ckrm_change_page_class(page, maxshareclass);
                ret = 1;
        }
        return ret;
}

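/*
 * Re-evaluate the class of a mapped page, dispatching to the
 * anonymous or file-backed helper above.  Unmapped pages are left
 * alone.  Returns 1 if the page changed class.
 */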
static int
ckrm_mem_evaluate_page(struct page *page)
{
        int ret = 0;

        if (page->mapping) {
                if (PageAnon(page))
                        ret = ckrm_mem_evaluate_page_anon(page);
                else
                        ret = ckrm_mem_evaluate_page_file(page);
        }
        return ret;
}

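/*
 * Walk the per-zone active and inactive lists of class 'from' and
 * re-evaluate every page on them; pages that get reassigned are then
 * charged to the default class 'def'.  Used when the pages of 'from'
 * have to be handed over, e.g. when the class is being removed.
 */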
void
ckrm_mem_migrate_all_pages(struct ckrm_mem_res *from, struct ckrm_mem_res *def)
{
        int i;
        struct page *page;
        struct zone *zone;
        struct list_head *pos, *next;
        struct ckrm_zone *ckrm_zone;

        for (i = 0; i < MAX_NR_ZONES; i++) {
                ckrm_zone = &from->ckrm_zone[i];
                zone = ckrm_zone->zone;
                spin_lock_irq(&zone->lru_lock);
                pos = ckrm_zone->inactive_list.next;
                while (pos != &ckrm_zone->inactive_list) {
                        next = pos->next;
                        page = list_entry(pos, struct page, lru);
                        if (ckrm_mem_evaluate_page(page))
                                ckrm_change_page_class(page, def);
                        pos = next;
                }
                pos = ckrm_zone->active_list.next;
                while (pos != &ckrm_zone->active_list) {
                        next = pos->next;
                        page = list_entry(pos, struct page, lru);
                        if (ckrm_mem_evaluate_page(page))
                                ckrm_change_page_class(page, def);
                        pos = next;
                }
                spin_unlock_irq(&zone->lru_lock);
        }
}

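/*
 * Recharge every present page mapped by the ptes under one pmd entry
 * to mm->memclass, taking the zone's lru_lock around each class
 * change.  'end' is clamped to the next pmd boundary.
 */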
static inline int
class_migrate_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
                pmd_t *pmdir, unsigned long address, unsigned long end)
{
        pte_t *pte;
        unsigned long pmd_end;

        if (pmd_none(*pmdir))
                return 0;
        BUG_ON(pmd_bad(*pmdir));

        pmd_end = (address + PMD_SIZE) & PMD_MASK;
        if (end > pmd_end)
                end = pmd_end;

        do {
                pte = pte_offset_map(pmdir, address);
                if (pte_present(*pte)) {
                        struct page *page = pte_page(*pte);
                        struct ckrm_zone *czone = page_ckrmzone(page);

                        if (page->mapping && czone) {
                                struct zone *zone = czone->zone;

                                spin_lock_irq(&zone->lru_lock);
                                ckrm_change_page_class(page, mm->memclass);
                                spin_unlock_irq(&zone->lru_lock);
                        }
                }
                pte_unmap(pte);
                address += PAGE_SIZE;
        } while (address && (address < end));
        return 0;
}

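/*
 * Walk the pmd entries covered by one pgd entry (clamping 'end' to
 * the next pgd boundary) and recharge the pages mapped under each of
 * them.
 */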
static inline int
class_migrate_pgd(struct mm_struct *mm, struct vm_area_struct *vma,
                pgd_t *pgdir, unsigned long address, unsigned long end)
{
        pmd_t *pmd;
        unsigned long pgd_end;

        if (pgd_none(*pgdir))
                return 0;
        BUG_ON(pgd_bad(*pgdir));

        pmd = pmd_offset(pgdir, address);
        pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;

        if (pgd_end && (end > pgd_end))
                end = pgd_end;

        do {
                class_migrate_pmd(mm, vma, pmd, address, end);
                address = (address + PMD_SIZE) & PMD_MASK;
                pmd++;
        } while (address && (address < end));
        return 0;
}

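/*
 * Recharge all pages mapped by a single vma to the owning mm's memory
 * class, one pgd entry at a time.
 */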
static inline int
class_migrate_vma(struct mm_struct *mm, struct vm_area_struct *vma)
{
        pgd_t *pgdir;
        unsigned long address, end;

        address = vma->vm_start;
        end = vma->vm_end;

        pgdir = pgd_offset(vma->vm_mm, address);
        do {
                class_migrate_pgd(mm, vma, pgdir, address, end);
                address = (address + PGDIR_SIZE) & PGDIR_MASK;
                pgdir++;
        } while (address && (address < end));
        return 0;
}

/*
 * Recompute the memory class of an mm from the classes of the tasks
 * sharing it and, if the class changes, recharge every page the mm maps.
 * This function is called with mm->peertask_lock held.
 */
void
ckrm_mem_migrate_mm(struct mm_struct *mm, struct ckrm_mem_res *def)
{
        struct task_struct *task;
        struct vm_area_struct *vma;
        struct ckrm_mem_res *maxshareclass = def;

        if (list_empty(&mm->tasklist)) {
                /* Leave mm->memclass untouched: an mm with no associated
                 * task will either be freed soon or be attached to another
                 * task later.
                 */
                return;
        }

        list_for_each_entry(task, &mm->tasklist, mm_peers) {
                struct ckrm_mem_res *cls = ckrm_get_mem_class(task);

                if (!cls)
                        continue;
                if (!maxshareclass ||
                                ckrm_mem_share_compare(maxshareclass, cls) < 0)
                        maxshareclass = cls;
        }

        if (maxshareclass && (mm->memclass != maxshareclass)) {
                if (mm->memclass) {
                        kref_put(&mm->memclass->nr_users, memclass_release);
                }
                mm->memclass = maxshareclass;
                kref_get(&maxshareclass->nr_users);

                /* Go through all VMAs to migrate pages */
                down_read(&mm->mmap_sem);
                vma = mm->mmap;
                while (vma) {
                        class_migrate_vma(mm, vma);
                        vma = vma->vm_next;
                }
                up_read(&mm->mmap_sem);
        }
}

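/*
 * Weight a ckrm_zone for reclaim: how far is it over its per-zone
 * share?  The class's share (derived from its guarantee settings) is
 * scaled by this zone's fraction of all LRU pages; e.g. with
 * guar = 1000 pages, zone_total = 4000 and ckrm_tot_lru_pages = 10000,
 * zone_guar = 1000 * 4000 / 10000 = 400 pages.  Returns the number of
 * pages over that figure (0 if within SWAP_CLUSTER_MAX of it) and
 * records in active_over/inactive_over how much should come off each
 * LRU list, leaving roughly two thirds of the per-zone figure to the
 * active list and one third to the inactive list.
 */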
static int
shrink_weight(struct ckrm_zone *czone)
{
        u64 temp;
        struct zone *zone = czone->zone;
        struct ckrm_mem_res *cls = czone->memcls;
        int zone_usage, zone_guar, zone_total, guar, ret, cnt;

        zone_usage = czone->nr_active + czone->nr_inactive;
        czone->active_over = czone->inactive_over = 0;

        if (zone_usage < SWAP_CLUSTER_MAX * 4)
                return 0;

        if (cls->pg_guar == CKRM_SHARE_DONTCARE) {
                /* no explicit guarantee for this class; use the
                 * implicit guarantee */
                guar = cls->impl_guar / cls->nr_dontcare;
        } else {
                guar = cls->pg_unused / cls->nr_dontcare;
        }
        zone_total = zone->nr_active + zone->nr_inactive + zone->free_pages;
        temp = (u64) guar * zone_total;
        do_div(temp, ckrm_tot_lru_pages);
        zone_guar = (int) temp;

        ret = ((zone_usage - zone_guar) > SWAP_CLUSTER_MAX) ?
                                (zone_usage - zone_guar) : 0;
        if (ret) {
                cnt = czone->nr_active - (2 * zone_guar / 3);
                if (cnt > 0)
                        czone->active_over = cnt;
                cnt = czone->active_over + czone->nr_inactive
                                        - zone_guar / 3;
                if (cnt > 0)
                        czone->inactive_over = cnt;
        }
        return ret;
}

/* insert an entry into the list, keeping it sorted in descending
 * order of shrink_weight */
static void
list_add_sort(struct list_head *entry, struct list_head *head)
{
        struct ckrm_zone *czone, *new =
                        list_entry(entry, struct ckrm_zone, victim_list);
        struct list_head *pos = head->next;

        while (pos != head) {
                czone = list_entry(pos, struct ckrm_zone, victim_list);
                if (new->shrink_weight > czone->shrink_weight) {
                        __list_add(entry, pos->prev, pos);
                        return;
                }
                pos = pos->next;
        }
        list_add_tail(entry, head);
}

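/*
 * Hand out the requested active/inactive scan counts across the
 * weight-sorted victim list, heaviest ckrm_zones first, then drop any
 * ckrm_zone that ended up with nothing to shrink.
 */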
static void
shrink_choose_victims(struct list_head *victims,
                unsigned long nr_active, unsigned long nr_inactive)
{
        unsigned long nr;
        struct ckrm_zone *czone;
        struct list_head *pos, *next;

        pos = victims->next;
        while ((pos != victims) && (nr_active || nr_inactive)) {
                czone = list_entry(pos, struct ckrm_zone, victim_list);

                if (nr_active && czone->active_over) {
                        nr = min(nr_active, czone->active_over);
                        czone->shrink_active += nr;
                        czone->active_over -= nr;
                        nr_active -= nr;
                }

                if (nr_inactive && czone->inactive_over) {
                        nr = min(nr_inactive, czone->inactive_over);
                        czone->shrink_inactive += nr;
                        czone->inactive_over -= nr;
                        nr_inactive -= nr;
                }
                pos = pos->next;
        }

        pos = victims->next;
        while (pos != victims) {
                czone = list_entry(pos, struct ckrm_zone, victim_list);
                next = pos->next;
                if (czone->shrink_active == 0 && czone->shrink_inactive == 0) {
                        list_del_init(pos);
                        ckrm_clear_shrink(czone);
                }
                pos = next;
        }
}

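/*
 * Build the list of ckrm_zones the reclaim path should shrink for
 * this zone.  With at most one class everything is charged to the
 * root class; otherwise every class's ckrm_zone is weighted by how
 * far it is over its share, sorted by that weight, and the scan
 * counts are apportioned by shrink_choose_victims().
 */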
void
shrink_get_victims(struct zone *zone, unsigned long nr_active,
                unsigned long nr_inactive, struct list_head *victims)
{
        struct ckrm_mem_res *cls;
        struct ckrm_zone *czone;
        int zoneindex = zone_idx(zone);

        if (ckrm_nr_mem_classes <= 1) {
                if (ckrm_mem_root_class) {
                        czone = ckrm_mem_root_class->ckrm_zone + zoneindex;
                        if (!ckrm_test_set_shrink(czone)) {
                                list_add(&czone->victim_list, victims);
                                czone->shrink_active = nr_active;
                                czone->shrink_inactive = nr_inactive;
                        }
                }
                return;
        }
        spin_lock(&ckrm_mem_lock);
        list_for_each_entry(cls, &ckrm_memclass_list, mcls_list) {
                czone = cls->ckrm_zone + zoneindex;
                if (ckrm_test_set_shrink(czone))
                        continue;

                czone->shrink_active = 0;
                czone->shrink_inactive = 0;
                czone->shrink_weight = shrink_weight(czone);
                if (czone->shrink_weight) {
                        list_add_sort(&czone->victim_list, victims);
                } else {
                        ckrm_clear_shrink(czone);
                }
        }
        shrink_choose_victims(victims, nr_active, nr_inactive);
        spin_unlock(&ckrm_mem_lock);
}

LIST_HEAD(ckrm_shrink_list);

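/*
 * Called when a class reaches its page limit: rate-limit repeated
 * requests, queue the class on ckrm_shrink_list, mark it CLS_AT_LIMIT
 * and wake kswapd so the class gets pushed back toward its limit.
 */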
void
ckrm_shrink_atlimit(struct ckrm_mem_res *cls)
{
        struct zone *zone;
        unsigned long now = jiffies;
        int order;

        if (!cls || (cls->pg_limit == CKRM_SHARE_DONTCARE) ||
                        ((cls->flags & CLS_AT_LIMIT) == CLS_AT_LIMIT)) {
                return;
        }
        if ((cls->last_shrink > now) /* jiffies wrapped around */ ||
                   (cls->last_shrink + (ckrm_mem_shrink_interval * HZ)) < now) {
                cls->last_shrink = now;
                cls->shrink_count = 0;
        }
        cls->shrink_count++;
        if (cls->shrink_count > ckrm_mem_shrink_count) {
                return;
        }
        spin_lock(&ckrm_mem_lock);
        list_add(&cls->shrink_list, &ckrm_shrink_list);
        spin_unlock(&ckrm_mem_lock);
        cls->flags |= CLS_AT_LIMIT;
        for_each_zone(zone) {
                /* 'order' is only an indicative count of pages over the
                 * shrink-to target; all that really matters here is
                 * waking up kswapd. */
                order = atomic_read(&cls->pg_total) -
                        ((ckrm_mem_shrink_to * cls->pg_limit) / 100);
                wakeup_kswapd(zone);
                break;  /* once is enough */
        }
}