arch/i386/mm/pgtable-xen.c
/*
 *  linux/arch/i386/mm/pgtable.c
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
#include <asm/mmu_context.h>

#include <xen/features.h>
#include <xen/foreign_page.h>
#include <asm/hypervisor.h>

static void pgd_test_and_unpin(pgd_t *pgd);

void show_mem(void)
{
	int total = 0, reserved = 0;
	int shared = 0, cached = 0;
	int highmem = 0;
	struct page *page;
	pg_data_t *pgdat;
	unsigned long i;
	unsigned long flags;

	printk(KERN_INFO "Mem-info:\n");
	show_free_areas();
	printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
	for_each_online_pgdat(pgdat) {
		pgdat_resize_lock(pgdat, &flags);
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			page = pgdat_page_nr(pgdat, i);
			total++;
			if (PageHighMem(page))
				highmem++;
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
		}
		pgdat_resize_unlock(pgdat, &flags);
	}
	printk(KERN_INFO "%d pages of RAM\n", total);
	printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
	printk(KERN_INFO "%d reserved pages\n", reserved);
	printk(KERN_INFO "%d pages shared\n", shared);
	printk(KERN_INFO "%d pages swap cached\n", cached);

	printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
	printk(KERN_INFO "%lu pages writeback\n",
					global_page_state(NR_WRITEBACK));
	printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
	printk(KERN_INFO "%lu pages slab\n",
		global_page_state(NR_SLAB_RECLAIMABLE) +
		global_page_state(NR_SLAB_UNRECLAIMABLE));
	printk(KERN_INFO "%lu pages pagetables\n",
					global_page_state(NR_PAGETABLE));
}

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = swapper_pg_dir + pgd_index(vaddr);
	if (pgd_none(*pgd)) {
		BUG();
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		BUG();
		return;
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		BUG();
		return;
	}
	pte = pte_offset_kernel(pmd, vaddr);
	if (pgprot_val(flags))
		/* <pfn,flags> stored as-is, to permit clearing entries */
		set_pte(pte, pfn_pte(pfn, flags));
	else
		pte_clear(&init_mm, vaddr, pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

/*
 * Associate a virtual page frame with a given machine page frame
 * and protection flags for that frame.
 */
static void set_pte_pfn_ma(unsigned long vaddr, unsigned long pfn,
			   pgprot_t flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = swapper_pg_dir + pgd_index(vaddr);
	if (pgd_none(*pgd)) {
		BUG();
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		BUG();
		return;
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		BUG();
		return;
	}
	pte = pte_offset_kernel(pmd, vaddr);
	/* <pfn,flags> stored as-is, to permit clearing entries */
	set_pte(pte, pfn_pte_ma(pfn, flags));

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

/*
 * Associate a large virtual page frame with a given physical page frame
 * and protection flags for that frame. pfn is for the base of the page,
 * vaddr is what the page gets mapped to - both must be properly aligned.
 * The pmd must already be instantiated. Assumes PAE mode.
 */
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	if (vaddr & (PMD_SIZE-1)) {		/* vaddr is misaligned */
		printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
		return; /* BUG(); */
	}
	if (pfn & (PTRS_PER_PTE-1)) {		/* pfn is misaligned */
		printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
		return; /* BUG(); */
	}
	pgd = swapper_pg_dir + pgd_index(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
		return; /* BUG(); */
	}
	pud = pud_offset(pgd, vaddr);
	pmd = pmd_offset(pud, vaddr);
	set_pmd(pmd, pfn_pmd(pfn, flags));
	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}
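
/*
 * Illustrative sketch (not part of the original file): set_pmd_pfn()
 * requires a PMD-aligned vaddr and a pfn aligned to PTRS_PER_PTE. A
 * hypothetical caller installing one large kernel mapping might do the
 * following; PAGE_KERNEL_LARGE is assumed to be the usual i386
 * large-page protection, and "phys"/"vaddr" are placeholders:
 *
 *	unsigned long pfn = phys >> PAGE_SHIFT;
 *
 *	BUG_ON(vaddr & (PMD_SIZE - 1));
 *	BUG_ON(pfn & (PTRS_PER_PTE - 1));
 *	set_pmd_pfn(vaddr, pfn, PAGE_KERNEL_LARGE);
 */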

static int fixmaps;
#ifndef CONFIG_COMPAT_VDSO
unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - 2 * PAGE_SIZE);
EXPORT_SYMBOL(__FIXADDR_TOP);
#endif

void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
		BUG();
		return;
	}
	switch (idx) {
	case FIX_WP_TEST:
#ifdef CONFIG_X86_F00F_BUG
	case FIX_F00F_IDT:
#endif
		set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
		break;
	default:
		set_pte_pfn_ma(address, phys >> PAGE_SHIFT, flags);
		break;
	}
	fixmaps++;
}
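
/*
 * Usage sketch (not part of the original file): callers are expected
 * to go through the set_fixmap()/set_fixmap_nocache() wrappers in
 * <asm/fixmap.h>, which supply PAGE_KERNEL/PAGE_KERNEL_NOCACHE here,
 * e.g. roughly:
 *
 *	set_fixmap(FIX_APIC_BASE, mp_lapic_addr);
 *
 * Only FIX_WP_TEST (and FIX_F00F_IDT) take the pseudo-physical
 * set_pte_pfn() path above; every other fixmap slot is installed as a
 * machine address via set_pte_pfn_ma().
 */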

/**
 * reserve_top_address - reserves a hole in the top of kernel address space
 * @reserve: size of hole to reserve
 *
 * Can be used to relocate the fixmap area and poke a hole in the top
 * of kernel address space to make room for a hypervisor.
 */
void reserve_top_address(unsigned long reserve)
{
	BUG_ON(fixmaps > 0);
#ifdef CONFIG_COMPAT_VDSO
	BUG_ON(reserve != 0);
#else
	__FIXADDR_TOP = -reserve - PAGE_SIZE;
	__VMALLOC_RESERVE += reserve;
#endif
}
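
/*
 * Illustrative sketch (not part of the original file): a hypervisor
 * port that wanted, say, a 64MB hole at the top of the address space
 * would call this once, before any fixmap entries are created
 * (the size here is purely an example):
 *
 *	reserve_top_address(64 << 20);
 */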

void set_fixaddr_top(unsigned long top)
{
	BUG_ON(fixmaps > 0);
	__FIXADDR_TOP = top - PAGE_SIZE;
}

pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
	pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
	if (pte)
		make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
	return pte;
}

struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
	struct page *pte;

#ifdef CONFIG_HIGHPTE
	pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
#else
	pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
	if (pte) {
		SetPageForeign(pte, pte_free);
		init_page_count(pte);
	}
#endif
	return pte;
}

void pte_free(struct page *pte)
{
	unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);

	if (!pte_write(*virt_to_ptep(va)))
		BUG_ON(HYPERVISOR_update_va_mapping(
			va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0));

	ClearPageForeign(pte);
	init_page_count(pte);

	__free_page(pte);
}

void pmd_ctor(void *pmd, struct kmem_cache *cache, unsigned long flags)
{
	memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
}

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * The locking scheme was chosen on the basis of manfred's
 * recommendations and having no core impact whatsoever.
 * -- wli
 */
DEFINE_SPINLOCK(pgd_lock);
struct page *pgd_list;

static inline void pgd_list_add(pgd_t *pgd)
{
	struct page *page = virt_to_page(pgd);
	page->index = (unsigned long)pgd_list;
	if (pgd_list)
		set_page_private(pgd_list, (unsigned long)&page->index);
	pgd_list = page;
	set_page_private(page, (unsigned long)&pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
	struct page *next, **pprev, *page = virt_to_page(pgd);
	next = (struct page *)page->index;
	pprev = (struct page **)page_private(page);
	*pprev = next;
	if (next)
		set_page_private(next, (unsigned long)pprev);
}
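
/*
 * Sketch (not part of the original file): the list is singly linked
 * through page->index, and page_private() stores a pointer back into
 * the previous element's "next" slot so pgd_list_del() needs no
 * search. Walking it, as mm_pin_all() does further down, looks like:
 *
 *	struct page *page;
 *	for (page = pgd_list; page; page = (struct page *)page->index)
 *		do_something_with(page_address(page));
 *
 * where do_something_with() stands in for per-pgd work such as
 * __pgd_pin().
 */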

void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
{
	unsigned long flags;

	if (PTRS_PER_PMD > 1) {
		if (HAVE_SHARED_KERNEL_PMD)
			clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
					swapper_pg_dir + USER_PTRS_PER_PGD,
					KERNEL_PGD_PTRS);
	} else {
		spin_lock_irqsave(&pgd_lock, flags);
		clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
				swapper_pg_dir + USER_PTRS_PER_PGD,
				KERNEL_PGD_PTRS);
		memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
		pgd_list_add(pgd);
		spin_unlock_irqrestore(&pgd_lock, flags);
	}
}

/* never called when PTRS_PER_PMD > 1 */
void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused)
{
	unsigned long flags; /* can be called from interrupt context */

	spin_lock_irqsave(&pgd_lock, flags);
	pgd_list_del(pgd);
	spin_unlock_irqrestore(&pgd_lock, flags);

	pgd_test_and_unpin(pgd);
}

pgd_t *pgd_alloc(struct mm_struct *mm)
{
	int i;
	pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
	pmd_t **pmd;
	unsigned long flags;

	pgd_test_and_unpin(pgd);

	if (PTRS_PER_PMD == 1 || !pgd)
		return pgd;

	if (HAVE_SHARED_KERNEL_PMD) {
		for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
			pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
			if (!pmd)
				goto out_oom;
			set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
		}
		return pgd;
	}

	/*
	 * We can race save/restore (if we sleep during a GFP_KERNEL memory
	 * allocation). We therefore store virtual addresses of pmds as they
	 * do not change across save/restore, and poke the machine addresses
	 * into the pgdir under the pgd_lock.
	 */
	pmd = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
	if (!pmd) {
		kmem_cache_free(pgd_cache, pgd);
		return NULL;
	}

	/* Allocate pmds, remember virtual addresses. */
	for (i = 0; i < PTRS_PER_PGD; ++i) {
		pmd[i] = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
		if (!pmd[i])
			goto out_oom;
	}

	spin_lock_irqsave(&pgd_lock, flags);

	/* Protect against save/restore: move below 4GB under pgd_lock. */
	if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
		int rc = xen_create_contiguous_region(
			(unsigned long)pgd, 0, 32);
		if (rc) {
			spin_unlock_irqrestore(&pgd_lock, flags);
			goto out_oom;
		}
	}

	/* Copy kernel pmd contents and write-protect the new pmds. */
	for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
		unsigned long v = (unsigned long)i << PGDIR_SHIFT;
		pgd_t *kpgd = pgd_offset_k(v);
		pud_t *kpud = pud_offset(kpgd, v);
		pmd_t *kpmd = pmd_offset(kpud, v);
		memcpy(pmd[i], kpmd, PAGE_SIZE);
		make_lowmem_page_readonly(
			pmd[i], XENFEAT_writable_page_tables);
	}

	/* It is safe to poke machine addresses of pmds under the pgd_lock. */
	for (i = 0; i < PTRS_PER_PGD; i++)
		set_pgd(&pgd[i], __pgd(1 + __pa(pmd[i])));

	/* Ensure this pgd gets picked up and pinned on save/restore. */
	pgd_list_add(pgd);

	spin_unlock_irqrestore(&pgd_lock, flags);

	kfree(pmd);

	return pgd;

out_oom:
	if (HAVE_SHARED_KERNEL_PMD) {
		for (i--; i >= 0; i--)
			kmem_cache_free(pmd_cache,
					(void *)__va(pgd_val(pgd[i])-1));
	} else {
		for (i--; i >= 0; i--)
			kmem_cache_free(pmd_cache, pmd[i]);
		kfree(pmd);
	}
	kmem_cache_free(pgd_cache, pgd);
	return NULL;
}

void pgd_free(pgd_t *pgd)
{
	int i;

	/*
	 * After this the pgd should not be pinned for the duration of this
	 * function's execution. We should never sleep and thus never race:
	 *  1. User pmds will not become write-protected under our feet due
	 *     to a concurrent mm_pin_all().
	 *  2. The machine addresses in PGD entries will not become invalid
	 *     due to a concurrent save/restore.
	 */
	pgd_test_and_unpin(pgd);

	/* in the PAE case user pgd entries are overwritten before usage */
	if (PTRS_PER_PMD > 1) {
		for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
			pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
			kmem_cache_free(pmd_cache, pmd);
		}

		if (!HAVE_SHARED_KERNEL_PMD) {
			unsigned long flags;
			spin_lock_irqsave(&pgd_lock, flags);
			pgd_list_del(pgd);
			spin_unlock_irqrestore(&pgd_lock, flags);

			for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
				pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
				make_lowmem_page_writable(
					pmd, XENFEAT_writable_page_tables);
				memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
				kmem_cache_free(pmd_cache, pmd);
			}

			if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
				xen_destroy_contiguous_region(
					(unsigned long)pgd, 0);
		}
	}

	/* in the non-PAE case, free_pgtables() clears user pgd entries */
	kmem_cache_free(pgd_cache, pgd);
}

void make_lowmem_page_readonly(void *va, unsigned int feature)
{
	pte_t *pte;
	int rc;

	if (xen_feature(feature))
		return;

	pte = virt_to_ptep(va);
	rc = HYPERVISOR_update_va_mapping(
		(unsigned long)va, pte_wrprotect(*pte), 0);
	BUG_ON(rc);
}

void make_lowmem_page_writable(void *va, unsigned int feature)
{
	pte_t *pte;
	int rc;

	if (xen_feature(feature))
		return;

	pte = virt_to_ptep(va);
	rc = HYPERVISOR_update_va_mapping(
		(unsigned long)va, pte_mkwrite(*pte), 0);
	BUG_ON(rc);
}

void make_page_readonly(void *va, unsigned int feature)
{
	pte_t *pte;
	int rc;

	if (xen_feature(feature))
		return;

	pte = virt_to_ptep(va);
	rc = HYPERVISOR_update_va_mapping(
		(unsigned long)va, pte_wrprotect(*pte), 0);
	if (rc) /* fallback? */
		xen_l1_entry_update(pte, pte_wrprotect(*pte));
	if ((unsigned long)va >= (unsigned long)high_memory) {
		unsigned long pfn = pte_pfn(*pte);
#ifdef CONFIG_HIGHMEM
		if (pfn >= highstart_pfn)
			kmap_flush_unused(); /* flush stale writable kmaps */
		else
#endif
			make_lowmem_page_readonly(
				phys_to_virt(pfn << PAGE_SHIFT), feature);
	}
}

void make_page_writable(void *va, unsigned int feature)
{
	pte_t *pte;
	int rc;

	if (xen_feature(feature))
		return;

	pte = virt_to_ptep(va);
	rc = HYPERVISOR_update_va_mapping(
		(unsigned long)va, pte_mkwrite(*pte), 0);
	if (rc) /* fallback? */
		xen_l1_entry_update(pte, pte_mkwrite(*pte));
	if ((unsigned long)va >= (unsigned long)high_memory) {
		unsigned long pfn = pte_pfn(*pte);
#ifdef CONFIG_HIGHMEM
		if (pfn < highstart_pfn)
#endif
			make_lowmem_page_writable(
				phys_to_virt(pfn << PAGE_SHIFT), feature);
	}
}

void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
{
	if (xen_feature(feature))
		return;

	while (nr-- != 0) {
		make_page_readonly(va, feature);
		va = (void *)((unsigned long)va + PAGE_SIZE);
	}
}

void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
{
	if (xen_feature(feature))
		return;

	while (nr-- != 0) {
		make_page_writable(va, feature);
		va = (void *)((unsigned long)va + PAGE_SIZE);
	}
}
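
/*
 * Sketch (not part of the original file): these helpers are the
 * primitives used when page-table pages change hands with the
 * hypervisor. For example, pte_alloc_one_kernel() above does, in
 * effect:
 *
 *	pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
 *	if (pte)
 *		make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
 *
 * and pte_free() makes the page writable again (via
 * HYPERVISOR_update_va_mapping()) before returning it to the
 * allocator.
 */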

static inline void pgd_walk_set_prot(void *pt, pgprot_t flags)
{
	struct page *page = virt_to_page(pt);
	unsigned long pfn = page_to_pfn(page);

	if (PageHighMem(page))
		return;
	BUG_ON(HYPERVISOR_update_va_mapping(
		(unsigned long)__va(pfn << PAGE_SHIFT),
		pfn_pte(pfn, flags), 0));
}

static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
{
	pgd_t *pgd = pgd_base;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	int    g, u, m;

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return;

	for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
		if (pgd_none(*pgd))
			continue;
		pud = pud_offset(pgd, 0);
		if (PTRS_PER_PUD > 1) /* not folded */
			pgd_walk_set_prot(pud, flags);
		for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
			if (pud_none(*pud))
				continue;
			pmd = pmd_offset(pud, 0);
			if (PTRS_PER_PMD > 1) /* not folded */
				pgd_walk_set_prot(pmd, flags);
			for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
				if (pmd_none(*pmd))
					continue;
				pte = pte_offset_kernel(pmd, 0);
				pgd_walk_set_prot(pte, flags);
			}
		}
	}

	BUG_ON(HYPERVISOR_update_va_mapping(
		(unsigned long)pgd_base,
		pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
		UVMF_TLB_FLUSH));
}

static void __pgd_pin(pgd_t *pgd)
{
	pgd_walk(pgd, PAGE_KERNEL_RO);
	xen_pgd_pin(__pa(pgd));
	set_bit(PG_pinned, &virt_to_page(pgd)->flags);
}

static void __pgd_unpin(pgd_t *pgd)
{
	xen_pgd_unpin(__pa(pgd));
	pgd_walk(pgd, PAGE_KERNEL);
	clear_bit(PG_pinned, &virt_to_page(pgd)->flags);
}

static void pgd_test_and_unpin(pgd_t *pgd)
{
	if (test_bit(PG_pinned, &virt_to_page(pgd)->flags))
		__pgd_unpin(pgd);
}

void mm_pin(struct mm_struct *mm)
{
	if (xen_feature(XENFEAT_writable_page_tables))
		return;
	spin_lock(&mm->page_table_lock);
	__pgd_pin(mm->pgd);
	spin_unlock(&mm->page_table_lock);
}

void mm_unpin(struct mm_struct *mm)
{
	if (xen_feature(XENFEAT_writable_page_tables))
		return;
	spin_lock(&mm->page_table_lock);
	__pgd_unpin(mm->pgd);
	spin_unlock(&mm->page_table_lock);
}

void mm_pin_all(void)
{
	struct page *page;

	/* Only pgds on the pgd_list please: none hidden in the slab cache. */
	kmem_cache_shrink(pgd_cache);

	if (xen_feature(XENFEAT_writable_page_tables))
		return;

	for (page = pgd_list; page; page = (struct page *)page->index) {
		if (!test_bit(PG_pinned, &page->flags))
			__pgd_pin((pgd_t *)page_address(page));
	}
}

void _arch_dup_mmap(struct mm_struct *mm)
{
	if (!test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags))
		mm_pin(mm);
}

void _arch_exit_mmap(struct mm_struct *mm)
{
	struct task_struct *tsk = current;

	task_lock(tsk);

	/*
	 * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
	 * *much* faster this way, as no tlb flushes means bigger wrpt batches.
	 */
	if (tsk->active_mm == mm) {
		tsk->active_mm = &init_mm;
		atomic_inc(&init_mm.mm_count);

		switch_mm(mm, &init_mm, tsk);

		atomic_dec(&mm->mm_count);
		BUG_ON(atomic_read(&mm->mm_count) == 0);
	}

	task_unlock(tsk);

	if (test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags) &&
	    (atomic_read(&mm->mm_count) == 1) &&
	    !mm->context.has_foreign_mappings)
		mm_unpin(mm);
}