/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/vs_base.h>
#include <linux/vs_memory.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);
static void clear_huge_page(struct page *page, unsigned long addr)
{
        int i;

        might_sleep();
        for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
                cond_resched();
                clear_user_highpage(page + i, addr);
        }
}
static void copy_huge_page(struct page *dst, struct page *src,
                           unsigned long addr)
{
        int i;

        might_sleep();
        for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
                cond_resched();
                copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
        }
}
static void enqueue_huge_page(struct page *page)
{
        int nid = page_to_nid(page);

        list_add(&page->lru, &hugepage_freelists[nid]);
        free_huge_pages++;
        free_huge_pages_node[nid]++;
}
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
                                unsigned long address)
{
        int nid = numa_node_id();
        struct page *page = NULL;
        struct zonelist *zonelist = huge_zonelist(vma, address);
        struct zone **z;

        for (z = zonelist->zones; *z; z++) {
                nid = (*z)->zone_pgdat->node_id;
                if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
                    !list_empty(&hugepage_freelists[nid]))
                        break;
        }

        if (*z) {
                page = list_entry(hugepage_freelists[nid].next,
                                  struct page, lru);
                list_del(&page->lru);
                free_huge_pages--;
                free_huge_pages_node[nid]--;
        }
        return page;
}
static void free_huge_page(struct page *page)
{
        BUG_ON(page_count(page));

        INIT_LIST_HEAD(&page->lru);

        spin_lock(&hugetlb_lock);
        enqueue_huge_page(page);
        spin_unlock(&hugetlb_lock);
}
static int alloc_fresh_huge_page(void)
{
        static int nid = 0;
        struct page *page;

        page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
                                HUGETLB_PAGE_ORDER);
        nid = next_node(nid, node_online_map);
        if (nid == MAX_NUMNODES)
                nid = first_node(node_online_map);
        if (page) {
                page[1].lru.next = (void *)free_huge_page;      /* dtor */
                spin_lock(&hugetlb_lock);
                nr_huge_pages++;
                nr_huge_pages_node[page_to_nid(page)]++;
                spin_unlock(&hugetlb_lock);
                put_page(page); /* free it into the hugepage allocator */
                return 1;
        }
        return 0;
}
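
/*
 * Note on pool accounting: free_huge_pages counts every page sitting on a
 * freelist, while reserved_huge_pages counts how many of those are already
 * spoken for by hugetlb_extend_reservation() but not yet instantiated.  A
 * non-reserved allocation may therefore only dip into the pool while
 * free_huge_pages > reserved_huge_pages, which is exactly what
 * alloc_huge_page() below checks under hugetlb_lock.
 */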
static struct page *alloc_huge_page(struct vm_area_struct *vma,
                                    unsigned long addr)
{
        struct inode *inode = vma->vm_file->f_dentry->d_inode;
        struct page *page;
        int use_reserve = 0;
        unsigned long idx;

        spin_lock(&hugetlb_lock);

        if (vma->vm_flags & VM_MAYSHARE) {

                /* idx = radix tree index, i.e. offset into file in
                 * HPAGE_SIZE units */
                idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
                        + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

                /* The hugetlbfs specific inode info stores the number
                 * of "guaranteed available" (huge) pages.  That is,
                 * the first 'prereserved_hpages' pages of the inode
                 * are either already instantiated, or have been
                 * pre-reserved (by hugetlb_reserve_for_inode()).  Here
                 * we're in the process of instantiating the page, so
                 * we use this to determine whether to draw from the
                 * pre-reserved pool or the truly free pool. */
                if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
                        use_reserve = 1;
        }

        if (!use_reserve) {
                if (free_huge_pages <= reserved_huge_pages)
                        goto fail;
        } else {
                BUG_ON(reserved_huge_pages == 0);
                reserved_huge_pages--;
        }

        page = dequeue_huge_page(vma, addr);
        if (!page)
                goto fail;

        spin_unlock(&hugetlb_lock);
        set_page_refcounted(page);
        return page;

fail:
        WARN_ON(use_reserve);   /* reserved allocations shouldn't fail */
        spin_unlock(&hugetlb_lock);
        return NULL;
}
/* hugetlb_extend_reservation()
 *
 * Ensure that at least 'atleast' hugepages are, and will remain,
 * available to instantiate the first 'atleast' pages of the given
 * inode.  If the inode doesn't already have this many pages reserved
 * or instantiated, set aside some hugepages in the reserved pool to
 * satisfy later faults (or fail now if there aren't enough, rather
 * than getting the SIGBUS later).
 */
int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
                               unsigned long atleast)
{
        struct inode *inode = &info->vfs_inode;
        unsigned long change_in_reserve = 0;
        int ret = 0;

        spin_lock(&hugetlb_lock);
        read_lock_irq(&inode->i_mapping->tree_lock);

        if (info->prereserved_hpages >= atleast)
                goto out;

        /* Because we always call this on shared mappings, none of the
         * pages beyond info->prereserved_hpages can have been
         * instantiated, so we need to reserve all of them now. */
        change_in_reserve = atleast - info->prereserved_hpages;

        if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
                ret = -ENOMEM;
                goto out;
        }

        reserved_huge_pages += change_in_reserve;
        info->prereserved_hpages = atleast;

out:
        read_unlock_irq(&inode->i_mapping->tree_lock);
        spin_unlock(&hugetlb_lock);

        return ret;
}
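
/*
 * How the two reservation calls pair up (a sketch, not the exact hugetlbfs
 * code): a shared mapping of length 'len' would extend the reservation at
 * mmap() time with something like
 *
 *      hugetlb_extend_reservation(HUGETLBFS_I(inode), len >> HPAGE_SHIFT);
 *
 * so that later faults cannot hit SIGBUS, and truncation of the file would
 * call hugetlb_truncate_reservation() with the new size in huge pages to
 * hand any still-unused reserved pages back to the free pool.
 */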
/* hugetlb_truncate_reservation()
 *
 * This returns pages reserved for the given inode to the general free
 * hugepage pool.  If the inode has any pages prereserved, but not
 * instantiated, beyond offset (atmost << HPAGE_SHIFT), then release
 * them.
 */
void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
                                  unsigned long atmost)
{
        struct inode *inode = &info->vfs_inode;
        struct address_space *mapping = inode->i_mapping;
        unsigned long idx;
        unsigned long change_in_reserve = 0;
        struct page *page;

        spin_lock(&hugetlb_lock);
        read_lock_irq(&inode->i_mapping->tree_lock);

        if (info->prereserved_hpages <= atmost)
                goto out;

        /* Count pages which were reserved, but not instantiated, and
         * which we can now release. */
        for (idx = atmost; idx < info->prereserved_hpages; idx++) {
                page = radix_tree_lookup(&mapping->page_tree, idx);
                if (!page)
                        /* Pages which are already instantiated can't
                         * be unreserved (and in fact have already
                         * been removed from the reserved pool) */
                        change_in_reserve++;
        }

        BUG_ON(reserved_huge_pages < change_in_reserve);
        reserved_huge_pages -= change_in_reserve;
        info->prereserved_hpages = atmost;

out:
        read_unlock_irq(&inode->i_mapping->tree_lock);
        spin_unlock(&hugetlb_lock);
}
static int __init hugetlb_init(void)
{
        unsigned long i;

        if (HPAGE_SHIFT == 0)
                return 0;

        for (i = 0; i < MAX_NUMNODES; ++i)
                INIT_LIST_HEAD(&hugepage_freelists[i]);

        for (i = 0; i < max_huge_pages; ++i) {
                if (!alloc_fresh_huge_page())
                        break;
        }
        max_huge_pages = free_huge_pages = nr_huge_pages = i;
        printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
        return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
        if (sscanf(s, "%lu", &max_huge_pages) <= 0)
                max_huge_pages = 0;
        return 1;
}
__setup("hugepages=", hugetlb_setup);
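
/*
 * Example: booting with "hugepages=64" on the kernel command line asks
 * hugetlb_init() to pre-allocate 64 huge pages at boot; the count can be
 * adjusted later through the nr_hugepages sysctl handled below.
 */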
#ifdef CONFIG_SYSCTL
static void update_and_free_page(struct page *page)
{
        int i;

        nr_huge_pages--;
        nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
        for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
                page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
                                1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
                                1 << PG_private | 1 << PG_writeback);
        }
        page[1].lru.next = NULL;
        set_page_refcounted(page);
        __free_pages(page, HUGETLB_PAGE_ORDER);
}
#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
        int i, nid;

        for (i = 0; i < MAX_NUMNODES; ++i) {
                struct page *page, *next;
                list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
                        if (PageHighMem(page))
                                continue;
                        list_del(&page->lru);
                        update_and_free_page(page);
                        nid = page_zone(page)->zone_pgdat->node_id;
                        free_huge_pages--;
                        free_huge_pages_node[nid]--;
                        if (count >= nr_huge_pages)
                                return;
                }
        }
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif
static unsigned long set_max_huge_pages(unsigned long count)
{
        while (count > nr_huge_pages) {
                if (!alloc_fresh_huge_page())
                        return nr_huge_pages;
        }
        if (count >= nr_huge_pages)
                return nr_huge_pages;

        spin_lock(&hugetlb_lock);
        count = max(count, reserved_huge_pages);
        try_to_free_low(count);
        while (count < nr_huge_pages) {
                struct page *page = dequeue_huge_page(NULL, 0);
                if (!page)
                        break;
                update_and_free_page(page);
        }
        spin_unlock(&hugetlb_lock);
        return nr_huge_pages;
}
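
/*
 * Writes to the nr_hugepages sysctl (/proc/sys/vm/nr_hugepages) land here:
 * the handler parses the new value into max_huge_pages and then grows or
 * shrinks the pool via set_max_huge_pages(), which never shrinks it below
 * the currently reserved page count.
 */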
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
                           struct file *file, void __user *buffer,
                           size_t *length, loff_t *ppos)
{
        proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
        max_huge_pages = set_max_huge_pages(max_huge_pages);
        return 0;
}
#endif /* CONFIG_SYSCTL */
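
/*
 * The HugePages_* and Hugepagesize lines below are what shows up in
 * /proc/meminfo; the per-node variant feeds the node meminfo files
 * (e.g. /sys/devices/system/node/nodeN/meminfo).
 */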
int hugetlb_report_meminfo(char *buf)
{
        return sprintf(buf,
                        "HugePages_Total: %5lu\n"
                        "HugePages_Free:  %5lu\n"
                        "HugePages_Rsvd:  %5lu\n"
                        "Hugepagesize:    %5lu kB\n",
                        nr_huge_pages,
                        free_huge_pages,
                        reserved_huge_pages,
                        HPAGE_SIZE/1024);
}
int hugetlb_report_node_meminfo(int nid, char *buf)
{
        return sprintf(buf,
                "Node %d HugePages_Total: %5u\n"
                "Node %d HugePages_Free:  %5u\n",
                nid, nr_huge_pages_node[nid],
                nid, free_huge_pages_node[nid]);
}
/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
        return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}
/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
                                unsigned long address, int *unused)
{
        BUG();
        return NULL;
}

struct vm_operations_struct hugetlb_vm_ops = {
        .nopage = hugetlb_nopage,
};
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
                                int writable)
{
        pte_t entry;

        if (writable) {
                entry =
                    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
        } else {
                entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
        }
        entry = pte_mkyoung(entry);
        entry = pte_mkhuge(entry);

        return entry;
}
static void set_huge_ptep_writable(struct vm_area_struct *vma,
                                   unsigned long address, pte_t *ptep)
{
        pte_t entry;

        entry = pte_mkwrite(pte_mkdirty(*ptep));
        ptep_set_access_flags(vma, address, ptep, entry, 1);
        update_mmu_cache(vma, address, entry);
        lazy_mmu_prot_update(entry);
}
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                            struct vm_area_struct *vma)
{
        pte_t *src_pte, *dst_pte, entry;
        struct page *ptepage;
        unsigned long addr;
        int cow;

        cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

        for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
                src_pte = huge_pte_offset(src, addr);
                if (!src_pte)
                        continue;
                dst_pte = huge_pte_alloc(dst, addr);
                if (!dst_pte)
                        goto nomem;
                spin_lock(&dst->page_table_lock);
                spin_lock(&src->page_table_lock);
                if (!pte_none(*src_pte)) {
                        if (cow)
                                ptep_set_wrprotect(src, addr, src_pte);
                        entry = *src_pte;
                        ptepage = pte_page(entry);
                        get_page(ptepage);
                        add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
                        set_huge_pte_at(dst, addr, dst_pte, entry);
                }
                spin_unlock(&src->page_table_lock);
                spin_unlock(&dst->page_table_lock);
        }
        return 0;

nomem:
        return -ENOMEM;
}
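
/*
 * Tear down all huge-page mappings in [start, end), typically invoked from
 * the generic unmap/truncate paths.  Only mm->page_table_lock is taken
 * here, and the TLB is flushed once for the whole range at the end.
 */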
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
                          unsigned long end)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
        pte_t *ptep;
        pte_t pte;
        struct page *page;

        WARN_ON(!is_vm_hugetlb_page(vma));
        BUG_ON(start & ~HPAGE_MASK);
        BUG_ON(end & ~HPAGE_MASK);

        spin_lock(&mm->page_table_lock);

        /* Update high watermark before we lower rss */
        update_hiwater_rss(mm);

        for (address = start; address < end; address += HPAGE_SIZE) {
                ptep = huge_pte_offset(mm, address);
                if (!ptep)
                        continue;

                pte = huge_ptep_get_and_clear(mm, address, ptep);
                if (pte_none(pte))
                        continue;

                page = pte_page(pte);
                put_page(page);
                add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
        }

        spin_unlock(&mm->page_table_lock);
        flush_tlb_range(vma, start, end);
}
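
/*
 * Handle a copy-on-write fault on a private hugetlb mapping: if the old
 * page is mapped only here we simply make the existing PTE writable,
 * otherwise we allocate a fresh huge page, copy into it and swap it in.
 */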
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pte_t *ptep, pte_t pte)
{
        struct page *old_page, *new_page;
        int avoidcopy;

        old_page = pte_page(pte);

        /* If no-one else is actually using this page, avoid the copy
         * and just make the page writable */
        avoidcopy = (page_count(old_page) == 1);
        if (avoidcopy) {
                set_huge_ptep_writable(vma, address, ptep);
                return VM_FAULT_MINOR;
        }

        page_cache_get(old_page);
        new_page = alloc_huge_page(vma, address);

        if (!new_page) {
                page_cache_release(old_page);
                return VM_FAULT_OOM;
        }

        spin_unlock(&mm->page_table_lock);
        copy_huge_page(new_page, old_page, address);
        spin_lock(&mm->page_table_lock);

        ptep = huge_pte_offset(mm, address & HPAGE_MASK);
        if (likely(pte_same(*ptep, pte))) {
                /* Break COW */
                set_huge_pte_at(mm, address, ptep,
                                make_huge_pte(vma, new_page, 1));
                /* Make the old page be freed below */
                new_page = old_page;
        }
        page_cache_release(new_page);
        page_cache_release(old_page);
        return VM_FAULT_MINOR;
}
int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pte_t *ptep, int write_access)
{
        int ret = VM_FAULT_SIGBUS;
        unsigned long idx;
        unsigned long size;
        struct page *page;
        struct address_space *mapping;
        pte_t new_pte;

        mapping = vma->vm_file->f_mapping;
        idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
                + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

        /*
         * Use page lock to guard against racing truncation
         * before we get page_table_lock.
         */
retry:
        page = find_lock_page(mapping, idx);
        if (!page) {
                if (hugetlb_get_quota(mapping))
                        goto out;
                page = alloc_huge_page(vma, address);
                if (!page) {
                        hugetlb_put_quota(mapping);
                        goto out;
                }
                clear_huge_page(page, address);

                if (vma->vm_flags & VM_SHARED) {
                        int err;

                        err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
                        if (err) {
                                put_page(page);
                                hugetlb_put_quota(mapping);
                                if (err == -EEXIST)
                                        goto retry;
                                goto out;
                        }
                } else
                        lock_page(page);
        }

        spin_lock(&mm->page_table_lock);
        size = i_size_read(mapping->host) >> HPAGE_SHIFT;
        if (idx >= size)
                goto backout;

        ret = VM_FAULT_MINOR;
        if (!pte_none(*ptep))
                goto backout;

        add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
        new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
                                && (vma->vm_flags & VM_SHARED)));
        set_huge_pte_at(mm, address, ptep, new_pte);

        if (write_access && !(vma->vm_flags & VM_SHARED)) {
                /* Optimization, do the COW without a second fault */
                ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
        }

        spin_unlock(&mm->page_table_lock);
        unlock_page(page);
out:
        return ret;

backout:
        spin_unlock(&mm->page_table_lock);
        hugetlb_put_quota(mapping);
        unlock_page(page);
        put_page(page);
        goto out;
}
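
/*
 * Main fault entry point for hugetlb VMAs; the generic/arch fault path is
 * expected to route huge-page faults here rather than into the normal
 * handle_mm_fault() machinery (see the hugetlb_nopage() comment above).
 */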
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, int write_access)
{
        pte_t *ptep;
        pte_t entry;
        int ret;
        static DEFINE_MUTEX(hugetlb_instantiation_mutex);

        ptep = huge_pte_alloc(mm, address);
        if (!ptep)
                return VM_FAULT_OOM;

        /*
         * Serialize hugepage allocation and instantiation, so that we don't
         * get spurious allocation failures if two CPUs race to instantiate
         * the same page in the page cache.
         */
        mutex_lock(&hugetlb_instantiation_mutex);
        entry = *ptep;
        if (pte_none(entry)) {
                ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
                mutex_unlock(&hugetlb_instantiation_mutex);
                return ret;
        }

        ret = VM_FAULT_MINOR;

        spin_lock(&mm->page_table_lock);
        /* Check for a racing update before calling hugetlb_cow */
        if (likely(pte_same(entry, *ptep)))
                if (write_access && !pte_write(entry))
                        ret = hugetlb_cow(mm, vma, address, ptep, entry);
        spin_unlock(&mm->page_table_lock);
        mutex_unlock(&hugetlb_instantiation_mutex);

        return ret;
}
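
/*
 * Back end for get_user_pages() on hugetlb VMAs: walk the huge PTEs,
 * faulting pages in as needed, and fill in the pages[]/vmas[] arrays one
 * PAGE_SIZE-sized subpage at a time.
 */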
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        struct page **pages, struct vm_area_struct **vmas,
                        unsigned long *position, int *length, int i)
{
        unsigned long pfn_offset;
        unsigned long vaddr = *position;
        int remainder = *length;

        spin_lock(&mm->page_table_lock);
        while (vaddr < vma->vm_end && remainder) {
                pte_t *pte;
                struct page *page;

                /*
                 * Some archs (sparc64, sh*) have multiple pte_ts to
                 * each hugepage.  We have to make sure we get the
                 * first, for the page indexing below to work.
                 */
                pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

                if (!pte || pte_none(*pte)) {
                        int ret;

                        spin_unlock(&mm->page_table_lock);
                        ret = hugetlb_fault(mm, vma, vaddr, 0);
                        spin_lock(&mm->page_table_lock);
                        if (ret == VM_FAULT_MINOR)
                                continue;

                        remainder = 0;
                        if (!i)
                                i = -EFAULT;
                        break;
                }

                pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
                page = pte_page(*pte);
same_page:
                if (pages) {
                        get_page(page);
                        pages[i] = page + pfn_offset;
                }

                if (vmas)
                        vmas[i] = vma;

                vaddr += PAGE_SIZE;
                ++pfn_offset;
                --remainder;
                ++i;
                if (vaddr < vma->vm_end && remainder &&
                                pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
                        /*
                         * We use pfn_offset to avoid touching the pageframes
                         * of this compound page.
                         */
                        goto same_page;
                }
        }
        spin_unlock(&mm->page_table_lock);
        *length = remainder;
        *position = vaddr;

        return i;
}
void hugetlb_change_protection(struct vm_area_struct *vma,
                unsigned long address, unsigned long end, pgprot_t newprot)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long start = address;
        pte_t *ptep;
        pte_t pte;

        BUG_ON(address >= end);
        flush_cache_range(vma, address, end);

        spin_lock(&mm->page_table_lock);
        for (; address < end; address += HPAGE_SIZE) {
                ptep = huge_pte_offset(mm, address);
                if (!ptep)
                        continue;
                if (!pte_none(*ptep)) {
                        pte = huge_ptep_get_and_clear(mm, address, ptep);
                        pte = pte_mkhuge(pte_modify(pte, newprot));
                        set_huge_pte_at(mm, address, ptep, pte);
                        lazy_mmu_prot_update(pte);
                }
        }
        spin_unlock(&mm->page_table_lock);

        flush_tlb_range(vma, start, end);
}