/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/vs_base.h>
#include <linux/vs_memory.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

static void clear_huge_page(struct page *page, unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
		cond_resched();
		clear_user_highpage(page + i, addr);
	}
}

static void copy_huge_page(struct page *dst, struct page *src,
			   unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
	}
}

static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

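/*
 * Pull a free huge page off the free list of the first node in the
 * mempolicy/cpuset-approved zonelist that still has one queued.  Must
 * be called with hugetlb_lock held; the page is returned with a zero
 * reference count and is re-counted by the caller (see alloc_huge_page()).
 */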
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid = numa_node_id();
	struct page *page = NULL;
	struct zonelist *zonelist = huge_zonelist(vma, address);
	struct zone **z;

	for (z = zonelist->zones; *z; z++) {
		nid = (*z)->zone_pgdat->node_id;
		if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
		    !list_empty(&hugepage_freelists[nid]))
			break;
	}

	if (*z) {
		page = list_entry(hugepage_freelists[nid].next,
				  struct page, lru);
		list_del(&page->lru);
		free_huge_pages--;
		free_huge_pages_node[nid]--;
	}
	return page;
}

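/*
 * free_huge_page() is installed as the compound-page destructor in
 * alloc_fresh_huge_page() (via page[1].lru.next), so the final
 * put_page() on a huge page lands here and returns the page to the
 * per-node free list rather than to the buddy allocator.
 */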
static void free_huge_page(struct page *page)
{
	BUG_ON(page_count(page));

	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	enqueue_huge_page(page);
	spin_unlock(&hugetlb_lock);
}

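/*
 * Grow the pool by one fresh huge page from the buddy allocator,
 * spreading allocations round-robin across the online NUMA nodes.
 * Returns 1 on success, 0 if no huge page could be allocated.
 */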
static int alloc_fresh_huge_page(void)
{
	static int nid = 0;
	struct page *page;
	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
				HUGETLB_PAGE_ORDER);
	nid = next_node(nid, node_online_map);
	if (nid == MAX_NUMNODES)
		nid = first_node(node_online_map);
	if (page) {
		page[1].lru.next = (void *)free_huge_page;	/* dtor */
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[page_to_nid(page)]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
		return 1;
	}
	return 0;
}

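/*
 * Hand out a huge page for a fault.  Shared (VM_MAYSHARE) mappings
 * consume one of the pages reserved for them at mmap time; private
 * mappings may only dip into pages not covered by an existing
 * reservation, so reserved pages are never handed out twice.
 */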
static struct page *alloc_huge_page(struct vm_area_struct *vma,
				    unsigned long addr)
{
	struct page *page;

	spin_lock(&hugetlb_lock);
	if (vma->vm_flags & VM_MAYSHARE)
		resv_huge_pages--;
	else if (free_huge_pages <= resv_huge_pages)
		goto fail;

	page = dequeue_huge_page(vma, addr);
	if (!page)
		goto fail;

	spin_unlock(&hugetlb_lock);
	set_page_refcounted(page);
	return page;

fail:
	spin_unlock(&hugetlb_lock);
	return NULL;
}

static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

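/*
 * The pool size used by hugetlb_init() comes from the "hugepages="
 * kernel parameter parsed below; for example, booting with
 *
 *	hugepages=64
 *
 * pre-allocates 64 huge pages.  With CONFIG_SYSCTL the pool can be
 * resized later through /proc/sys/vm/nr_hugepages, which ends up in
 * hugetlb_sysctl_handler() and set_max_huge_pages() below.
 */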
static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

#ifdef CONFIG_SYSCTL
static void update_and_free_page(struct page *page)
{
	int i;

	nr_huge_pages--;
	nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	page[1].lru.next = NULL;
	set_page_refcounted(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i, nid;

	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			nid = page_zone(page)->zone_pgdat->node_id;
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			if (count >= nr_huge_pages)
				return;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

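/*
 * Resize the pool to "count" pages: grow by allocating fresh huge
 * pages, shrink by releasing free ones back to the buddy allocator,
 * preferring lowmem pages (try_to_free_low()) and never shrinking
 * below the currently reserved pages.  Returns the resulting pool size.
 */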
static unsigned long set_max_huge_pages(unsigned long count)
{
	while (count > nr_huge_pages) {
		if (!alloc_fresh_huge_page())
			return nr_huge_pages;
	}
	if (count >= nr_huge_pages)
		return nr_huge_pages;

	spin_lock(&hugetlb_lock);
	count = max(count, resv_huge_pages);
	try_to_free_low(count);
	while (count < nr_huge_pages) {
		struct page *page = dequeue_huge_page(NULL, 0);
		if (!page)
			break;
		update_and_free_page(page);
	}
	spin_unlock(&hugetlb_lock);
	return nr_huge_pages;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}
#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"HugePages_Rsvd:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			resv_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}

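/*
 * Sample of the /proc/meminfo fragment built by hugetlb_report_meminfo()
 * above (values are illustrative only):
 *
 *	HugePages_Total:    64
 *	HugePages_Free:     60
 *	HugePages_Rsvd:      4
 *	Hugepagesize:     2048 kB
 */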
/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we
 * get this far.
 */
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
				unsigned long address, int *unused)
{
	BUG();
	return NULL;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.nopage = hugetlb_nopage,
};

static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	ptep_set_access_flags(vma, address, ptep, entry, 1);
	update_mmu_cache(vma, address, entry);
	lazy_mmu_prot_update(entry);
}

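/*
 * Called at fork() time: share the parent's huge pages with the child
 * rather than copying them.  For private writable mappings the source
 * PTEs are write-protected as well, so a later write in either mm takes
 * the hugetlb_cow() path.
 */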
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;
		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);

	/* Update high watermark before we lower rss */
	update_hiwater_rss(mm);

	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		put_page(page);
		add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
	}

	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
}

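/*
 * Handle a write fault on a huge page that is mapped read-only.  If we
 * are the only user, the mapping is simply made writable; otherwise a
 * new huge page is allocated, the contents are copied (with the page
 * table lock dropped around the copy), and the PTE is re-checked for a
 * racing update before being pointed at the new page.
 */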
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return VM_FAULT_MINOR;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (!new_page) {
		page_cache_release(old_page);
		return VM_FAULT_OOM;
	}

	spin_unlock(&mm->page_table_lock);
	copy_huge_page(new_page, old_page, address);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return VM_FAULT_MINOR;
}

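/*
 * Fault in a huge page that has no PTE yet: look it up in (or add it
 * to) the page cache of the backing hugetlbfs file, charge the quota,
 * and install the PTE under page_table_lock while holding the page
 * lock to guard against a racing truncate.
 */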
int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		if (hugetlb_get_quota(mapping))
			goto out;
		page = alloc_huge_page(vma, address);
		if (!page) {
			hugetlb_put_quota(mapping);
			ret = VM_FAULT_OOM;
			goto out;
		}
		clear_huge_page(page, address);

		if (vma->vm_flags & VM_SHARED) {
			int err;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				hugetlb_put_quota(mapping);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = VM_FAULT_MINOR;
	if (!pte_none(*ptep))
		goto backout;

	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	hugetlb_put_quota(mapping);
	unlock_page(page);
	put_page(page);
	goto out;
}

int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;
	static DEFINE_MUTEX(hugetlb_instantiation_mutex);

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	/*
	 * Serialize hugepage allocation and instantiation, so that we don't
	 * get spurious allocation failures if two CPUs race to instantiate
	 * the same page in the page cache.
	 */
	mutex_lock(&hugetlb_instantiation_mutex);
	entry = *ptep;
	if (pte_none(entry)) {
		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
		mutex_unlock(&hugetlb_instantiation_mutex);
		return ret;
	}

	ret = VM_FAULT_MINOR;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);
	mutex_unlock(&hugetlb_instantiation_mutex);

	return ret;
}

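/*
 * get_user_pages() support for hugetlb VMAs: walk the range, faulting
 * pages in as needed, and fill in the pages[]/vmas[] arrays.  pfn_offset
 * steps through the constituent small pages of each huge page so the
 * page tables are not re-walked for every PAGE_SIZE step.
 */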
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i)
{
	unsigned long pfn_offset;
	unsigned long vaddr = *position;
	int remainder = *length;

	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte)) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, 0);
			spin_lock(&mm->page_table_lock);
			if (ret == VM_FAULT_MINOR)
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
		page = pte_page(*pte);
same_page:
		if (pages) {
			get_page(page);
			pages[i] = page + pfn_offset;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++pfn_offset;
		--remainder;
		++i;
		if (vaddr < vma->vm_end && remainder &&
				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
			/*
			 * We use pfn_offset to avoid touching the pageframes
			 * of this compound page.
			 */
			goto same_page;
		}
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}

void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		if (!pte_none(*ptep)) {
			pte = huge_ptep_get_and_clear(mm, address, ptep);
			pte = pte_mkhuge(pte_modify(pte, newprot));
			set_huge_pte_at(mm, address, ptep, pte);
			lazy_mmu_prot_update(pte);
		}
	}
	spin_unlock(&mm->page_table_lock);

	flush_tlb_range(vma, start, end);
}

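/*
 * The code below tracks huge page reservations for a mapping as a
 * sorted list of [from, to) file regions hanging off the inode's
 * mapping->private_list.  A short worked example (illustrative only):
 * reserving pages 0-4 and later 2-6 leaves a single region [0, 6);
 * region_chg() reports how many *new* pages the second reservation
 * needs (2 here), and region_add() then merges the overlapping ranges.
 */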
struct file_region {
	struct list_head link;
	long from;
	long to;
};

static long region_add(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg, *trg;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher then extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	return 0;
}

static long region_chg(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg;
	long chg = 0;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle, allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
		if (!nrg)
			return -ENOMEM;
		nrg->from = f;
		nrg->to   = f;
		INIT_LIST_HEAD(&nrg->link);
		list_add(&nrg->link, rg->link.prev);

		return t - f;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			return chg;

		/* We overlap with this area, if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
	return chg;
}

static long region_truncate(struct list_head *head, long end)
{
	struct file_region *rg, *trg;
	long chg = 0;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	if (&rg->link == head)
		return 0;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}
	return chg;
}

static int hugetlb_acct_memory(long delta)
{
	int ret = -ENOMEM;

	spin_lock(&hugetlb_lock);
	if ((delta + resv_huge_pages) <= free_huge_pages) {
		resv_huge_pages += delta;
		ret = 0;
	}
	spin_unlock(&hugetlb_lock);
	return ret;
}

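/*
 * Reserve huge pages for a mapping of [from, to) at mmap() time so a
 * later fault on a shared mapping cannot fail with SIGBUS.  region_chg()
 * computes how many pages are not yet covered by an existing
 * reservation, hugetlb_acct_memory() checks them against the free pool,
 * and region_add() records the new range.
 */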
int hugetlb_reserve_pages(struct inode *inode, long from, long to)
{
	long ret, chg;

	chg = region_chg(&inode->i_mapping->private_list, from, to);
	if (chg < 0)
		return chg;
	ret = hugetlb_acct_memory(chg);
	if (ret < 0)
		return ret;
	region_add(&inode->i_mapping->private_list, from, to);
	return 0;
}

void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
	long chg = region_truncate(&inode->i_mapping->private_list, offset);

	hugetlb_acct_memory(freed - chg);
}