#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
-#include <linux/mutex.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <linux/hugetlb.h>
#include <linux/vs_memory.h>
-#include "internal.h"
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
+static unsigned long nr_huge_pages, free_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
+
/*
* Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
*/
static DEFINE_SPINLOCK(hugetlb_lock);
-static void clear_huge_page(struct page *page, unsigned long addr)
-{
- int i;
-
- might_sleep();
- for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
- cond_resched();
- clear_user_highpage(page + i, addr);
- }
-}
-
-static void copy_huge_page(struct page *dst, struct page *src,
- unsigned long addr)
-{
- int i;
-
- might_sleep();
- for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
- cond_resched();
- copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
- }
-}
-
static void enqueue_huge_page(struct page *page)
{
int nid = page_to_nid(page);
return page;
}
-static void free_huge_page(struct page *page)
-{
- BUG_ON(page_count(page));
-
- INIT_LIST_HEAD(&page->lru);
-
- spin_lock(&hugetlb_lock);
- enqueue_huge_page(page);
- spin_unlock(&hugetlb_lock);
-}
-
-static int alloc_fresh_huge_page(void)
+static struct page *alloc_fresh_huge_page(void)
{
static int nid = 0;
struct page *page;
page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
HUGETLB_PAGE_ORDER);
- nid = next_node(nid, node_online_map);
- if (nid == MAX_NUMNODES)
- nid = first_node(node_online_map);
+ nid = (nid + 1) % num_online_nodes();
if (page) {
- page[1].lru.next = (void *)free_huge_page; /* dtor */
spin_lock(&hugetlb_lock);
nr_huge_pages++;
nr_huge_pages_node[page_to_nid(page)]++;
spin_unlock(&hugetlb_lock);
- put_page(page); /* free it into the hugepage allocator */
- return 1;
}
- return 0;
+ return page;
}
-static struct page *alloc_huge_page(struct vm_area_struct *vma,
- unsigned long addr)
+void free_huge_page(struct page *page)
{
- struct page *page;
+ BUG_ON(page_count(page));
+
+ INIT_LIST_HEAD(&page->lru);
+ page[1].lru.next = NULL; /* reset dtor */
spin_lock(&hugetlb_lock);
- if (vma->vm_flags & VM_MAYSHARE)
- resv_huge_pages--;
- else if (free_huge_pages <= resv_huge_pages)
- goto fail;
+ enqueue_huge_page(page);
+ spin_unlock(&hugetlb_lock);
+}
- page = dequeue_huge_page(vma, addr);
- if (!page)
- goto fail;
+struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
+{
+ struct page *page;
+ int i;
+ spin_lock(&hugetlb_lock);
+ page = dequeue_huge_page(vma, addr);
+ if (!page) {
+ spin_unlock(&hugetlb_lock);
+ return NULL;
+ }
spin_unlock(&hugetlb_lock);
- set_page_refcounted(page);
+ set_page_count(page, 1);
+ page[1].lru.next = (void *)free_huge_page; /* set dtor */
+ for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
+ clear_user_highpage(&page[i], addr);
return page;
-
-fail:
- spin_unlock(&hugetlb_lock);
- return NULL;
}
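/*
 * Illustrative sketch, not part of this patch: how a release path can pick
 * up the destructor that alloc_huge_page() stores in page[1].lru.next and
 * invoke it once the last reference is dropped.  The function name is
 * hypothetical; kernels of this vintage do the equivalent lookup in their
 * compound-page release code.  Assumes 'page' is the head page of the
 * huge page.
 */
static void put_huge_page_sketch(struct page *page)
{
	if (put_page_testzero(page)) {
		void (*dtor)(struct page *);

		/* dtor slot lives in the first tail page (set above) */
		dtor = (void (*)(struct page *))page[1].lru.next;
		BUG_ON(!dtor);
		(*dtor)(page);		/* normally free_huge_page() */
	}
}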
static int __init hugetlb_init(void)
{
unsigned long i;
+ struct page *page;
if (HPAGE_SHIFT == 0)
return 0;
INIT_LIST_HEAD(&hugepage_freelists[i]);
for (i = 0; i < max_huge_pages; ++i) {
- if (!alloc_fresh_huge_page())
+ page = alloc_fresh_huge_page();
+ if (!page)
break;
+ spin_lock(&hugetlb_lock);
+ enqueue_huge_page(page);
+ spin_unlock(&hugetlb_lock);
}
max_huge_pages = free_huge_pages = nr_huge_pages = i;
printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
1 << PG_private | 1<< PG_writeback);
+ set_page_count(&page[i], 0);
}
- page[1].lru.next = NULL;
- set_page_refcounted(page);
+ set_page_count(page, 1);
__free_pages(page, HUGETLB_PAGE_ORDER);
}
static unsigned long set_max_huge_pages(unsigned long count)
{
while (count > nr_huge_pages) {
- if (!alloc_fresh_huge_page())
+ struct page *page = alloc_fresh_huge_page();
+ if (!page)
return nr_huge_pages;
+ spin_lock(&hugetlb_lock);
+ enqueue_huge_page(page);
+ spin_unlock(&hugetlb_lock);
}
if (count >= nr_huge_pages)
return nr_huge_pages;
spin_lock(&hugetlb_lock);
- count = max(count, resv_huge_pages);
try_to_free_low(count);
while (count < nr_huge_pages) {
struct page *page = dequeue_huge_page(NULL, 0);
return sprintf(buf,
"HugePages_Total: %5lu\n"
"HugePages_Free: %5lu\n"
- "HugePages_Rsvd: %5lu\n"
"Hugepagesize: %5lu kB\n",
nr_huge_pages,
free_huge_pages,
- resv_huge_pages,
HPAGE_SIZE/1024);
}
nid, free_huge_pages_node[nid]);
}
+int is_hugepage_mem_enough(size_t size)
+{
+ return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
+}
+
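/*
 * Worked example for is_hugepage_mem_enough() above (illustrative): with
 * 2 MB huge pages, HPAGE_SIZE is 0x200000 and ~HPAGE_MASK is 0x1fffff, so
 * a 3 MB request computes (0x300000 + 0x1fffff) / 0x200000 = 2, i.e. the
 * size is rounded up to whole huge pages before being compared against
 * free_huge_pages.
 */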
/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
unsigned long address, pte_t *ptep, pte_t pte)
{
struct page *old_page, *new_page;
- int avoidcopy;
+ int i, avoidcopy;
old_page = pte_page(pte);
}
spin_unlock(&mm->page_table_lock);
- copy_huge_page(new_page, old_page, address);
+ for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
+ copy_user_highpage(new_page + i, old_page + i,
+ address + i*PAGE_SIZE);
spin_lock(&mm->page_table_lock);
ptep = huge_pte_offset(mm, address & HPAGE_MASK);
ret = VM_FAULT_OOM;
goto out;
}
- clear_huge_page(page, address);
if (vma->vm_flags & VM_SHARED) {
int err;
pte_t *ptep;
pte_t entry;
int ret;
- static DEFINE_MUTEX(hugetlb_instantiation_mutex);
ptep = huge_pte_alloc(mm, address);
if (!ptep)
return VM_FAULT_OOM;
- /*
- * Serialize hugepage allocation and instantiation, so that we don't
- * get spurious allocation failures if two CPUs race to instantiate
- * the same page in the page cache.
- */
- mutex_lock(&hugetlb_instantiation_mutex);
entry = *ptep;
- if (pte_none(entry)) {
- ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
- mutex_unlock(&hugetlb_instantiation_mutex);
- return ret;
- }
+ if (pte_none(entry))
+ return hugetlb_no_page(mm, vma, address, ptep, write_access);
ret = VM_FAULT_MINOR;
if (write_access && !pte_write(entry))
ret = hugetlb_cow(mm, vma, address, ptep, entry);
spin_unlock(&mm->page_table_lock);
- mutex_unlock(&hugetlb_instantiation_mutex);
return ret;
}
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, int *length, int i)
{
- unsigned long pfn_offset;
- unsigned long vaddr = *position;
+ unsigned long vpfn, vaddr = *position;
int remainder = *length;
+ vpfn = vaddr/PAGE_SIZE;
spin_lock(&mm->page_table_lock);
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
break;
}
- pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
- page = pte_page(*pte);
-same_page:
if (pages) {
+ page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
get_page(page);
- pages[i] = page + pfn_offset;
+ pages[i] = page;
}
if (vmas)
vmas[i] = vma;
vaddr += PAGE_SIZE;
- ++pfn_offset;
+ ++vpfn;
--remainder;
++i;
- if (vaddr < vma->vm_end && remainder &&
- pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
- /*
- * We use pfn_offset to avoid touching the pageframes
- * of this compound page.
- */
- goto same_page;
- }
}
spin_unlock(&mm->page_table_lock);
*length = remainder;
return i;
}
-
-void hugetlb_change_protection(struct vm_area_struct *vma,
- unsigned long address, unsigned long end, pgprot_t newprot)
-{
- struct mm_struct *mm = vma->vm_mm;
- unsigned long start = address;
- pte_t *ptep;
- pte_t pte;
-
- BUG_ON(address >= end);
- flush_cache_range(vma, address, end);
-
- spin_lock(&mm->page_table_lock);
- for (; address < end; address += HPAGE_SIZE) {
- ptep = huge_pte_offset(mm, address);
- if (!ptep)
- continue;
- if (!pte_none(*ptep)) {
- pte = huge_ptep_get_and_clear(mm, address, ptep);
- pte = pte_mkhuge(pte_modify(pte, newprot));
- set_huge_pte_at(mm, address, ptep, pte);
- lazy_mmu_prot_update(pte);
- }
- }
- spin_unlock(&mm->page_table_lock);
-
- flush_tlb_range(vma, start, end);
-}
-
-struct file_region {
- struct list_head link;
- long from;
- long to;
-};
-
-static long region_add(struct list_head *head, long f, long t)
-{
- struct file_region *rg, *nrg, *trg;
-
- /* Locate the region we are either in or before. */
- list_for_each_entry(rg, head, link)
- if (f <= rg->to)
- break;
-
- /* Round our left edge to the current segment if it encloses us. */
- if (f > rg->from)
- f = rg->from;
-
- /* Check for and consume any regions we now overlap with. */
- nrg = rg;
- list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
- if (&rg->link == head)
- break;
- if (rg->from > t)
- break;
-
- /* If this area reaches higher then extend our area to
- * include it completely. If this is not the first area
- * which we intend to reuse, free it. */
- if (rg->to > t)
- t = rg->to;
- if (rg != nrg) {
- list_del(&rg->link);
- kfree(rg);
- }
- }
- nrg->from = f;
- nrg->to = t;
- return 0;
-}
-
-static long region_chg(struct list_head *head, long f, long t)
-{
- struct file_region *rg, *nrg;
- long chg = 0;
-
- /* Locate the region we are before or in. */
- list_for_each_entry(rg, head, link)
- if (f <= rg->to)
- break;
-
- * If we are below the current region then a new region is required.
- * Subtle: allocate a new region at the position but make it zero
- * size such that we can guarantee to record the reservation. */
- if (&rg->link == head || t < rg->from) {
- nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
-  if (!nrg)
- return -ENOMEM;
- nrg->from = f;
- nrg->to = f;
- INIT_LIST_HEAD(&nrg->link);
- list_add(&nrg->link, rg->link.prev);
-
- return t - f;
- }
-
- /* Round our left edge to the current segment if it encloses us. */
- if (f > rg->from)
- f = rg->from;
- chg = t - f;
-
- /* Check for and consume any regions we now overlap with. */
- list_for_each_entry(rg, rg->link.prev, link) {
- if (&rg->link == head)
- break;
- if (rg->from > t)
- return chg;
-
- /* We overlap with this area; if it extends further than
- * us then we must extend ourselves. Account for its
- * existing reservation. */
- if (rg->to > t) {
- chg += rg->to - t;
- t = rg->to;
- }
- chg -= rg->to - rg->from;
- }
- return chg;
-}
-
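/*
 * Worked example for the region helpers removed above (illustrative): with
 * existing regions [0,2) and [4,6), region_chg(head, 1, 5) returns 2, since
 * only pages 2 and 3 are not yet covered, and a subsequent
 * region_add(head, 1, 5) coalesces everything into the single region [0,6).
 */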
-static long region_truncate(struct list_head *head, long end)
-{
- struct file_region *rg, *trg;
- long chg = 0;
-
- /* Locate the region we are either in or before. */
- list_for_each_entry(rg, head, link)
- if (end <= rg->to)
- break;
- if (&rg->link == head)
- return 0;
-
- /* If we are in the middle of a region then adjust it. */
- if (end > rg->from) {
- chg = rg->to - end;
- rg->to = end;
- rg = list_entry(rg->link.next, typeof(*rg), link);
- }
-
- /* Drop any remaining regions. */
- list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
- if (&rg->link == head)
- break;
- chg += rg->to - rg->from;
- list_del(&rg->link);
- kfree(rg);
- }
- return chg;
-}
-
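/*
 * Worked example for region_truncate() above (illustrative): truncating the
 * single region [0,6) at end = 3 trims it to [0,3) and returns 3, the number
 * of pages removed from the reservation map.
 */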
-static int hugetlb_acct_memory(long delta)
-{
- int ret = -ENOMEM;
-
- spin_lock(&hugetlb_lock);
- if ((delta + resv_huge_pages) <= free_huge_pages) {
- resv_huge_pages += delta;
- ret = 0;
- }
- spin_unlock(&hugetlb_lock);
- return ret;
-}
-
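/*
 * Worked example for hugetlb_acct_memory() above (illustrative): with
 * free_huge_pages = 10 and resv_huge_pages = 6, a request for delta = 5
 * fails with -ENOMEM (6 + 5 > 10), while delta = 4 succeeds and raises
 * resv_huge_pages to 10.
 */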
-int hugetlb_reserve_pages(struct inode *inode, long from, long to)
-{
- long ret, chg;
-
- chg = region_chg(&inode->i_mapping->private_list, from, to);
- if (chg < 0)
- return chg;
- ret = hugetlb_acct_memory(chg);
- if (ret < 0)
- return ret;
- region_add(&inode->i_mapping->private_list, from, to);
- return 0;
-}
-
-void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
-{
- long chg = region_truncate(&inode->i_mapping->private_list, offset);
- hugetlb_acct_memory(freed - chg);
-}