mm/hugetlb.c

   1 /*
   2  * Generic hugetlb support.
   3  * (C) William Irwin, April 2004
   4  */
   5 #include <linux/gfp.h>
   6 #include <linux/list.h>
   7 #include <linux/init.h>
   8 #include <linux/module.h>
   9 #include <linux/mm.h>
  10 #include <linux/sysctl.h>
  11 #include <linux/highmem.h>
  12 #include <linux/nodemask.h>
  13 #include <linux/pagemap.h>
  14 #include <linux/mempolicy.h>
  15 #include <linux/cpuset.h>
  16 #include <linux/mutex.h>
  17
  18 #include <asm/page.h>
  19 #include <asm/pgtable.h>
  20
  21 #include <linux/hugetlb.h>
  22 #include <linux/vs_memory.h>
  23 #include "internal.h"
  24
  25 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
  26 static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
  27 unsigned long max_huge_pages;
  28 static struct list_head hugepage_freelists[MAX_NUMNODES];
  29 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
  30 static unsigned int free_huge_pages_node[MAX_NUMNODES];
  31 /*
  32  * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
  33  */
  34 static DEFINE_SPINLOCK(hugetlb_lock);
  35
  36 static void clear_huge_page(struct page *page, unsigned long addr)
  37 {
  38         int i;
  39
  40         might_sleep();
  41         for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
  42                 cond_resched();
  43                 clear_user_highpage(page + i, addr);
  44         }
  45 }
  46
  47 static void copy_huge_page(struct page *dst, struct page *src,
  48                            unsigned long addr)
  49 {
  50         int i;
  51
  52         might_sleep();
  53         for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
  54                 cond_resched();
  55                 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
  56         }
  57 }
  58
  59 static void enqueue_huge_page(struct page *page)
  60 {
  61         int nid = page_to_nid(page);
  62         list_add(&page->lru, &hugepage_freelists[nid]);
  63         free_huge_pages++;
  64         free_huge_pages_node[nid]++;
  65 }
  66
  67 static struct page *dequeue_huge_page(struct vm_area_struct *vma,
  68                                 unsigned long address)
  69 {
  70         int nid = numa_node_id();
  71         struct page *page = NULL;
  72         struct zonelist *zonelist = huge_zonelist(vma, address);
  73         struct zone **z;
  74
  75         for (z = zonelist->zones; *z; z++) {
  76                 nid = (*z)->zone_pgdat->node_id;
  77                 if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
  78                     !list_empty(&hugepage_freelists[nid]))
  79                         break;
  80         }
  81
  82         if (*z) {
  83                 page = list_entry(hugepage_freelists[nid].next,
  84                                   struct page, lru);
  85                 list_del(&page->lru);
  86                 free_huge_pages--;
  87                 free_huge_pages_node[nid]--;
  88         }
  89         return page;
  90 }
  91
  92 static void free_huge_page(struct page *page)
  93 {
  94         BUG_ON(page_count(page));
  95
  96         INIT_LIST_HEAD(&page->lru);
  97
  98         spin_lock(&hugetlb_lock);
  99         enqueue_huge_page(page);
 100         spin_unlock(&hugetlb_lock);
 101 }
 102
 103 static int alloc_fresh_huge_page(void)
 104 {
 105         static int nid = 0;
 106         struct page *page;
 107         page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
 108                                         HUGETLB_PAGE_ORDER);
 109         nid = next_node(nid, node_online_map);
 110         if (nid == MAX_NUMNODES)
 111                 nid = first_node(node_online_map);
 112         if (page) {
 113                 page[1].lru.next = (void *)free_huge_page;      /* dtor */
 114                 spin_lock(&hugetlb_lock);
 115                 nr_huge_pages++;
 116                 nr_huge_pages_node[page_to_nid(page)]++;
 117                 spin_unlock(&hugetlb_lock);
 118                 put_page(page); /* free it into the hugepage allocator */
 119                 return 1;
 120         }
 121         return 0;
 122 }
 123
 124 static struct page *alloc_huge_page(struct vm_area_struct *vma,
 125                                     unsigned long addr)
 126 {
 127         struct page *page;
 128
 129         spin_lock(&hugetlb_lock);
 130         if (vma->vm_flags & VM_MAYSHARE)
 131                 resv_huge_pages--;
 132         else if (free_huge_pages <= resv_huge_pages)
 133                 goto fail;
 134
 135         page = dequeue_huge_page(vma, addr);
 136         if (!page)
 137                 goto fail;
 138
 139         spin_unlock(&hugetlb_lock);
 140         set_page_refcounted(page);
 141         return page;
 142
 143 fail:
 144         spin_unlock(&hugetlb_lock);
 145         return NULL;
 146 }
 147
 148 static int __init hugetlb_init(void)
 149 {
 150         unsigned long i;
 151
 152         if (HPAGE_SHIFT == 0)
 153                 return 0;
 154
 155         for (i = 0; i < MAX_NUMNODES; ++i)
 156                 INIT_LIST_HEAD(&hugepage_freelists[i]);
 157
 158         for (i = 0; i < max_huge_pages; ++i) {
 159                 if (!alloc_fresh_huge_page())
 160                         break;
 161         }
 162         max_huge_pages = free_huge_pages = nr_huge_pages = i;
 163         printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
 164         return 0;
 165 }
 166 module_init(hugetlb_init);
 167
 168 static int __init hugetlb_setup(char *s)
 169 {
 170         if (sscanf(s, "%lu", &max_huge_pages) <= 0)
 171                 max_huge_pages = 0;
 172         return 1;
 173 }
 174 __setup("hugepages=", hugetlb_setup);
 175
 176 #ifdef CONFIG_SYSCTL
 177 static void update_and_free_page(struct page *page)
 178 {
 179         int i;
 180         nr_huge_pages--;
 181         nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
 182         for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
 183                 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
 184                                 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
 185                                 1 << PG_private | 1<< PG_writeback);
 186         }
 187         page[1].lru.next = NULL;
 188         set_page_refcounted(page);
 189         __free_pages(page, HUGETLB_PAGE_ORDER);
 190 }
 191
 192 #ifdef CONFIG_HIGHMEM
 193 static void try_to_free_low(unsigned long count)
 194 {
 195         int i, nid;
 196         for (i = 0; i < MAX_NUMNODES; ++i) {
 197                 struct page *page, *next;
 198                 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
 199                         if (PageHighMem(page))
 200                                 continue;
 201                         list_del(&page->lru);
 202                         update_and_free_page(page);
 203                         nid = page_zone(page)->zone_pgdat->node_id;
 204                         free_huge_pages--;
 205                         free_huge_pages_node[nid]--;
 206                         if (count >= nr_huge_pages)
 207                                 return;
 208                 }
 209         }
 210 }
 211 #else
 212 static inline void try_to_free_low(unsigned long count)
 213 {
 214 }
 215 #endif
 216
 217 static unsigned long set_max_huge_pages(unsigned long count)
 218 {
 219         while (count > nr_huge_pages) {
 220                 if (!alloc_fresh_huge_page())
 221                         return nr_huge_pages;
 222         }
 223         if (count >= nr_huge_pages)
 224                 return nr_huge_pages;
 225
 226         spin_lock(&hugetlb_lock);
 227         count = max(count, resv_huge_pages);
 228         try_to_free_low(count);
 229         while (count < nr_huge_pages) {
 230                 struct page *page = dequeue_huge_page(NULL, 0);
 231                 if (!page)
 232                         break;
 233                 update_and_free_page(page);
 234         }
 235         spin_unlock(&hugetlb_lock);
 236         return nr_huge_pages;
 237 }
 238
 239 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 240                            struct file *file, void __user *buffer,
 241                            size_t *length, loff_t *ppos)
 242 {
 243         proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
 244         max_huge_pages = set_max_huge_pages(max_huge_pages);
 245         return 0;
 246 }
 247 #endif /* CONFIG_SYSCTL */
 248
 249 int hugetlb_report_meminfo(char *buf)
 250 {
 251         return sprintf(buf,
 252                         "HugePages_Total: %5lu\n"
 253                         "HugePages_Free:  %5lu\n"
 254                         "HugePages_Rsvd:  %5lu\n"
 255                         "Hugepagesize:    %5lu kB\n",
 256                         nr_huge_pages,
 257                         free_huge_pages,
 258                         resv_huge_pages,
 259                         HPAGE_SIZE/1024);
 260 }
 261
 262 int hugetlb_report_node_meminfo(int nid, char *buf)
 263 {
 264         return sprintf(buf,
 265                 "Node %d HugePages_Total: %5u\n"
 266                 "Node %d HugePages_Free:  %5u\n",
 267                 nid, nr_huge_pages_node[nid],
 268                 nid, free_huge_pages_node[nid]);
 269 }
 270
 271 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
 272 unsigned long hugetlb_total_pages(void)
 273 {
 274         return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
 275 }
 276
 277 /*
 278  * We cannot handle pagefaults against hugetlb pages at all.  They cause
 279  * handle_mm_fault() to try to instantiate regular-sized pages in the
 280  * hugegpage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
 281  * this far.
 282  */
 283 static struct page *hugetlb_nopage(struct vm_area_struct *vma,
 284                                 unsigned long address, int *unused)
 285 {
 286         BUG();
 287         return NULL;
 288 }
 289
 290 struct vm_operations_struct hugetlb_vm_ops = {
 291         .nopage = hugetlb_nopage,
 292 };
 293
 294 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
 295                                 int writable)
 296 {
 297         pte_t entry;
 298
 299         if (writable) {
 300                 entry =
 301                     pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 302         } else {
 303                 entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
 304         }
 305         entry = pte_mkyoung(entry);
 306         entry = pte_mkhuge(entry);
 307
 308         return entry;
 309 }
 310
 311 static void set_huge_ptep_writable(struct vm_area_struct *vma,
 312                                    unsigned long address, pte_t *ptep)
 313 {
 314         pte_t entry;
 315
 316         entry = pte_mkwrite(pte_mkdirty(*ptep));
 317         ptep_set_access_flags(vma, address, ptep, entry, 1);
 318         update_mmu_cache(vma, address, entry);
 319         lazy_mmu_prot_update(entry);
 320 }
 321
 322
 323 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 324                             struct vm_area_struct *vma)
 325 {
 326         pte_t *src_pte, *dst_pte, entry;
 327         struct page *ptepage;
 328         unsigned long addr;
 329         int cow;
 330
 331         cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 332
 333         for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
 334                 src_pte = huge_pte_offset(src, addr);
 335                 if (!src_pte)
 336                         continue;
 337                 dst_pte = huge_pte_alloc(dst, addr);
 338                 if (!dst_pte)
 339                         goto nomem;
 340                 spin_lock(&dst->page_table_lock);
 341                 spin_lock(&src->page_table_lock);
 342                 if (!pte_none(*src_pte)) {
 343                         if (cow)
 344                                 ptep_set_wrprotect(src, addr, src_pte);
 345                         entry = *src_pte;
 346                         ptepage = pte_page(entry);
 347                         get_page(ptepage);
 348                         add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
 349                         set_huge_pte_at(dst, addr, dst_pte, entry);
 350                 }
 351                 spin_unlock(&src->page_table_lock);
 352                 spin_unlock(&dst->page_table_lock);
 353         }
 354         return 0;
 355
 356 nomem:
 357         return -ENOMEM;
 358 }
 359
 360 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 361                           unsigned long end)
 362 {
 363         struct mm_struct *mm = vma->vm_mm;
 364         unsigned long address;
 365         pte_t *ptep;
 366         pte_t pte;
 367         struct page *page;
 368
 369         WARN_ON(!is_vm_hugetlb_page(vma));
 370         BUG_ON(start & ~HPAGE_MASK);
 371         BUG_ON(end & ~HPAGE_MASK);
 372
 373         spin_lock(&mm->page_table_lock);
 374
 375         /* Update high watermark before we lower rss */
 376         update_hiwater_rss(mm);
 377
 378         for (address = start; address < end; address += HPAGE_SIZE) {
 379                 ptep = huge_pte_offset(mm, address);
 380                 if (!ptep)
 381                         continue;
 382
 383                 pte = huge_ptep_get_and_clear(mm, address, ptep);
 384                 if (pte_none(pte))
 385                         continue;
 386
 387                 page = pte_page(pte);
 388                 put_page(page);
 389                 add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
 390         }
 391
 392         spin_unlock(&mm->page_table_lock);
 393         flush_tlb_range(vma, start, end);
 394 }
 395
 396 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 397                         unsigned long address, pte_t *ptep, pte_t pte)
 398 {
 399         struct page *old_page, *new_page;
 400         int avoidcopy;
 401
 402         old_page = pte_page(pte);
 403
 404         /* If no-one else is actually using this page, avoid the copy
 405          * and just make the page writable */
 406         avoidcopy = (page_count(old_page) == 1);
 407         if (avoidcopy) {
 408                 set_huge_ptep_writable(vma, address, ptep);
 409                 return VM_FAULT_MINOR;
 410         }
 411
 412         page_cache_get(old_page);
 413         new_page = alloc_huge_page(vma, address);
 414
 415         if (!new_page) {
 416                 page_cache_release(old_page);
 417                 return VM_FAULT_OOM;
 418         }
 419
 420         spin_unlock(&mm->page_table_lock);
 421         copy_huge_page(new_page, old_page, address);
 422         spin_lock(&mm->page_table_lock);
 423
 424         ptep = huge_pte_offset(mm, address & HPAGE_MASK);
 425         if (likely(pte_same(*ptep, pte))) {
 426                 /* Break COW */
 427                 set_huge_pte_at(mm, address, ptep,
 428                                 make_huge_pte(vma, new_page, 1));
 429                 /* Make the old page be freed below */
 430                 new_page = old_page;
 431         }
 432         page_cache_release(new_page);
 433         page_cache_release(old_page);
 434         return VM_FAULT_MINOR;
 435 }
 436
 437 int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 438                         unsigned long address, pte_t *ptep, int write_access)
 439 {
 440         int ret = VM_FAULT_SIGBUS;
 441         unsigned long idx;
 442         unsigned long size;
 443         struct page *page;
 444         struct address_space *mapping;
 445         pte_t new_pte;
 446
 447         mapping = vma->vm_file->f_mapping;
 448         idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
 449                 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
 450
 451         /*
 452          * Use page lock to guard against racing truncation
 453          * before we get page_table_lock.
 454          */
 455 retry:
 456         page = find_lock_page(mapping, idx);
 457         if (!page) {
 458                 if (hugetlb_get_quota(mapping))
 459                         goto out;
 460                 page = alloc_huge_page(vma, address);
 461                 if (!page) {
 462                         hugetlb_put_quota(mapping);
 463                         ret = VM_FAULT_OOM;
 464                         goto out;
 465                 }
 466                 clear_huge_page(page, address);
 467
 468                 if (vma->vm_flags & VM_SHARED) {
 469                         int err;
 470
 471                         err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
 472                         if (err) {
 473                                 put_page(page);
 474                                 hugetlb_put_quota(mapping);
 475                                 if (err == -EEXIST)
 476                                         goto retry;
 477                                 goto out;
 478                         }
 479                 } else
 480                         lock_page(page);
 481         }
 482
 483         spin_lock(&mm->page_table_lock);
 484         size = i_size_read(mapping->host) >> HPAGE_SHIFT;
 485         if (idx >= size)
 486                 goto backout;
 487
 488         ret = VM_FAULT_MINOR;
 489         if (!pte_none(*ptep))
 490                 goto backout;
 491
 492         add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
 493         new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
 494                                 && (vma->vm_flags & VM_SHARED)));
 495         set_huge_pte_at(mm, address, ptep, new_pte);
 496
 497         if (write_access && !(vma->vm_flags & VM_SHARED)) {
 498                 /* Optimization, do the COW without a second fault */
 499                 ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
 500         }
 501
 502         spin_unlock(&mm->page_table_lock);
 503         unlock_page(page);
 504 out:
 505         return ret;
 506
 507 backout:
 508         spin_unlock(&mm->page_table_lock);
 509         hugetlb_put_quota(mapping);
 510         unlock_page(page);
 511         put_page(page);
 512         goto out;
 513 }
 514
 515 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 516                         unsigned long address, int write_access)
 517 {
 518         pte_t *ptep;
 519         pte_t entry;
 520         int ret;
 521         static DEFINE_MUTEX(hugetlb_instantiation_mutex);
 522
 523         ptep = huge_pte_alloc(mm, address);
 524         if (!ptep)
 525                 return VM_FAULT_OOM;
 526
 527         /*
 528          * Serialize hugepage allocation and instantiation, so that we don't
 529          * get spurious allocation failures if two CPUs race to instantiate
 530          * the same page in the page cache.
 531          */
 532         mutex_lock(&hugetlb_instantiation_mutex);
 533         entry = *ptep;
 534         if (pte_none(entry)) {
 535                 ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
 536                 mutex_unlock(&hugetlb_instantiation_mutex);
 537                 return ret;
 538         }
 539
 540         ret = VM_FAULT_MINOR;
 541
 542         spin_lock(&mm->page_table_lock);
 543         /* Check for a racing update before calling hugetlb_cow */
 544         if (likely(pte_same(entry, *ptep)))
 545                 if (write_access && !pte_write(entry))
 546                         ret = hugetlb_cow(mm, vma, address, ptep, entry);
 547         spin_unlock(&mm->page_table_lock);
 548         mutex_unlock(&hugetlb_instantiation_mutex);
 549
 550         return ret;
 551 }
 552
 553 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 554                         struct page **pages, struct vm_area_struct **vmas,
 555                         unsigned long *position, int *length, int i)
 556 {
 557         unsigned long pfn_offset;
 558         unsigned long vaddr = *position;
 559         int remainder = *length;
 560
 561         spin_lock(&mm->page_table_lock);
 562         while (vaddr < vma->vm_end && remainder) {
 563                 pte_t *pte;
 564                 struct page *page;
 565
 566                 /*
 567                  * Some archs (sparc64, sh*) have multiple pte_ts to
 568                  * each hugepage.  We have to make * sure we get the
 569                  * first, for the page indexing below to work.
 570                  */
 571                 pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
 572
 573                 if (!pte || pte_none(*pte)) {
 574                         int ret;
 575
 576                         spin_unlock(&mm->page_table_lock);
 577                         ret = hugetlb_fault(mm, vma, vaddr, 0);
 578                         spin_lock(&mm->page_table_lock);
 579                         if (ret == VM_FAULT_MINOR)
 580                                 continue;
 581
 582                         remainder = 0;
 583                         if (!i)
 584                                 i = -EFAULT;
 585                         break;
 586                 }
 587
 588                 pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
 589                 page = pte_page(*pte);
 590 same_page:
 591                 if (pages) {
 592                         get_page(page);
 593                         pages[i] = page + pfn_offset;
 594                 }
 595
 596                 if (vmas)
 597                         vmas[i] = vma;
 598
 599                 vaddr += PAGE_SIZE;
 600                 ++pfn_offset;
 601                 --remainder;
 602                 ++i;
 603                 if (vaddr < vma->vm_end && remainder &&
 604                                 pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
 605                         /*
 606                          * We use pfn_offset to avoid touching the pageframes
 607                          * of this compound page.
 608                          */
 609                         goto same_page;
 610                 }
 611         }
 612         spin_unlock(&mm->page_table_lock);
 613         *length = remainder;
 614         *position = vaddr;
 615
 616         return i;
 617 }
 618
 619 void hugetlb_change_protection(struct vm_area_struct *vma,
 620                 unsigned long address, unsigned long end, pgprot_t newprot)
 621 {
 622         struct mm_struct *mm = vma->vm_mm;
 623         unsigned long start = address;
 624         pte_t *ptep;
 625         pte_t pte;
 626
 627         BUG_ON(address >= end);
 628         flush_cache_range(vma, address, end);
 629
 630         spin_lock(&mm->page_table_lock);
 631         for (; address < end; address += HPAGE_SIZE) {
 632                 ptep = huge_pte_offset(mm, address);
 633                 if (!ptep)
 634                         continue;
 635                 if (!pte_none(*ptep)) {
 636                         pte = huge_ptep_get_and_clear(mm, address, ptep);
 637                         pte = pte_mkhuge(pte_modify(pte, newprot));
 638                         set_huge_pte_at(mm, address, ptep, pte);
 639                         lazy_mmu_prot_update(pte);
 640                 }
 641         }
 642         spin_unlock(&mm->page_table_lock);
 643
 644         flush_tlb_range(vma, start, end);
 645 }
 646
 647 struct file_region {
 648         struct list_head link;
 649         long from;
 650         long to;
 651 };
 652
 653 static long region_add(struct list_head *head, long f, long t)
 654 {
 655         struct file_region *rg, *nrg, *trg;
 656
 657         /* Locate the region we are either in or before. */
 658         list_for_each_entry(rg, head, link)
 659                 if (f <= rg->to)
 660                         break;
 661
 662         /* Round our left edge to the current segment if it encloses us. */
 663         if (f > rg->from)
 664                 f = rg->from;
 665
 666         /* Check for and consume any regions we now overlap with. */
 667         nrg = rg;
 668         list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
 669                 if (&rg->link == head)
 670                         break;
 671                 if (rg->from > t)
 672                         break;
 673
 674                 /* If this area reaches higher then extend our area to
 675                  * include it completely.  If this is not the first area
 676                  * which we intend to reuse, free it. */
 677                 if (rg->to > t)
 678                         t = rg->to;
 679                 if (rg != nrg) {
 680                         list_del(&rg->link);
 681                         kfree(rg);
 682                 }
 683         }
 684         nrg->from = f;
 685         nrg->to = t;
 686         return 0;
 687 }
 688
 689 static long region_chg(struct list_head *head, long f, long t)
 690 {
 691         struct file_region *rg, *nrg;
 692         long chg = 0;
 693
 694         /* Locate the region we are before or in. */
 695         list_for_each_entry(rg, head, link)
 696                 if (f <= rg->to)
 697                         break;
 698
 699         /* If we are below the current region then a new region is required.
 700          * Subtle, allocate a new region at the position but make it zero
 701          * size such that we can guarentee to record the reservation. */
 702         if (&rg->link == head || t < rg->from) {
 703                 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
 704                 if (nrg == 0)
 705                         return -ENOMEM;
 706                 nrg->from = f;
 707                 nrg->to   = f;
 708                 INIT_LIST_HEAD(&nrg->link);
 709                 list_add(&nrg->link, rg->link.prev);
 710
 711                 return t - f;
 712         }
 713
 714         /* Round our left edge to the current segment if it encloses us. */
 715         if (f > rg->from)
 716                 f = rg->from;
 717         chg = t - f;
 718
 719         /* Check for and consume any regions we now overlap with. */
 720         list_for_each_entry(rg, rg->link.prev, link) {
 721                 if (&rg->link == head)
 722                         break;
 723                 if (rg->from > t)
 724                         return chg;
 725
 726                 /* We overlap with this area, if it extends futher than
 727                  * us then we must extend ourselves.  Account for its
 728                  * existing reservation. */
 729                 if (rg->to > t) {
 730                         chg += rg->to - t;
 731                         t = rg->to;
 732                 }
 733                 chg -= rg->to - rg->from;
 734         }
 735         return chg;
 736 }
 737
 738 static long region_truncate(struct list_head *head, long end)
 739 {
 740         struct file_region *rg, *trg;
 741         long chg = 0;
 742
 743         /* Locate the region we are either in or before. */
 744         list_for_each_entry(rg, head, link)
 745                 if (end <= rg->to)
 746                         break;
 747         if (&rg->link == head)
 748                 return 0;
 749
 750         /* If we are in the middle of a region then adjust it. */
 751         if (end > rg->from) {
 752                 chg = rg->to - end;
 753                 rg->to = end;
 754                 rg = list_entry(rg->link.next, typeof(*rg), link);
 755         }
 756
 757         /* Drop any remaining regions. */
 758         list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
 759                 if (&rg->link == head)
 760                         break;
 761                 chg += rg->to - rg->from;
 762                 list_del(&rg->link);
 763                 kfree(rg);
 764         }
 765         return chg;
 766 }
 767
 768 static int hugetlb_acct_memory(long delta)
 769 {
 770         int ret = -ENOMEM;
 771
 772         spin_lock(&hugetlb_lock);
 773         if ((delta + resv_huge_pages) <= free_huge_pages) {
 774                 resv_huge_pages += delta;
 775                 ret = 0;
 776         }
 777         spin_unlock(&hugetlb_lock);
 778         return ret;
 779 }
 780
 781 int hugetlb_reserve_pages(struct inode *inode, long from, long to)
 782 {
 783         long ret, chg;
 784
 785         chg = region_chg(&inode->i_mapping->private_list, from, to);
 786         if (chg < 0)
 787                 return chg;
 788         ret = hugetlb_acct_memory(chg);
 789         if (ret < 0)
 790                 return ret;
 791         region_add(&inode->i_mapping->private_list, from, to);
 792         return 0;
 793 }
 794
 795 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 796 {
 797         long chg = region_truncate(&inode->i_mapping->private_list, offset);
 798         hugetlb_acct_memory(freed - chg);
 799 }