/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include <linux/vs_memory.h>
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);
static void clear_huge_page(struct page *page, unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
		cond_resched();
		clear_user_highpage(page + i, addr);
	}
}
static void copy_huge_page(struct page *dst, struct page *src,
			   unsigned long addr, struct vm_area_struct *vma)
{
	int i;

	might_sleep();
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	}
}
static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}
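/*
 * Pull a free huge page off the freelist of a node the faulting task is
 * allowed to use, walking the zonelist derived from the VMA's memory
 * policy and skipping zones forbidden by the task's cpuset.
 */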
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid = numa_node_id();
	struct page *page = NULL;
	struct zonelist *zonelist = huge_zonelist(vma, address);
	struct zone **z;

	for (z = zonelist->zones; *z; z++) {
		nid = zone_to_nid(*z);
		if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) &&
		    !list_empty(&hugepage_freelists[nid]))
			break;
	}

	if (*z) {
		page = list_entry(hugepage_freelists[nid].next,
				  struct page, lru);
		list_del(&page->lru);
		free_huge_pages--;
		free_huge_pages_node[nid]--;
	}
	return page;
}
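/*
 * Compound-page destructor, installed by alloc_fresh_huge_page(); it runs
 * when the last reference to a huge page is dropped and returns the page
 * to the free pool.
 */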
static void free_huge_page(struct page *page)
{
	BUG_ON(page_count(page));

	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	enqueue_huge_page(page);
	spin_unlock(&hugetlb_lock);
}
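/*
 * Allocate a fresh HUGETLB_PAGE_ORDER compound page from the buddy
 * allocator, rotating the starting node so the pool spreads across all
 * online nodes, and hand it to the hugepage pool via put_page().
 */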
static int alloc_fresh_huge_page(void)
{
	static int nid = 0;
	struct page *page;
	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);
	nid = next_node(nid, node_online_map);
	if (nid == MAX_NUMNODES)
		nid = first_node(node_online_map);
	if (page) {
		set_compound_page_dtor(page, free_huge_page);
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[page_to_nid(page)]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
		return 1;
	}
	return 0;
}
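/*
 * Hand out a free huge page for a fault.  Shared (VM_MAYSHARE) mappings
 * consume one of the reservations taken at mmap time, so resv_huge_pages
 * is decremented; private mappings may only dip into pages that are not
 * needed to back outstanding reservations.
 */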
static struct page *alloc_huge_page(struct vm_area_struct *vma,
				    unsigned long addr)
{
	struct page *page;

	spin_lock(&hugetlb_lock);
	if (vma->vm_flags & VM_MAYSHARE)
		resv_huge_pages--;
	else if (free_huge_pages <= resv_huge_pages)
		goto fail;

	page = dequeue_huge_page(vma, addr);
	if (!page)
		goto fail;

	spin_unlock(&hugetlb_lock);
	set_page_refcounted(page);
	return page;

fail:
	spin_unlock(&hugetlb_lock);
	return NULL;
}
static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);
static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);
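/*
 * Usage: booting with "hugepages=64" on the kernel command line asks
 * hugetlb_init() to pre-allocate 64 huge pages at boot; the pool can be
 * resized later through the nr_hugepages sysctl handled below.
 */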
#ifdef CONFIG_SYSCTL
static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	page[1].lru.next = NULL;
	set_page_refcounted(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}
#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i;

	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[page_to_nid(page)]--;
			if (count >= nr_huge_pages)
				return;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif
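/*
 * Resize the huge page pool to "count" pages: grow it with fresh pages
 * from the buddy allocator, or shrink it by freeing surplus free pages,
 * never dropping below the number of reserved pages.  This backs the
 * nr_hugepages sysctl handled below.
 */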
static unsigned long set_max_huge_pages(unsigned long count)
{
	while (count > nr_huge_pages) {
		if (!alloc_fresh_huge_page())
			return nr_huge_pages;
	}
	if (count >= nr_huge_pages)
		return nr_huge_pages;

	spin_lock(&hugetlb_lock);
	count = max(count, resv_huge_pages);
	try_to_free_low(count);
	while (count < nr_huge_pages) {
		struct page *page = dequeue_huge_page(NULL, 0);
		if (!page)
			break;
		update_and_free_page(page);
	}
	spin_unlock(&hugetlb_lock);
	return nr_huge_pages;
}
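/*
 * /proc/sys/vm/nr_hugepages handler: proc_doulongvec_minmax() parses the
 * written value into max_huge_pages, then the pool is resized to match.
 */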
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}
#endif /* CONFIG_SYSCTL */
int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"HugePages_Rsvd:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			resv_huge_pages,
			HPAGE_SIZE/1024);
}
int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}
/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}
/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
				unsigned long address, int *unused)
{
	BUG();
	return NULL;
}
struct vm_operations_struct hugetlb_vm_ops = {
	.nopage = hugetlb_nopage,
};
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}
static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	ptep_set_access_flags(vma, address, ptep, entry, 1);
	update_mmu_cache(vma, address, entry);
	lazy_mmu_prot_update(entry);
}
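/*
 * Copy the huge-page PTEs of a VMA from parent to child at fork().  For
 * private mappings that may be written (the "cow" case) the parent's PTEs
 * are write-protected first, so that later writes fault into hugetlb_cow().
 */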
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;
		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}
void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;
	struct page *tmp;
	/*
	 * A page gathering list, protected by per file i_mmap_lock. The
	 * lock is used to avoid list corruption from multiple unmapping
	 * of the same page since we are using page->lru.
	 */
	LIST_HEAD(page_list);

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);
	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		if (huge_pmd_unshare(mm, &address, ptep))
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (pte_dirty(pte))
			set_page_dirty(page);
		list_add(&page->lru, &page_list);
	}
	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
	list_for_each_entry_safe(page, tmp, &page_list, lru) {
		list_del(&page->lru);
		put_page(page);
	}
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	/*
	 * It is undesirable to test vma->vm_file as it should be non-null
	 * for valid hugetlb area. However, vm_file will be NULL in the error
	 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
	 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
	 * to clean up. Since no pte has actually been setup, it is safe to
	 * do nothing in this case.
	 */
	if (vma->vm_file) {
		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
		__unmap_hugepage_range(vma, start, end);
		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
	}
}
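/*
 * Break copy-on-write on a write fault against a huge page that is still
 * shared: allocate a fresh huge page, copy the contents outside the page
 * table lock, and install the new page only if the PTE has not changed in
 * the meantime.
 */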
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return VM_FAULT_MINOR;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (!new_page) {
		page_cache_release(old_page);
		return VM_FAULT_OOM;
	}

	spin_unlock(&mm->page_table_lock);
	copy_huge_page(new_page, old_page, address, vma);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return VM_FAULT_MINOR;
}
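/*
 * Handle a fault on a huge PTE that is not present: look the page up in
 * the hugetlbfs page cache (allocating it, and for shared mappings
 * inserting it, if necessary) and map it, re-checking the file size so a
 * racing truncate is not mapped back in.
 */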
int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
		if (idx >= size)
			goto out;
		if (hugetlb_get_quota(mapping))
			goto out;
		page = alloc_huge_page(vma, address);
		if (!page) {
			hugetlb_put_quota(mapping);
			ret = VM_FAULT_OOM;
			goto out;
		}
		clear_huge_page(page, address);

		if (vma->vm_flags & VM_SHARED) {
			int err;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				hugetlb_put_quota(mapping);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = VM_FAULT_MINOR;
	if (!pte_none(*ptep))
		goto backout;

	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	hugetlb_put_quota(mapping);
	unlock_page(page);
	put_page(page);
	goto out;
}
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;
	static DEFINE_MUTEX(hugetlb_instantiation_mutex);

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	/*
	 * Serialize hugepage allocation and instantiation, so that we don't
	 * get spurious allocation failures if two CPUs race to instantiate
	 * the same page in the page cache.
	 */
	mutex_lock(&hugetlb_instantiation_mutex);
	entry = *ptep;
	if (pte_none(entry)) {
		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
		mutex_unlock(&hugetlb_instantiation_mutex);
		return ret;
	}

	ret = VM_FAULT_MINOR;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);
	mutex_unlock(&hugetlb_instantiation_mutex);

	return ret;
}
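/*
 * get_user_pages() helper for hugetlb VMAs: walk the range, faulting in
 * missing huge pages, and fill in the pages[]/vmas[] arrays one small
 * (PAGE_SIZE) subpage at a time, using pfn_offset to index into the
 * compound page.
 */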
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i)
{
	unsigned long pfn_offset;
	unsigned long vaddr = *position;
	int remainder = *length;

	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte)) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, 0);
			spin_lock(&mm->page_table_lock);
			if (ret == VM_FAULT_MINOR)
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
		page = pte_page(*pte);
same_page:
		if (pages) {
			get_page(page);
			pages[i] = page + pfn_offset;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++pfn_offset;
		--remainder;
		++i;
		if (vaddr < vma->vm_end && remainder &&
				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
			/*
			 * We use pfn_offset to avoid touching the pageframes
			 * of this compound page.
			 */
			goto same_page;
		}
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}
void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		if (huge_pmd_unshare(mm, &address, ptep))
			continue;
		if (!pte_none(*ptep)) {
			pte = huge_ptep_get_and_clear(mm, address, ptep);
			pte = pte_mkhuge(pte_modify(pte, newprot));
			set_huge_pte_at(mm, address, ptep, pte);
			lazy_mmu_prot_update(pte);
		}
	}
	spin_unlock(&mm->page_table_lock);
	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);

	flush_tlb_range(vma, start, end);
}
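/*
 * Huge page reservations are tracked per hugetlbfs inode as a list of
 * file_region structures hanging off mapping->private_list, each recording
 * a [from, to) range of the file (in huge pages) that carries a
 * reservation.  region_chg()/region_add()/region_truncate() below maintain
 * that list.
 */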
struct file_region {
	struct list_head link;
	long from;
	long to;
};
static long region_add(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg, *trg;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher then extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	return 0;
}
static long region_chg(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg;
	long chg = 0;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle, allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
		if (!nrg)
			return -ENOMEM;
		nrg->from = f;
		nrg->to = f;
		INIT_LIST_HEAD(&nrg->link);
		list_add(&nrg->link, rg->link.prev);

		return t - f;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			return chg;

		/* We overlap with this area, if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
	return chg;
}
static long region_truncate(struct list_head *head, long end)
{
	struct file_region *rg, *trg;
	long chg = 0;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	if (&rg->link == head)
		return 0;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}
	return chg;
}
static int hugetlb_acct_memory(long delta)
{
	int ret = -ENOMEM;

	spin_lock(&hugetlb_lock);
	if ((delta + resv_huge_pages) <= free_huge_pages) {
		resv_huge_pages += delta;
		ret = 0;
	}
	spin_unlock(&hugetlb_lock);
	return ret;
}
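/*
 * Reservation entry points.  hugetlb_reserve_pages() is used by hugetlbfs
 * when a shared mapping of the range [from, to) is set up: it records the
 * range in the inode's region list and reserves enough free huge pages to
 * back whatever part of the range was not already reserved.
 * hugetlb_unreserve_pages() gives reservations back on the truncate /
 * inode-release paths.
 */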
int hugetlb_reserve_pages(struct inode *inode, long from, long to)
{
	long ret, chg;

	chg = region_chg(&inode->i_mapping->private_list, from, to);
	if (chg < 0)
		return chg;
	ret = hugetlb_acct_memory(chg);
	if (ret < 0)
		return ret;
	region_add(&inode->i_mapping->private_list, from, to);
	return 0;
}
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
	long chg = region_truncate(&inode->i_mapping->private_list, offset);
	hugetlb_acct_memory(freed - chg);
}