/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/vs_base.h>
#include <linux/vs_memory.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

static void clear_huge_page(struct page *page, unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
		cond_resched();
		clear_user_highpage(page + i, addr);
	}
}

static void copy_huge_page(struct page *dst, struct page *src,
			   unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
	}
}

static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

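/*
 * Pull a free huge page off the free list of the first node in the
 * mempolicy/cpuset-approved zonelist that still has one queued.  Must
 * be called with hugetlb_lock held; the page is returned with a zero
 * reference count and is re-counted by the caller (see alloc_huge_page()).
 */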
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid = numa_node_id();
	struct page *page = NULL;
	struct zonelist *zonelist = huge_zonelist(vma, address);
	struct zone **z;

	for (z = zonelist->zones; *z; z++) {
		nid = (*z)->zone_pgdat->node_id;
		if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
		    !list_empty(&hugepage_freelists[nid]))
			break;
	}

	if (*z) {
		page = list_entry(hugepage_freelists[nid].next,
				  struct page, lru);
		list_del(&page->lru);
		free_huge_pages--;
		free_huge_pages_node[nid]--;
	}
	return page;
}

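/*
 * free_huge_page() is installed as the compound-page destructor in
 * alloc_fresh_huge_page() (via page[1].lru.next), so the final
 * put_page() on a huge page lands here and returns the page to the
 * per-node free list rather than to the buddy allocator.
 */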
static void free_huge_page(struct page *page)
{
	BUG_ON(page_count(page));

	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	enqueue_huge_page(page);
	spin_unlock(&hugetlb_lock);
}

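/*
 * Grow the pool by one fresh huge page from the buddy allocator,
 * spreading allocations round-robin across the online NUMA nodes.
 * Returns 1 on success, 0 if no huge page could be allocated.
 */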
static int alloc_fresh_huge_page(void)
{
	static int nid = 0;
	struct page *page;
	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
				HUGETLB_PAGE_ORDER);
	nid = next_node(nid, node_online_map);
	if (nid == MAX_NUMNODES)
		nid = first_node(node_online_map);
	if (page) {
		page[1].lru.next = (void *)free_huge_page;	/* dtor */
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[page_to_nid(page)]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
		return 1;
	}
	return 0;
}

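/*
 * Hand out a huge page for a fault.  Shared (VM_MAYSHARE) mappings
 * consume one of the pages reserved for them at mmap time; private
 * mappings may only dip into pages not covered by an existing
 * reservation, so reserved pages are never handed out twice.
 */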
static struct page *alloc_huge_page(struct vm_area_struct *vma,
				    unsigned long addr)
{
	struct page *page;

	spin_lock(&hugetlb_lock);
	if (vma->vm_flags & VM_MAYSHARE)
		resv_huge_pages--;
	else if (free_huge_pages <= resv_huge_pages)
		goto fail;

	page = dequeue_huge_page(vma, addr);
	if (!page)
		goto fail;

	spin_unlock(&hugetlb_lock);
	set_page_refcounted(page);
	return page;

fail:
	spin_unlock(&hugetlb_lock);
	return NULL;
}

static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

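/*
 * The pool size used by hugetlb_init() comes from the "hugepages="
 * kernel parameter parsed below; for example, booting with
 *
 *	hugepages=64
 *
 * pre-allocates 64 huge pages.  With CONFIG_SYSCTL the pool can be
 * resized later through /proc/sys/vm/nr_hugepages, which ends up in
 * hugetlb_sysctl_handler() and set_max_huge_pages() below.
 */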
static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

#ifdef CONFIG_SYSCTL
static void update_and_free_page(struct page *page)
{
	int i;

	nr_huge_pages--;
	nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	page[1].lru.next = NULL;
	set_page_refcounted(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i, nid;

	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			nid = page_zone(page)->zone_pgdat->node_id;
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			if (count >= nr_huge_pages)
				return;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

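/*
 * Resize the pool to "count" pages: grow by allocating fresh huge
 * pages, shrink by releasing free ones back to the buddy allocator,
 * preferring lowmem pages (try_to_free_low()) and never shrinking
 * below the currently reserved pages.  Returns the resulting pool size.
 */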
static unsigned long set_max_huge_pages(unsigned long count)
{
	while (count > nr_huge_pages) {
		if (!alloc_fresh_huge_page())
			return nr_huge_pages;
	}
	if (count >= nr_huge_pages)
		return nr_huge_pages;

	spin_lock(&hugetlb_lock);
	count = max(count, resv_huge_pages);
	try_to_free_low(count);
	while (count < nr_huge_pages) {
		struct page *page = dequeue_huge_page(NULL, 0);
		if (!page)
			break;
		update_and_free_page(page);
	}
	spin_unlock(&hugetlb_lock);
	return nr_huge_pages;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}
#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"HugePages_Rsvd:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			resv_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}

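/*
 * Sample of the /proc/meminfo fragment built by hugetlb_report_meminfo()
 * above (values are illustrative only):
 *
 *	HugePages_Total:    64
 *	HugePages_Free:     60
 *	HugePages_Rsvd:      4
 *	Hugepagesize:     2048 kB
 */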
/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we
 * get this far.
 */
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
				unsigned long address, int *unused)
{
	BUG();
	return NULL;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.nopage = hugetlb_nopage,
};

static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	ptep_set_access_flags(vma, address, ptep, entry, 1);
	update_mmu_cache(vma, address, entry);
	lazy_mmu_prot_update(entry);
}

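/*
 * Called at fork() time: share the parent's huge pages with the child
 * rather than copying them.  For private writable mappings the source
 * PTEs are write-protected as well, so a later write in either mm takes
 * the hugetlb_cow() path.
 */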
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;
		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);

	/* Update high watermark before we lower rss */
	update_hiwater_rss(mm);

	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		put_page(page);
		add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
	}

	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
}

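/*
 * Handle a write fault on a huge page that is mapped read-only.  If we
 * are the only user, the mapping is simply made writable; otherwise a
 * new huge page is allocated, the contents are copied (with the page
 * table lock dropped around the copy), and the PTE is re-checked for a
 * racing update before being pointed at the new page.
 */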
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return VM_FAULT_MINOR;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (!new_page) {
		page_cache_release(old_page);
		return VM_FAULT_OOM;
	}

	spin_unlock(&mm->page_table_lock);
	copy_huge_page(new_page, old_page, address);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return VM_FAULT_MINOR;
}

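/*
 * Fault in a huge page that has no PTE yet: look it up in (or add it
 * to) the page cache of the backing hugetlbfs file, charge the quota,
 * and install the PTE under page_table_lock while holding the page
 * lock to guard against a racing truncate.
 */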
int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		if (hugetlb_get_quota(mapping))
			goto out;
		page = alloc_huge_page(vma, address);
		if (!page) {
			hugetlb_put_quota(mapping);
			ret = VM_FAULT_OOM;
			goto out;
		}
		clear_huge_page(page, address);

		if (vma->vm_flags & VM_SHARED) {
			int err;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				hugetlb_put_quota(mapping);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = VM_FAULT_MINOR;
	if (!pte_none(*ptep))
		goto backout;

	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	hugetlb_put_quota(mapping);
	unlock_page(page);
	put_page(page);
	goto out;
}

int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;
	static DEFINE_MUTEX(hugetlb_instantiation_mutex);

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	/*
	 * Serialize hugepage allocation and instantiation, so that we don't
	 * get spurious allocation failures if two CPUs race to instantiate
	 * the same page in the page cache.
	 */
	mutex_lock(&hugetlb_instantiation_mutex);
	entry = *ptep;
	if (pte_none(entry)) {
		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
		mutex_unlock(&hugetlb_instantiation_mutex);
		return ret;
	}

	ret = VM_FAULT_MINOR;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);
	mutex_unlock(&hugetlb_instantiation_mutex);

	return ret;
}

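/*
 * get_user_pages() support for hugetlb VMAs: walk the range, faulting
 * pages in as needed, and fill in the pages[]/vmas[] arrays.  pfn_offset
 * steps through the constituent small pages of each huge page so the
 * page tables are not re-walked for every PAGE_SIZE step.
 */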
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i)
{
	unsigned long pfn_offset;
	unsigned long vaddr = *position;
	int remainder = *length;

	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte)) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, 0);
			spin_lock(&mm->page_table_lock);
			if (ret == VM_FAULT_MINOR)
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
		page = pte_page(*pte);
same_page:
		if (pages) {
			get_page(page);
			pages[i] = page + pfn_offset;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++pfn_offset;
		--remainder;
		++i;
		if (vaddr < vma->vm_end && remainder &&
				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
			/*
			 * We use pfn_offset to avoid touching the pageframes
			 * of this compound page.
			 */
			goto same_page;
		}
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}

void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		if (!pte_none(*ptep)) {
			pte = huge_ptep_get_and_clear(mm, address, ptep);
			pte = pte_mkhuge(pte_modify(pte, newprot));
			set_huge_pte_at(mm, address, ptep, pte);
			lazy_mmu_prot_update(pte);
		}
	}
	spin_unlock(&mm->page_table_lock);

	flush_tlb_range(vma, start, end);
}

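/*
 * The code below tracks huge page reservations for a mapping as a
 * sorted list of [from, to) file regions hanging off the inode's
 * mapping->private_list.  A short worked example (illustrative only):
 * reserving pages 0-4 and later 2-6 leaves a single region [0, 6);
 * region_chg() reports how many *new* pages the second reservation
 * needs (2 here), and region_add() then merges the overlapping ranges.
 */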
struct file_region {
	struct list_head link;
	long from;
	long to;
};

static long region_add(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg, *trg;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher then extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	return 0;
}

static long region_chg(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg;
	long chg = 0;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle, allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
		if (!nrg)
			return -ENOMEM;
		nrg->from = f;
		nrg->to   = f;
		INIT_LIST_HEAD(&nrg->link);
		list_add(&nrg->link, rg->link.prev);

		return t - f;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			return chg;

		/* We overlap with this area, if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
	return chg;
}

static long region_truncate(struct list_head *head, long end)
{
	struct file_region *rg, *trg;
	long chg = 0;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	if (&rg->link == head)
		return 0;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}
	return chg;
}

static int hugetlb_acct_memory(long delta)
{
	int ret = -ENOMEM;

	spin_lock(&hugetlb_lock);
	if ((delta + resv_huge_pages) <= free_huge_pages) {
		resv_huge_pages += delta;
		ret = 0;
	}
	spin_unlock(&hugetlb_lock);
	return ret;
}

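/*
 * Reserve huge pages for a mapping of [from, to) at mmap() time so a
 * later fault on a shared mapping cannot fail with SIGBUS.  region_chg()
 * computes how many pages are not yet covered by an existing
 * reservation, hugetlb_acct_memory() checks them against the free pool,
 * and region_add() records the new range.
 */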
int hugetlb_reserve_pages(struct inode *inode, long from, long to)
{
	long ret, chg;

	chg = region_chg(&inode->i_mapping->private_list, from, to);
	if (chg < 0)
		return chg;
	ret = hugetlb_acct_memory(chg);
	if (ret < 0)
		return ret;
	region_add(&inode->i_mapping->private_list, from, to);
	return 0;
}

void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
	long chg = region_truncate(&inode->i_mapping->private_list, offset);

	hugetlb_acct_memory(freed - chg);
}