Merge master.kernel.org:/pub/scm/linux/kernel/git/paulus/ppc64-2.6

diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
index d3bf86a5c1ad40a745ded13bdf9d9b6aa80da0c8..338771ec70d7622775190417f69085496ac2e209 100644
--- a/arch/ppc64/mm/hugetlbpage.c
+++ b/arch/ppc64/mm/hugetlbpage.c
 
 #include <linux/sysctl.h>
 
-#define        HUGEPGDIR_SHIFT         (HPAGE_SHIFT + PAGE_SHIFT - 3)
-#define HUGEPGDIR_SIZE         (1UL << HUGEPGDIR_SHIFT)
-#define HUGEPGDIR_MASK         (~(HUGEPGDIR_SIZE-1))
+#define NUM_LOW_AREAS  (0x100000000UL >> SID_SHIFT)
+#define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
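The old per-process hugepage page directory gives way to an "area" scheme: the space below 4GB is split into 256MB segments (NUM_LOW_AREAS of them, tracked in a 16-bit mask) and the space above into much larger areas of 1 << HTLB_AREA_SHIFT bytes each. A rough userspace model of the mask arithmetic, assuming SID_SHIFT = 28 as in the ppc64 headers of this era; low_esid_mask is a hypothetical stand-in for the kernel's LOW_ESID_MASK macro:

    #include <stdio.h>
    #include <stdint.h>

    /* Assumed values from the asm-ppc64 headers of this era */
    #define SID_SHIFT     28                                /* 256MB segments */
    #define NUM_LOW_AREAS (0x100000000UL >> SID_SHIFT)      /* 16 */

    /* Hypothetical stand-in for LOW_ESID_MASK: which low-area bits
     * does the range [addr, addr+len) touch? */
    static uint16_t low_esid_mask(unsigned long addr, unsigned long len)
    {
        unsigned long first = addr >> SID_SHIFT;
        unsigned long last = (addr + len - 1) >> SID_SHIFT;
        uint16_t mask = 0;

        for (unsigned long i = first; i <= last && i < NUM_LOW_AREAS; i++)
            mask |= 1U << i;
        return mask;
    }

    int main(void)
    {
        /* a 512MB mapping at 0x30000000 spans segments 3 and 4 */
        printf("mask = 0x%04x\n", low_esid_mask(0x30000000UL, 0x20000000UL));
        return 0;
    }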
 
-#define HUGEPTE_INDEX_SIZE     9
-#define HUGEPGD_INDEX_SIZE     10
-
-#define PTRS_PER_HUGEPTE       (1 << HUGEPTE_INDEX_SIZE)
-#define PTRS_PER_HUGEPGD       (1 << HUGEPGD_INDEX_SIZE)
-
-static inline int hugepgd_index(unsigned long addr)
-{
-       return (addr & ~REGION_MASK) >> HUGEPGDIR_SHIFT;
-}
-
-static pud_t *hugepgd_offset(struct mm_struct *mm, unsigned long addr)
+/* Modelled after find_linux_pte() */
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
-       int index;
+       pgd_t *pg;
+       pud_t *pu;
+       pmd_t *pm;
+       pte_t *pt;
 
-       if (! mm->context.huge_pgdir)
-               return NULL;
+       BUG_ON(! in_hugepage_area(mm->context, addr));
 
+       addr &= HPAGE_MASK;
+
+       pg = pgd_offset(mm, addr);
+       if (!pgd_none(*pg)) {
+               pu = pud_offset(pg, addr);
+               if (!pud_none(*pu)) {
+                       pm = pmd_offset(pu, addr);
+                       pt = (pte_t *)pm;
+                       BUG_ON(!pmd_none(*pm)
+                              && !(pte_present(*pt) && pte_huge(*pt)));
+                       return pt;
+               }
+       }
 
-       index = hugepgd_index(addr);
-       BUG_ON(index >= PTRS_PER_HUGEPGD);
-       return (pud_t *)(mm->context.huge_pgdir + index);
+       return NULL;
 }
 
-static inline pte_t *hugepte_offset(pud_t *dir, unsigned long addr)
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
-       int index;
+       pgd_t *pg;
+       pud_t *pu;
+       pmd_t *pm;
+       pte_t *pt;
 
-       if (pud_none(*dir))
-               return NULL;
-
-       index = (addr >> HPAGE_SHIFT) % PTRS_PER_HUGEPTE;
-       return (pte_t *)pud_page(*dir) + index;
-}
-
-static pud_t *hugepgd_alloc(struct mm_struct *mm, unsigned long addr)
-{
        BUG_ON(! in_hugepage_area(mm->context, addr));
 
-       if (! mm->context.huge_pgdir) {
-               pgd_t *new;
-               spin_unlock(&mm->page_table_lock);
-               /* Don't use pgd_alloc(), because we want __GFP_REPEAT */
-               new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
-               BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
-               spin_lock(&mm->page_table_lock);
-
-               /*
-                * Because we dropped the lock, we should re-check the
-                * entry, as somebody else could have populated it..
-                */
-               if (mm->context.huge_pgdir)
-                       pgd_free(new);
-               else
-                       mm->context.huge_pgdir = new;
-       }
-       return hugepgd_offset(mm, addr);
-}
+       addr &= HPAGE_MASK;
 
-static pte_t *hugepte_alloc(struct mm_struct *mm, pud_t *dir, unsigned long addr)
-{
-       if (! pud_present(*dir)) {
-               pte_t *new;
+       pg = pgd_offset(mm, addr);
+       pu = pud_alloc(mm, pg, addr);
 
-               spin_unlock(&mm->page_table_lock);
-               new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
-               BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
-               spin_lock(&mm->page_table_lock);
-               /*
-                * Because we dropped the lock, we should re-check the
-                * entry, as somebody else could have populated it..
-                */
-               if (pud_present(*dir)) {
-                       if (new)
-                               kmem_cache_free(zero_cache, new);
-               } else {
-                       struct page *ptepage;
-
-                       if (! new)
-                               return NULL;
-                       ptepage = virt_to_page(new);
-                       ptepage->mapping = (void *) mm;
-                       ptepage->index = addr & HUGEPGDIR_MASK;
-                       pud_populate(mm, dir, new);
+       if (pu) {
+               pm = pmd_alloc(mm, pu, addr);
+               if (pm) {
+                       pt = (pte_t *)pm;
+                       BUG_ON(!pmd_none(*pm)
+                              && !(pte_present(*pt) && pte_huge(*pt)));
+                       return pt;
                }
        }
 
-       return hugepte_offset(dir, addr);
+       return NULL;
 }
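With hugepage PTEs now living in the normal page tables, both walkers stop at the PMD level and reinterpret the pmd_t slot as a pte_t; the BUG_ON asserts the slot is either empty or already holds a present huge PTE. A minimal sketch of stopping the walk one level early, using a toy two-level table (all names and sizes hypothetical):

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>

    #define PTRS      512       /* entries per table level (assumed) */
    #define HUGE_FLAG 0x1UL     /* toy "this is a huge pte" marker */

    typedef uintptr_t entry_t;

    static entry_t pgd[PTRS];   /* toy top level */

    /* Model of huge_pte_offset(): descend to the pmd level and return
     * the pmd slot itself as the pte -- no further level for hugepages. */
    static entry_t *toy_huge_pte_offset(unsigned long idx)
    {
        entry_t *pmd_page;

        if (!pgd[idx / PTRS])
            return NULL;
        pmd_page = (entry_t *)pgd[idx / PTRS];
        return &pmd_page[idx % PTRS];
    }

    int main(void)
    {
        pgd[0] = (entry_t)calloc(PTRS, sizeof(entry_t));

        entry_t *pt = toy_huge_pte_offset(5);
        *pt = 0x1000 | HUGE_FLAG;   /* install a huge entry in the pmd slot */
        printf("slot 5 = %#lx\n", (unsigned long)*pt);

        free((void *)pgd[0]);
        return 0;
    }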
 
-static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
-{
-       pud_t *pud;
-
-       BUG_ON(! in_hugepage_area(mm->context, addr));
-
-       pud = hugepgd_offset(mm, addr);
-       if (! pud)
-               return NULL;
+#define HUGEPTE_BATCH_SIZE     (HPAGE_SIZE / PMD_SIZE)
 
-       return hugepte_offset(pud, addr);
-}
-
-static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+                    pte_t *ptep, pte_t pte)
 {
-       pud_t *pud;
-
-       BUG_ON(! in_hugepage_area(mm->context, addr));
+       int i;
 
-       pud = hugepgd_alloc(mm, addr);
-       if (! pud)
-               return NULL;
+       if (pte_present(*ptep)) {
+               pte_clear(mm, addr, ptep);
+               flush_tlb_pending();
+       }
 
-       return hugepte_alloc(mm, pud, addr);
+       for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) {
+               *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
+               ptep++;
+       }
 }
 
-static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma,
-                        unsigned long addr, struct page *page,
-                        pte_t *ptep, int write_access)
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+                             pte_t *ptep)
 {
-       pte_t entry;
+       unsigned long old = pte_update(ptep, ~0UL);
+       int i;
 
-       add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
-       if (write_access) {
-               entry =
-                   pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
-       } else {
-               entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
-       }
-       entry = pte_mkyoung(entry);
-       entry = pte_mkhuge(entry);
+       if (old & _PAGE_HASHPTE)
+               hpte_update(mm, addr, old, 0);
+
+       for (i = 1; i < HUGEPTE_BATCH_SIZE; i++)
+               ptep[i] = __pte(0);
 
-       set_pte_at(mm, addr, ptep, entry);
+       return __pte(old);
 }
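Since one hugepage spans several consecutive PMD slots, HUGEPTE_BATCH_SIZE = HPAGE_SIZE / PMD_SIZE entries must be kept in lockstep: set_huge_pte_at() replicates the PTE (minus _PAGE_HPTEFLAGS) into every slot, and huge_ptep_get_and_clear() clears slot 0 via pte_update() and zeroes the rest by hand. A userspace model, assuming the usual ppc64 values of a 16MB hugepage over 2MB PMDs (batch of 8):

    #include <stdio.h>

    #define HUGEPTE_BATCH_SIZE 8    /* assumed: 16MB hugepage / 2MB PMD */

    /* Model of set_huge_pte_at(): write the same value into every
     * pmd slot covered by the hugepage. */
    static void toy_set_huge_pte_at(unsigned long *ptep, unsigned long pte)
    {
        for (int i = 0; i < HUGEPTE_BATCH_SIZE; i++)
            ptep[i] = pte;
    }

    /* Model of huge_ptep_get_and_clear(): fetch slot 0, zero them all. */
    static unsigned long toy_huge_ptep_get_and_clear(unsigned long *ptep)
    {
        unsigned long old = ptep[0];

        ptep[0] = 0;                /* kernel: pte_update(ptep, ~0UL) */
        for (int i = 1; i < HUGEPTE_BATCH_SIZE; i++)
            ptep[i] = 0;
        return old;
    }

    int main(void)
    {
        unsigned long slots[HUGEPTE_BATCH_SIZE];

        toy_set_huge_pte_at(slots, 0xdeadbeef);
        printf("cleared: %#lx\n", toy_huge_ptep_get_and_clear(slots));
        return 0;
    }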
 
 /*
@@ -181,29 +132,69 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
        return 0;
 }
 
-static void flush_segments(void *parm)
+static void flush_low_segments(void *parm)
 {
-       u16 segs = (unsigned long) parm;
+       u16 areas = (unsigned long) parm;
        unsigned long i;
 
        asm volatile("isync" : : : "memory");
 
-       for (i = 0; i < 16; i++) {
-               if (! (segs & (1U << i)))
+       BUILD_BUG_ON((sizeof(areas)*8) != NUM_LOW_AREAS);
+
+       for (i = 0; i < NUM_LOW_AREAS; i++) {
+               if (! (areas & (1U << i)))
                        continue;
-               asm volatile("slbie %0" : : "r" (i << SID_SHIFT));
+               asm volatile("slbie %0"
+                            : : "r" ((i << SID_SHIFT) | SLBIE_C));
        }
 
        asm volatile("isync" : : : "memory");
 }
 
-static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg)
+static void flush_high_segments(void *parm)
 {
-       unsigned long start = seg << SID_SHIFT;
-       unsigned long end = (seg+1) << SID_SHIFT;
+       u16 areas = (unsigned long) parm;
+       unsigned long i, j;
+
+       asm volatile("isync" : : : "memory");
+
+       BUILD_BUG_ON((sizeof(areas)*8) != NUM_HIGH_AREAS);
+
+       for (i = 0; i < NUM_HIGH_AREAS; i++) {
+               if (! (areas & (1U << i)))
+                       continue;
+               for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
+                       asm volatile("slbie %0"
+                                    :: "r" (((i << HTLB_AREA_SHIFT)
+                                            + (j << SID_SHIFT)) | SLBIE_C));
+       }
+
+       asm volatile("isync" : : : "memory");
+}
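flush_low_segments() issues one slbie per opened 256MB segment; flush_high_segments() must cover every 256MB segment inside each opened high area, hence the inner loop over 1 << (HTLB_AREA_SHIFT - SID_SHIFT) segments. The SLBIE_C bit presumably tags the invalidation with the hugepage segment class. A model of the loop bounds, with the shift values assumed and no real slbie executed:

    #include <stdio.h>

    #define SID_SHIFT       28      /* assumed: 256MB segments */
    #define HTLB_AREA_SHIFT 40      /* assumed: high-area size */

    /* Model of flush_high_segments(): every 256MB segment inside each
     * selected area needs its own slbie, hence the nested loop. */
    static unsigned long count_flushes(unsigned short areas)
    {
        unsigned long n = 0;

        for (unsigned long i = 0; i < sizeof(areas) * 8; i++) {
            if (!(areas & (1U << i)))
                continue;
            for (unsigned long j = 0;
                 j < (1UL << (HTLB_AREA_SHIFT - SID_SHIFT)); j++)
                n++;    /* kernel: slbie of (i << HTLB_AREA_SHIFT)
                           + (j << SID_SHIFT), with SLBIE_C set */
        }
        return n;
    }

    int main(void)
    {
        printf("%lu slbie per area\n", count_flushes(0x1));   /* 4096 */
        return 0;
    }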
+
+static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area)
+{
+       unsigned long start = area << SID_SHIFT;
+       unsigned long end = (area+1) << SID_SHIFT;
+       struct vm_area_struct *vma;
+
+       BUG_ON(area >= NUM_LOW_AREAS);
+
+       /* Check no VMAs are in the region */
+       vma = find_vma(mm, start);
+       if (vma && (vma->vm_start < end))
+               return -EBUSY;
+
+       return 0;
+}
+
+static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
+{
+       unsigned long start = area << HTLB_AREA_SHIFT;
+       unsigned long end = (area+1) << HTLB_AREA_SHIFT;
        struct vm_area_struct *vma;
 
-       BUG_ON(seg >= 16);
+       BUG_ON(area >= NUM_HIGH_AREAS);
 
        /* Check no VMAs are in the region */
        vma = find_vma(mm, start);
@@ -213,20 +204,23 @@ static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg)
        return 0;
 }
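Both prepare_*_area_for_htlb() helpers only check that the candidate area contains no mappings: find_vma() returns the first VMA ending above start, and if that VMA begins below end the area is busy. The interval test in isolation, with a hypothetical list-based find_vma:

    #include <stdio.h>

    struct vma { unsigned long start, end; struct vma *next; };

    /* Model of find_vma(): first vma with end > addr (list sorted by start). */
    static struct vma *toy_find_vma(struct vma *head, unsigned long addr)
    {
        for (struct vma *v = head; v; v = v->next)
            if (addr < v->end)
                return v;
        return NULL;
    }

    /* Model of prepare_*_area_for_htlb(): busy if any vma overlaps the area. */
    static int area_is_free(struct vma *head, unsigned long start,
                            unsigned long end)
    {
        struct vma *v = toy_find_vma(head, start);

        return !(v && v->start < end);
    }

    int main(void)
    {
        struct vma v = { 0x10000000, 0x10001000, NULL };

        printf("%d %d\n",
               area_is_free(&v, 0x00000000, 0x10000000),    /* 1: free */
               area_is_free(&v, 0x10000000, 0x20000000));   /* 0: busy */
        return 0;
    }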
 
-static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs)
+static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
 {
        unsigned long i;
 
-       newsegs &= ~(mm->context.htlb_segs);
-       if (! newsegs)
+       BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
+       BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);
+
+       newareas &= ~(mm->context.low_htlb_areas);
+       if (! newareas)
                return 0; /* The segments we want are already open */
 
-       for (i = 0; i < 16; i++)
-               if ((1 << i) & newsegs)
-                       if (prepare_low_seg_for_htlb(mm, i) != 0)
+       for (i = 0; i < NUM_LOW_AREAS; i++)
+               if ((1 << i) & newareas)
+                       if (prepare_low_area_for_htlb(mm, i) != 0)
                                return -EBUSY;
 
-       mm->context.htlb_segs |= newsegs;
+       mm->context.low_htlb_areas |= newareas;
 
        /* update the paca copy of the context struct */
        get_paca()->context = mm->context;
@@ -234,103 +228,63 @@ static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs)
        /* the context change must make it to memory before the flush,
         * so that further SLB misses do the right thing. */
        mb();
-       on_each_cpu(flush_segments, (void *)(unsigned long)newsegs, 0, 1);
+       on_each_cpu(flush_low_segments, (void *)(unsigned long)newareas, 0, 1);
 
        return 0;
 }
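open_low_hpage_areas() masks off areas that are already open, verifies each remaining one is empty, commits the new mask to the context and the paca copy, and only then (after mb()) broadcasts the SLB flush so that concurrent SLB misses see the new state. A condensed model of the bookkeeping, without the SMP machinery:

    #include <stdio.h>

    struct ctx { unsigned short low_htlb_areas; };

    static int prepare_area(unsigned long i) { (void)i; return 0; } /* stub */

    /* Model of open_low_hpage_areas(): only newly requested bits are
     * validated and merged; already-open areas are a no-op. */
    static int toy_open_low_hpage_areas(struct ctx *c, unsigned short newareas)
    {
        newareas &= ~c->low_htlb_areas;
        if (!newareas)
            return 0;               /* already open */

        for (unsigned long i = 0; i < 16; i++)
            if ((newareas & (1U << i)) && prepare_area(i) != 0)
                return -1;          /* -EBUSY in the kernel */

        c->low_htlb_areas |= newareas;
        /* kernel: copy to the paca, mb(), then IPI flush_low_segments() */
        return 0;
    }

    int main(void)
    {
        struct ctx c = { 0x0003 };

        toy_open_low_hpage_areas(&c, 0x000e);
        printf("areas = 0x%04x\n", c.low_htlb_areas);   /* 0x000f */
        return 0;
    }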
 
-int prepare_hugepage_range(unsigned long addr, unsigned long len)
-{
-       if (within_hugepage_high_range(addr, len))
-               return 0;
-       else if ((addr < 0x100000000UL) && ((addr+len) < 0x100000000UL)) {
-               int err;
-               /* Yes, we need both tests, in case addr+len overflows
-                * 64-bit arithmetic */
-               err = open_low_hpage_segs(current->mm,
-                                         LOW_ESID_MASK(addr, len));
-               if (err)
-                       printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
-                              " failed (segs: 0x%04hx)\n", addr, len,
-                              LOW_ESID_MASK(addr, len));
-               return err;
-       }
-
-       return -EINVAL;
-}
-
-int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
-                       struct vm_area_struct *vma)
+static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
 {
-       pte_t *src_pte, *dst_pte, entry;
-       struct page *ptepage;
-       unsigned long addr = vma->vm_start;
-       unsigned long end = vma->vm_end;
-       int err = -ENOMEM;
-
-       while (addr < end) {
-               dst_pte = huge_pte_alloc(dst, addr);
-               if (!dst_pte)
-                       goto out;
-
-               src_pte = huge_pte_offset(src, addr);
-               entry = *src_pte;
-               
-               ptepage = pte_page(entry);
-               get_page(ptepage);
-               add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
-               set_pte_at(dst, addr, dst_pte, entry);
-
-               addr += HPAGE_SIZE;
-       }
-
-       err = 0;
- out:
-       return err;
-}
+       unsigned long i;
 
-int
-follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
-                   struct page **pages, struct vm_area_struct **vmas,
-                   unsigned long *position, int *length, int i)
-{
-       unsigned long vpfn, vaddr = *position;
-       int remainder = *length;
+       BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
+       BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8)
+                    != NUM_HIGH_AREAS);
 
-       WARN_ON(!is_vm_hugetlb_page(vma));
+       newareas &= ~(mm->context.high_htlb_areas);
+       if (! newareas)
+               return 0; /* The areas we want are already open */
 
-       vpfn = vaddr/PAGE_SIZE;
-       while (vaddr < vma->vm_end && remainder) {
-               if (pages) {
-                       pte_t *pte;
-                       struct page *page;
+       for (i = 0; i < NUM_HIGH_AREAS; i++)
+               if ((1 << i) & newareas)
+                       if (prepare_high_area_for_htlb(mm, i) != 0)
+                               return -EBUSY;
 
-                       pte = huge_pte_offset(mm, vaddr);
+       mm->context.high_htlb_areas |= newareas;
 
-                       /* hugetlb should be locked, and hence, prefaulted */
-                       WARN_ON(!pte || pte_none(*pte));
+       /* update the paca copy of the context struct */
+       get_paca()->context = mm->context;
 
-                       page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+       /* the context change must make it to memory before the flush,
+        * so that further SLB misses do the right thing. */
+       mb();
+       on_each_cpu(flush_high_segments, (void *)(unsigned long)newareas, 0, 1);
 
-                       WARN_ON(!PageCompound(page));
+       return 0;
+}
 
-                       get_page(page);
-                       pages[i] = page;
-               }
+int prepare_hugepage_range(unsigned long addr, unsigned long len)
+{
+       int err;
 
-               if (vmas)
-                       vmas[i] = vma;
+       if ( (addr+len) < addr )
+               return -EINVAL;
 
-               vaddr += PAGE_SIZE;
-               ++vpfn;
-               --remainder;
-               ++i;
+       if ((addr + len) < 0x100000000UL)
+               err = open_low_hpage_areas(current->mm,
+                                         LOW_ESID_MASK(addr, len));
+       else
+               err = open_high_hpage_areas(current->mm,
+                                           HTLB_AREA_MASK(addr, len));
+       if (err) {
+               printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
+                      " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n",
+                      addr, len,
+                      LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
+               return err;
        }
 
-       *length = remainder;
-       *position = vaddr;
-
-       return i;
+       return 0;
 }
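prepare_hugepage_range() now serves both regions: the (addr + len) < addr test catches 64-bit wraparound up front (replacing the old pair of < 0x100000000UL comparisons), after which ranges ending below 4GB take the low-area path and everything else the high-area path. The wraparound guard in isolation:

    #include <stdio.h>

    /* Model of the guard in prepare_hugepage_range(): with 64-bit
     * unsigned arithmetic, addr + len < addr iff the sum wrapped. */
    static int range_ok(unsigned long addr, unsigned long len)
    {
        if (addr + len < addr)
            return -1;              /* -EINVAL in the kernel */
        return 0;
    }

    int main(void)
    {
        printf("%d\n", range_ok(0xffffffffffffffffUL, 0x1000)); /* -1 */
        printf("%d\n", range_ok(0x100000000UL, 0x1000000));     /*  0 */
        return 0;
    }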
 
 struct page *
@@ -363,89 +317,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
        return NULL;
 }
 
-void unmap_hugepage_range(struct vm_area_struct *vma,
-                         unsigned long start, unsigned long end)
-{
-       struct mm_struct *mm = vma->vm_mm;
-       unsigned long addr;
-       pte_t *ptep;
-       struct page *page;
-
-       WARN_ON(!is_vm_hugetlb_page(vma));
-       BUG_ON((start % HPAGE_SIZE) != 0);
-       BUG_ON((end % HPAGE_SIZE) != 0);
-
-       for (addr = start; addr < end; addr += HPAGE_SIZE) {
-               pte_t pte;
-
-               ptep = huge_pte_offset(mm, addr);
-               if (!ptep || pte_none(*ptep))
-                       continue;
-
-               pte = *ptep;
-               page = pte_page(pte);
-               pte_clear(mm, addr, ptep);
-
-               put_page(page);
-       }
-       add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
-       flush_tlb_pending();
-}
-
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
-{
-       struct mm_struct *mm = current->mm;
-       unsigned long addr;
-       int ret = 0;
-
-       WARN_ON(!is_vm_hugetlb_page(vma));
-       BUG_ON((vma->vm_start % HPAGE_SIZE) != 0);
-       BUG_ON((vma->vm_end % HPAGE_SIZE) != 0);
-
-       spin_lock(&mm->page_table_lock);
-       for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
-               unsigned long idx;
-               pte_t *pte = huge_pte_alloc(mm, addr);
-               struct page *page;
-
-               if (!pte) {
-                       ret = -ENOMEM;
-                       goto out;
-               }
-               if (! pte_none(*pte))
-                       continue;
-
-               idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-                       + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-               page = find_get_page(mapping, idx);
-               if (!page) {
-                       /* charge the fs quota first */
-                       if (hugetlb_get_quota(mapping)) {
-                               ret = -ENOMEM;
-                               goto out;
-                       }
-                       page = alloc_huge_page();
-                       if (!page) {
-                               hugetlb_put_quota(mapping);
-                               ret = -ENOMEM;
-                               goto out;
-                       }
-                       ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
-                       if (! ret) {
-                               unlock_page(page);
-                       } else {
-                               hugetlb_put_quota(mapping);
-                               free_huge_page(page);
-                               goto out;
-                       }
-               }
-               set_huge_pte(mm, vma, addr, page, pte, vma->vm_flags & VM_WRITE);
-       }
-out:
-       spin_unlock(&mm->page_table_lock);
-       return ret;
-}
-
 /* Because we have an exclusive hugepage region which lies within the
  * normal user address space, we have to take special measures to make
  * non-huge mmap()s evade the hugepage reserved regions. */
@@ -468,7 +339,12 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
                    && !is_hugepage_only_range(mm, addr,len))
                        return addr;
        }
-       start_addr = addr = mm->free_area_cache;
+       if (len > mm->cached_hole_size) {
+               start_addr = addr = mm->free_area_cache;
+       } else {
+               start_addr = addr = TASK_UNMAPPED_BASE;
+               mm->cached_hole_size = 0;
+       }
 
 full_search:
        vma = find_vma(mm, addr);
@@ -480,8 +356,8 @@ full_search:
                        vma = find_vma(mm, addr);
                        continue;
                }
-               if (touches_hugepage_high_range(addr, len)) {
-                       addr = TASK_HPAGE_END;
+               if (touches_hugepage_high_range(mm, addr, len)) {
+                       addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
                        vma = find_vma(mm, addr);
                        continue;
                }
@@ -492,6 +368,8 @@ full_search:
                        mm->free_area_cache = addr + len;
                        return addr;
                }
+               if (addr + mm->cached_hole_size < vma->vm_start)
+                       mm->cached_hole_size = vma->vm_start - addr;
                addr = vma->vm_end;
                vma = vma->vm_next;
        }
@@ -499,6 +377,7 @@ full_search:
        /* Make sure we didn't miss any holes */
        if (start_addr != TASK_UNMAPPED_BASE) {
                start_addr = addr = TASK_UNMAPPED_BASE;
+               mm->cached_hole_size = 0;
                goto full_search;
        }
        return -ENOMEM;
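These arch_get_unmapped_area() hunks hook up the free-area cache: mm->cached_hole_size remembers the largest hole the previous scan skipped, so a request no bigger than that restarts from TASK_UNMAPPED_BASE instead of rescanning from free_area_cache. A flattened model of first-fit with hole caching, where the slot array stands in for the VMA list:

    #include <stdio.h>

    #define NSLOT 16

    /* slots[] marks used address units; first-fit with a cache of the
     * largest hole seen during the scan. */
    static int slots[NSLOT] = { 1, 1, 0, 1, 0, 0, 0, 1 };
    static unsigned long free_area_cache = 0, cached_hole_size = 0;

    static long get_unmapped(unsigned long len)
    {
        unsigned long start = (len > cached_hole_size) ? free_area_cache : 0;

        if (len <= cached_hole_size)
            cached_hole_size = 0;   /* rescan from the base */

        for (unsigned long addr = start; addr + len <= NSLOT; addr++) {
            unsigned long run = 0;

            while (run < len && !slots[addr + run])
                run++;
            if (run == len) {
                free_area_cache = addr + len;
                return (long)addr;
            }
            if (run > cached_hole_size)
                cached_hole_size = run;     /* remember largest hole */
        }
        return -1;
    }

    int main(void)
    {
        printf("%ld\n", get_unmapped(3));           /* 4 */
        printf("hole cache %lu\n", cached_hole_size);
        return 0;
    }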
@@ -520,6 +399,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
        struct vm_area_struct *vma, *prev_vma;
        struct mm_struct *mm = current->mm;
        unsigned long base = mm->mmap_base, addr = addr0;
+       unsigned long largest_hole = mm->cached_hole_size;
        int first_time = 1;
 
        /* requested length too big for entire address space */
@@ -540,6 +420,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
                        return addr;
        }
 
+       if (len <= largest_hole) {
+               largest_hole = 0;
+               mm->free_area_cache = base;
+       }
 try_again:
        /* make sure it can fit in the remaining address space */
        if (mm->free_area_cache < len)
@@ -552,8 +436,9 @@ hugepage_recheck:
                if (touches_hugepage_low_range(mm, addr, len)) {
                        addr = (addr & ((~0) << SID_SHIFT)) - len;
                        goto hugepage_recheck;
-               } else if (touches_hugepage_high_range(addr, len)) {
-                       addr = TASK_HPAGE_BASE - len;
+               } else if (touches_hugepage_high_range(mm, addr, len)) {
+                       addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len;
+                       goto hugepage_recheck;
                }
 
                /*
@@ -568,13 +453,21 @@ hugepage_recheck:
                 * vma->vm_start, use it:
                 */
                if (addr+len <= vma->vm_start &&
-                               (!prev_vma || (addr >= prev_vma->vm_end)))
+                         (!prev_vma || (addr >= prev_vma->vm_end))) {
                        /* remember the address as a hint for next time */
-                       return (mm->free_area_cache = addr);
-               else
+                       mm->cached_hole_size = largest_hole;
+                       return (mm->free_area_cache = addr);
+               } else {
                        /* pull free_area_cache down to the first hole */
-                       if (mm->free_area_cache == vma->vm_end)
+                       if (mm->free_area_cache == vma->vm_end) {
                                mm->free_area_cache = vma->vm_start;
+                               mm->cached_hole_size = largest_hole;
+                       }
+               }
+
+               /* remember the largest hole we saw so far */
+               if (addr + largest_hole < vma->vm_start)
+                       largest_hole = vma->vm_start - addr;
 
                /* try just below the current vma->vm_start */
                addr = vma->vm_start-len;
@@ -587,6 +480,7 @@ fail:
         */
        if (first_time) {
                mm->free_area_cache = base;
+               largest_hole = 0;
                first_time = 0;
                goto try_again;
        }
@@ -597,11 +491,13 @@ fail:
         * allocations.
         */
        mm->free_area_cache = TASK_UNMAPPED_BASE;
+       mm->cached_hole_size = ~0UL;
        addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
        /*
         * Restore the topdown base:
         */
        mm->free_area_cache = base;
+       mm->cached_hole_size = ~0UL;
 
        return addr;
 }
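The topdown variant does the same in reverse: largest_hole accumulates while scanning down from mmap_base and is written back to mm->cached_hole_size when a slot is taken, and on failure the bottom-up allocator is retried with the cache poisoned to ~0UL (meaning "unknown, any hole might fit"). A sketch of the downward scan with hole tracking, using a sorted VMA array as a stand-in:

    #include <stdio.h>

    struct vma { unsigned long start, end; };   /* sorted by start */

    /* Model of the topdown path: try just below each vma, remembering
     * the largest hole passed over for the cached_hole_size update. */
    static long topdown_fit(const struct vma *v, int n, unsigned long base,
                            unsigned long len, unsigned long *largest_hole)
    {
        unsigned long ceil = base;              /* top of current hole */

        for (int i = n - 1; i >= 0; i--) {
            if (ceil - v[i].end >= len)
                return (long)(ceil - len);      /* fits above vma i */
            if (ceil - v[i].end > *largest_hole)
                *largest_hole = ceil - v[i].end;
            ceil = v[i].start;                  /* continue below vma i */
        }
        return (ceil >= len) ? (long)(ceil - len) : -1;
    }

    int main(void)
    {
        struct vma v[] = { { 0x1000, 0x2000 }, { 0x3000, 0x8000 } };
        unsigned long hole = 0;
        /* fit fails here; the kernel would fall back to the bottom-up
         * search with cached_hole_size reset to ~0UL */
        long fit = topdown_fit(v, 2, 0x9000, 0x1800, &hole);

        printf("fit=%ld hole=%#lx\n", fit, hole);
        return 0;
    }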
@@ -633,23 +529,28 @@ static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
        return -ENOMEM;
 }
 
-static unsigned long htlb_get_high_area(unsigned long len)
+static unsigned long htlb_get_high_area(unsigned long len, u16 areamask)
 {
-       unsigned long addr = TASK_HPAGE_BASE;
+       unsigned long addr = 0x100000000UL;
        struct vm_area_struct *vma;
 
        vma = find_vma(current->mm, addr);
-       for (vma = find_vma(current->mm, addr);
-            addr + len <= TASK_HPAGE_END;
-            vma = vma->vm_next) {
+       while (addr + len <= TASK_SIZE_USER64) {
                BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
-               BUG_ON(! within_hugepage_high_range(addr, len));
+
+               if (! __within_hugepage_high_range(addr, len, areamask)) {
+                       addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
+                       vma = find_vma(current->mm, addr);
+                       continue;
+               }
 
                if (!vma || (addr + len) <= vma->vm_start)
                        return addr;
                addr = ALIGN(vma->vm_end, HPAGE_SIZE);
-               /* Because we're in a hugepage region, this alignment
-                * should not skip us over any VMAs */
+               /* Depending on segmask this might not be a confirmed
+                * hugepage region, so the ALIGN could have skipped
+                * some VMAs */
+               vma = find_vma(current->mm, addr);
        }
 
        return -ENOMEM;
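htlb_get_high_area() now takes an areamask and walks upward from 4GB, hopping to the next area boundary whenever the current address falls in an area absent from the mask; because that hop (and the HPAGE_SIZE ALIGN) can skip over VMAs, the vma pointer is re-looked-up each time, unlike the old code which assumed one contiguous hugepage range. The mask-skip arithmetic in isolation (HTLB_AREA_SHIFT assumed; the mask must have at least one bit set or the loop never terminates):

    #include <stdio.h>

    #define HTLB_AREA_SHIFT 40              /* assumed */
    #define AREA_BASE 0x100000000UL         /* search starts at 4GB */

    /* Model of the areamask skip: advance addr to the base of the next
     * area whose bit is set in the mask. */
    static unsigned long skip_to_open_area(unsigned long addr,
                                           unsigned short mask)
    {
        while (!(mask & (1U << (addr >> HTLB_AREA_SHIFT))))
            addr = ((addr >> HTLB_AREA_SHIFT) + 1) << HTLB_AREA_SHIFT;
        return addr;
    }

    int main(void)
    {
        /* only area 2 open: the 4GB start hops to 2TB */
        printf("%#lx\n", skip_to_open_area(AREA_BASE, 0x4));
        return 0;
    }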
@@ -659,6 +560,9 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                                        unsigned long len, unsigned long pgoff,
                                        unsigned long flags)
 {
+       int lastshift;
+       u16 areamask, curareas;
+
        if (len & ~HPAGE_MASK)
                return -EINVAL;
 
@@ -666,67 +570,49 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                return -EINVAL;
 
        if (test_thread_flag(TIF_32BIT)) {
-               int lastshift = 0;
-               u16 segmask, cursegs = current->mm->context.htlb_segs;
+               curareas = current->mm->context.low_htlb_areas;
 
                /* First see if we can do the mapping in the existing
-                * low hpage segments */
-               addr = htlb_get_low_area(len, cursegs);
+                * low areas */
+               addr = htlb_get_low_area(len, curareas);
                if (addr != -ENOMEM)
                        return addr;
 
-               for (segmask = LOW_ESID_MASK(0x100000000UL-len, len);
-                    ! lastshift; segmask >>=1) {
-                       if (segmask & 1)
+               lastshift = 0;
+               for (areamask = LOW_ESID_MASK(0x100000000UL-len, len);
+                    ! lastshift; areamask >>=1) {
+                       if (areamask & 1)
                                lastshift = 1;
 
-                       addr = htlb_get_low_area(len, cursegs | segmask);
+                       addr = htlb_get_low_area(len, curareas | areamask);
                        if ((addr != -ENOMEM)
-                           && open_low_hpage_segs(current->mm, segmask) == 0)
+                           && open_low_hpage_areas(current->mm, areamask) == 0)
                                return addr;
                }
-               printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
-                      " enough segments\n");
-               return -ENOMEM;
        } else {
-               return htlb_get_high_area(len);
-       }
-}
+               curareas = current->mm->context.high_htlb_areas;
 
-void hugetlb_mm_free_pgd(struct mm_struct *mm)
-{
-       int i;
-       pgd_t *pgdir;
-
-       spin_lock(&mm->page_table_lock);
-
-       pgdir = mm->context.huge_pgdir;
-       if (! pgdir)
-               goto out;
-
-       mm->context.huge_pgdir = NULL;
-
-       /* cleanup any hugepte pages leftover */
-       for (i = 0; i < PTRS_PER_HUGEPGD; i++) {
-               pud_t *pud = (pud_t *)(pgdir + i);
-
-               if (! pud_none(*pud)) {
-                       pte_t *pte = (pte_t *)pud_page(*pud);
-                       struct page *ptepage = virt_to_page(pte);
+               /* First see if we can do the mapping in the existing
+                * high areas */
+               addr = htlb_get_high_area(len, curareas);
+               if (addr != -ENOMEM)
+                       return addr;
 
-                       ptepage->mapping = NULL;
+               lastshift = 0;
+               for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len);
+                    ! lastshift; areamask >>=1) {
+                       if (areamask & 1)
+                               lastshift = 1;
 
-                       BUG_ON(memcmp(pte, empty_zero_page, PAGE_SIZE));
-                       kmem_cache_free(zero_cache, pte);
+                       addr = htlb_get_high_area(len, curareas | areamask);
+                       if ((addr != -ENOMEM)
+                           && open_high_hpage_areas(current->mm, areamask) == 0)
+                               return addr;
                }
-               pud_clear(pud);
        }
-
-       BUG_ON(memcmp(pgdir, empty_zero_page, PAGE_SIZE));
-       kmem_cache_free(zero_cache, pgdir);
-
- out:
-       spin_unlock(&mm->page_table_lock);
+       printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
+              " enough areas\n");
+       return -ENOMEM;
 }
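The rewritten hugetlb_get_unmapped_area() treats the low and high regions symmetrically: first try the already-open areas, then build a mask from the highest placement that could hold len and shift it right one bit at a time, offering progressively lower areas until the lowest bit has been tried (lastshift). The widening loop in isolation:

    #include <stdio.h>

    /* Model of the widening loop: start from the mask of the highest
     * viable placement and shift right, offering ever lower areas
     * until the low bit has been tried. */
    static void try_masks(unsigned short initial)
    {
        int lastshift = 0;

        for (unsigned short areamask = initial; !lastshift; areamask >>= 1) {
            if (areamask & 1)
                lastshift = 1;
            printf("try areas 0x%04x\n", areamask);
            /* kernel: htlb_get_*_area(len, curareas | areamask), then
             * open_*_hpage_areas(current->mm, areamask) on success */
        }
    }

    int main(void)
    {
        try_masks(0x8000);      /* 0x8000, 0x4000, ..., 0x0001 */
        return 0;
    }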
 
 int hash_huge_page(struct mm_struct *mm, unsigned long access,
@@ -735,7 +621,7 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
        pte_t *ptep;
        unsigned long va, vpn;
        pte_t old_pte, new_pte;
-       unsigned long hpteflags, prpn;
+       unsigned long rflags, prpn;
        long slot;
        int err = 1;
 
@@ -778,9 +664,9 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
        old_pte = *ptep;
        new_pte = old_pte;
 
-       hpteflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW));
+       rflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW));
        /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
-       hpteflags |= ((pte_val(new_pte) & _PAGE_EXEC) ? 0 : HW_NO_EXEC);
+       rflags |= ((pte_val(new_pte) & _PAGE_EXEC) ? 0 : HW_NO_EXEC);
 
        /* Check if pte already has an hpte (case 2) */
        if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) {
@@ -793,7 +679,7 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
                slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
                slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12;
 
-               if (ppc_md.hpte_updatepp(slot, hpteflags, va, 1, local) == -1)
+               if (ppc_md.hpte_updatepp(slot, rflags, va, 1, local) == -1)
                        pte_val(old_pte) &= ~_PAGE_HPTEFLAGS;
        }
 
@@ -813,10 +699,10 @@ repeat:
 
                /* Add in WIMG bits */
                /* XXX We should store these in the pte */
-               hpteflags |= _PAGE_COHERENT;
+               rflags |= _PAGE_COHERENT;
 
-               slot = ppc_md.hpte_insert(hpte_group, va, prpn, 0,
-                                         hpteflags, 0, 1);
+               slot = ppc_md.hpte_insert(hpte_group, va, prpn,
+                                         HPTE_V_LARGE, rflags);
 
                /* Primary is full, try the secondary */
                if (unlikely(slot == -1)) {
@@ -824,7 +710,7 @@ repeat:
                        hpte_group = ((~hash & htab_hash_mask) *
                                      HPTES_PER_GROUP) & ~0x7UL; 
                        slot = ppc_md.hpte_insert(hpte_group, va, prpn,
-                                                 1, hpteflags, 0, 1);
+                                                 HPTE_V_LARGE, rflags);
                        if (slot == -1) {
                                if (mftb() & 0x1)
                                        hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
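The hash_huge_page() hunks are mostly the hpteflags -> rflags rename plus the new ppc_md.hpte_insert() calling convention, where HPTE_V_LARGE is passed in a flags word instead of the old separate integer arguments. The rflags derivation in isolation, with placeholder bit values (the real _PAGE_* and HW_NO_EXEC constants live in the asm-ppc64 headers):

    #include <stdio.h>

    /* Placeholder pte bits for illustration only */
    #define _PAGE_RW   0x004
    #define _PAGE_EXEC 0x008
    #define HW_NO_EXEC 0x004    /* hypothetical value */

    /* Model of the rflags computation: PP bits 0x2, plus read-only when
     * the pte is not writable; _PAGE_EXEC is inverted into no-exec. */
    static unsigned long mk_rflags(unsigned long pte)
    {
        unsigned long rflags = 0x2 | (!(pte & _PAGE_RW));

        rflags |= (pte & _PAGE_EXEC) ? 0 : HW_NO_EXEC;
        return rflags;
    }

    int main(void)
    {
        printf("%#lx\n", mk_rflags(_PAGE_RW | _PAGE_EXEC)); /* 0x2 */
        printf("%#lx\n", mk_rflags(0));                     /* 0x7 */
        return 0;
    }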