]>
git.proxmox.com Git - mirror_ubuntu-kernels.git/blob - arch/ppc64/mm/hugetlbpage.c
2 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
4 * Copyright (C) 2003 David Gibson, IBM Corporation.
6 * Based on the IA-32 version:
7 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
10 #include <linux/init.h>
13 #include <linux/hugetlb.h>
14 #include <linux/pagemap.h>
15 #include <linux/smp_lock.h>
16 #include <linux/slab.h>
17 #include <linux/err.h>
18 #include <linux/sysctl.h>
20 #include <asm/pgalloc.h>
22 #include <asm/tlbflush.h>
23 #include <asm/mmu_context.h>
24 #include <asm/machdep.h>
25 #include <asm/cputable.h>
28 #include <linux/sysctl.h>
/*
 * Geometry of the huge page directory.
 *
 * HUGEPGDIR_SHIFT is the number of address bits below the directory
 * index (hugepgd_index() shifts by it).  The "- 3" is presumably log2 of
 * an 8-byte table entry - TODO confirm.
 */
30 #define HUGEPGDIR_SHIFT (HPAGE_SHIFT + PAGE_SHIFT - 3)
/* Bytes spanned by one directory entry, and the mask rounding down to it. */
31 #define HUGEPGDIR_SIZE (1UL << HUGEPGDIR_SHIFT)
32 #define HUGEPGDIR_MASK (~(HUGEPGDIR_SIZE-1))
/* Index widths in bits for the huge PTE table and the huge page directory. */
34 #define HUGEPTE_INDEX_SIZE 9
35 #define HUGEPGD_INDEX_SIZE 10
/* Entry counts derived from the widths above: 512 and 1024. */
37 #define PTRS_PER_HUGEPTE (1 << HUGEPTE_INDEX_SIZE)
38 #define PTRS_PER_HUGEPGD (1 << HUGEPGD_INDEX_SIZE)
/*
 * Compute the huge page directory index for addr: clear the bits set in
 * REGION_MASK, then shift right by HUGEPGDIR_SHIFT.
 */
40 static inline int hugepgd_index(unsigned long addr
)
42 return (addr
& ~REGION_MASK
) >> HUGEPGDIR_SHIFT
;
/*
 * Return a pointer to the huge page directory entry covering addr in mm.
 *
 * The index comes from hugepgd_index() and is asserted to be below
 * PTRS_PER_HUGEPGD before it is added to mm->context.huge_pgdir.
 *
 * NOTE(review): the statement executed when mm->context.huge_pgdir is
 * unset is missing from this listing; presumably it returns NULL -
 * confirm against the upstream file.
 */
45 static pud_t
*hugepgd_offset(struct mm_struct
*mm
, unsigned long addr
)
49 if (! mm
->context
.huge_pgdir
)
53 index
= hugepgd_index(addr
);
54 BUG_ON(index
>= PTRS_PER_HUGEPGD
);
55 return (pud_t
*)(mm
->context
.huge_pgdir
+ index
);
/*
 * Return the huge PTE slot for addr inside the table that *dir points to.
 *
 * The slot index is (addr >> HPAGE_SHIFT) modulo PTRS_PER_HUGEPTE and is
 * added to pud_page(*dir) viewed as a pte_t array.
 *
 * NOTE(review): several upstream lines between the signature and the
 * index computation are missing from this listing, so any early-exit
 * check on *dir cannot be confirmed from here.
 */
58 static inline pte_t
*hugepte_offset(pud_t
*dir
, unsigned long addr
)
65 index
= (addr
>> HPAGE_SHIFT
) % PTRS_PER_HUGEPTE
;
66 return (pte_t
*)pud_page(*dir
) + index
;
/*
 * Return the huge page directory entry for addr, first creating the
 * directory for mm when mm->context.huge_pgdir is not yet set.
 *
 * addr must lie in a hugepage area of mm (asserted).  The function
 * releases mm->page_table_lock around the allocation and retakes it
 * afterwards, so the caller is expected to enter with that lock held.
 * After retaking the lock huge_pgdir is checked again, because somebody
 * else may have installed a directory in the meantime.
 *
 * NOTE(review): the pointer returned by kmem_cache_alloc() is passed to
 * memcmp() on the very next line with no NULL check in between.
 * NOTE(review): the statement executed when the re-check finds an
 * existing huge_pgdir is missing from this listing; presumably it frees
 * the freshly allocated page - confirm against the upstream file.
 */
69 static pud_t
*hugepgd_alloc(struct mm_struct
*mm
, unsigned long addr
)
71 BUG_ON(! in_hugepage_area(mm
->context
, addr
));
73 if (! mm
->context
.huge_pgdir
) {
75 spin_unlock(&mm
->page_table_lock
);
76 /* Don't use pgd_alloc(), because we want __GFP_REPEAT */
77 new = kmem_cache_alloc(zero_cache
, GFP_KERNEL
| __GFP_REPEAT
);
/* The new page must compare equal to empty_zero_page over PAGE_SIZE bytes. */
78 BUG_ON(memcmp(new, empty_zero_page
, PAGE_SIZE
));
79 spin_lock(&mm
->page_table_lock
);
82 * Because we dropped the lock, we should re-check the
83 * entry, as somebody else could have populated it..
85 if (mm
->context
.huge_pgdir
)
88 mm
->context
.huge_pgdir
= new;
90 return hugepgd_offset(mm
, addr
);
/*
 * Return the huge PTE slot for addr under directory entry dir, first
 * allocating and installing a PTE page when *dir is not present.
 *
 * The function releases mm->page_table_lock around the allocation and
 * retakes it afterwards, so the caller is expected to enter with that
 * lock held.  After retaking the lock *dir is checked again: if it has
 * become present the new page is returned to zero_cache, otherwise the
 * page is tagged (mapping = mm, index = addr rounded down with
 * HUGEPGDIR_MASK) and installed with pud_populate().
 *
 * NOTE(review): the pointer returned by kmem_cache_alloc() is passed to
 * memcmp() on the very next line with no NULL check in between.
 */
93 static pte_t
*hugepte_alloc(struct mm_struct
*mm
, pud_t
*dir
, unsigned long addr
)
95 if (! pud_present(*dir
)) {
98 spin_unlock(&mm
->page_table_lock
);
99 new = kmem_cache_alloc(zero_cache
, GFP_KERNEL
| __GFP_REPEAT
);
/* The new page must compare equal to empty_zero_page over PAGE_SIZE bytes. */
100 BUG_ON(memcmp(new, empty_zero_page
, PAGE_SIZE
));
101 spin_lock(&mm
->page_table_lock
);
103 * Because we dropped the lock, we should re-check the
104 * entry, as somebody else could have populated it..
106 if (pud_present(*dir
)) {
/* Lost the race: give the unused page back to the cache. */
108 kmem_cache_free(zero_cache
, new);
110 struct page
*ptepage
;
/* Record the owning mm and the base address on the struct page. */
114 ptepage
= virt_to_page(new);
115 ptepage
->mapping
= (void *) mm
;
116 ptepage
->index
= addr
& HUGEPGDIR_MASK
;
117 pud_populate(mm
, dir
, new);
121 return hugepte_offset(dir
, addr
);
/*
 * Look up the huge PTE slot for addr in mm without allocating anything.
 *
 * addr must lie in a hugepage area of mm (asserted).  The directory
 * entry is found with hugepgd_offset() and then handed to
 * hugepte_offset().
 *
 * NOTE(review): upstream lines between the directory lookup and the
 * final return are missing from this listing, so any check of pud
 * before it is used cannot be confirmed from here.
 */
124 pte_t
*huge_pte_offset(struct mm_struct
*mm
, unsigned long addr
)
128 BUG_ON(! in_hugepage_area(mm
->context
, addr
));
130 pud
= hugepgd_offset(mm
, addr
);
134 return hugepte_offset(pud
, addr
);
/*
 * Return the huge PTE slot for addr in mm, allocating the directory and
 * PTE page on the way via hugepgd_alloc() and hugepte_alloc().
 *
 * addr must lie in a hugepage area of mm (asserted).
 *
 * NOTE(review): upstream lines between the hugepgd_alloc() call and the
 * final return are missing from this listing, so any failure check on
 * pud cannot be confirmed from here.
 */
137 pte_t
*huge_pte_alloc(struct mm_struct
*mm
, unsigned long addr
)
141 BUG_ON(! in_hugepage_area(mm
->context
, addr
));
143 pud
= hugepgd_alloc(mm
, addr
);
147 return hugepte_alloc(mm
, pud
, addr
);
/*
 * Three tests are visible, in this order: len has bits outside
 * HPAGE_MASK, addr has bits outside HPAGE_MASK, and the range lies in
 * neither the low nor the high hugepage range.
 *
 * NOTE(review): the return statements attached to these tests are
 * missing from this listing, so the exact return values cannot be
 * confirmed from here.
 */
151 * This function checks for proper alignment of input addr and len parameters.
153 int is_aligned_hugepage_range(unsigned long addr
, unsigned long len
)
155 if (len
& ~HPAGE_MASK
)
157 if (addr
& ~HPAGE_MASK
)
159 if (! (within_hugepage_low_range(addr
, len
)
160 || within_hugepage_high_range(addr
, len
)) )
/*
 * Issue slbie for low segments selected by a 16-bit mask.
 *
 * parm carries the mask packed into a pointer value; it is cast back
 * through unsigned long into a u16.  Bit i of the mask corresponds to
 * the address i << SID_SHIFT.  The loop is bracketed by isync
 * instructions with a memory clobber.  open_low_hpage_segs() passes this
 * function to on_each_cpu().
 *
 * NOTE(review): the statement taken when bit i is clear is missing from
 * this listing; presumably it skips to the next segment - confirm.
 */
165 static void flush_segments(void *parm
)
167 u16 segs
= (unsigned long) parm
;
170 asm volatile("isync" : : : "memory");
/* Walk all 16 bit positions of the mask. */
172 for (i
= 0; i
< 16; i
++) {
173 if (! (segs
& (1U << i
)))
175 asm volatile("slbie %0" : : "r" (i
<< SID_SHIFT
));
178 asm volatile("isync" : : : "memory");
/*
 * Check whether low segment seg of mm can be used for huge pages.
 *
 * The segment covers [seg << SID_SHIFT, (seg + 1) << SID_SHIFT).  The
 * first VMA at or after the start is fetched with find_vma() and tested
 * for beginning before the end of the segment.
 *
 * NOTE(review): the return statements are missing from this listing.
 * The caller treats a non-zero result as failure; the exact error value
 * cannot be confirmed from here.
 */
181 static int prepare_low_seg_for_htlb(struct mm_struct
*mm
, unsigned long seg
)
183 unsigned long start
= seg
<< SID_SHIFT
;
184 unsigned long end
= (seg
+1) << SID_SHIFT
;
185 struct vm_area_struct
*vma
;
189 /* Check no VMAs are in the region */
190 vma
= find_vma(mm
, start
);
191 if (vma
&& (vma
->vm_start
< end
))
/*
 * Mark the low segments in newsegs as hugepage segments for mm.
 *
 * Segments already present in mm->context.htlb_segs are removed from
 * the request first.  Each remaining segment is vetted with
 * prepare_low_seg_for_htlb(); the mask is then OR-ed into
 * mm->context.htlb_segs, the context is copied into the paca, and
 * flush_segments() is run on every CPU with the new mask.
 *
 * NOTE(review): the condition guarding the early "return 0", the
 * statement taken when a segment fails vetting, and the final return
 * are missing from this listing.
 */
197 static int open_low_hpage_segs(struct mm_struct
*mm
, u16 newsegs
)
/* Drop the bits for segments that are already open. */
201 newsegs
&= ~(mm
->context
.htlb_segs
);
203 return 0; /* The segments we want are already open */
205 for (i
= 0; i
< 16; i
++)
206 if ((1 << i
) & newsegs
)
207 if (prepare_low_seg_for_htlb(mm
, i
) != 0)
210 mm
->context
.htlb_segs
|= newsegs
;
212 /* update the paca copy of the context struct */
213 get_paca()->context
= mm
->context
;
215 /* the context change must make it to memory before the flush,
216 * so that further SLB misses do the right thing. */
218 on_each_cpu(flush_segments
, (void *)(unsigned long)newsegs
, 0, 1);
/*
 * Prepare [addr, addr + len) for a huge page mapping.
 *
 * Two cases are visible: the range lies in the high hugepage range, or
 * both addr and addr + len are below 0x100000000UL.  In the second case
 * the low segments covering the range (LOW_ESID_MASK) are opened for
 * current->mm with open_low_hpage_segs().  A KERN_DEBUG message with the
 * address, length and segment mask is printed on a path whose guarding
 * condition is not visible here.
 *
 * NOTE(review): the statement for the high-range case, the test around
 * the printk and all return statements are missing from this listing.
 */
223 int prepare_hugepage_range(unsigned long addr
, unsigned long len
)
225 if (within_hugepage_high_range(addr
, len
))
227 else if ((addr
< 0x100000000UL
) && ((addr
+len
) < 0x100000000UL
)) {
229 /* Yes, we need both tests, in case addr+len overflows
230 * 64-bit arithmetic */
231 err
= open_low_hpage_segs(current
->mm
,
232 LOW_ESID_MASK(addr
, len
));
234 printk(KERN_DEBUG
"prepare_hugepage_range(%lx, %lx)"
235 " failed (segs: 0x%04hx)\n", addr
, len
,
236 LOW_ESID_MASK(addr
, len
));
/*
 * Find the page backing address when it lies in a hugepage area of mm.
 *
 * Returns ERR_PTR(-EINVAL) when address is not in a hugepage area.
 * Otherwise the huge PTE is looked up with huge_pte_offset(), converted
 * with pte_page(), and advanced by the offset of address inside the huge
 * page, counted in PAGE_SIZE units.
 *
 * NOTE(review): *ptep is read on the line immediately after
 * huge_pte_offset() returns; if that lookup can yield NULL this
 * dereference is unsafe - confirm against the upstream file.
 * NOTE(review): the return type line, the local declarations, any guard
 * before the page adjustment and the final return are missing from this
 * listing.
 */
244 follow_huge_addr(struct mm_struct
*mm
, unsigned long address
, int write
)
249 if (! in_hugepage_area(mm
->context
, address
))
250 return ERR_PTR(-EINVAL
);
252 ptep
= huge_pte_offset(mm
, address
);
253 page
= pte_page(*ptep
);
255 page
+= (address
% HPAGE_SIZE
) / PAGE_SIZE
;
/*
 * NOTE(review): only the signature of pmd_huge() survives in this
 * listing; the body, and therefore the return value, cannot be confirmed
 * from here.
 */
260 int pmd_huge(pmd_t pmd
)
/*
 * NOTE(review): only the parameter list of follow_huge_pmd() survives in
 * this listing (mm, address, pmd, write); the return type line and the
 * body are missing, so its behaviour cannot be confirmed from here.
 */
266 follow_huge_pmd(struct mm_struct
*mm
, unsigned long address
,
267 pmd_t
*pmd
, int write
)
/*
 * Bottom-up search for a free range of len bytes in current->mm that
 * avoids the hugepage regions.
 *
 * Visible steps:
 *  - A page-aligned addr is tested first: it must fit below TASK_SIZE,
 *    must not run into the next VMA, and must not be a hugepage-only
 *    range.
 *  - Otherwise the scan starts at mm->free_area_cache.  An address that
 *    touches a low hugepage range is moved up to the next
 *    (1 << SID_SHIFT) boundary; one that touches the high hugepage range
 *    is moved to TASK_HPAGE_END.  find_vma() is repeated after each move.
 *  - When a hole is found, free_area_cache is set to addr + len.
 *  - If the scan did not start at TASK_UNMAPPED_BASE it is restarted
 *    from there.
 *
 * NOTE(review): the tail of the parameter list, the guard around the
 * requested-address test, every return/continue/goto and the step that
 * advances addr past a VMA are missing from this listing.
 */
273 /* Because we have an exclusive hugepage region which lies within the
274 * normal user address space, we have to take special measures to make
275 * non-huge mmap()s evade the hugepage reserved regions. */
276 unsigned long arch_get_unmapped_area(struct file
*filp
, unsigned long addr
,
277 unsigned long len
, unsigned long pgoff
,
280 struct mm_struct
*mm
= current
->mm
;
281 struct vm_area_struct
*vma
;
282 unsigned long start_addr
;
288 addr
= PAGE_ALIGN(addr
);
289 vma
= find_vma(mm
, addr
);
290 if (((TASK_SIZE
- len
) >= addr
)
291 && (!vma
|| (addr
+len
) <= vma
->vm_start
)
292 && !is_hugepage_only_range(mm
, addr
,len
))
295 start_addr
= addr
= mm
->free_area_cache
;
298 vma
= find_vma(mm
, addr
);
299 while (TASK_SIZE
- len
>= addr
) {
300 BUG_ON(vma
&& (addr
>= vma
->vm_end
));
302 if (touches_hugepage_low_range(mm
, addr
, len
)) {
303 addr
= ALIGN(addr
+1, 1<<SID_SHIFT
);
304 vma
= find_vma(mm
, addr
);
307 if (touches_hugepage_high_range(addr
, len
)) {
308 addr
= TASK_HPAGE_END
;
309 vma
= find_vma(mm
, addr
);
312 if (!vma
|| addr
+ len
<= vma
->vm_start
) {
314 * Remember the place where we stopped the search:
316 mm
->free_area_cache
= addr
+ len
;
323 /* Make sure we didn't miss any holes */
324 if (start_addr
!= TASK_UNMAPPED_BASE
) {
325 start_addr
= addr
= TASK_UNMAPPED_BASE
;
/*
 * Top-down search for a free range of len bytes in current->mm, starting
 * below mm->mmap_base and avoiding the hugepage regions.
 *
 * Visible steps:
 *  - free_area_cache is clamped so that it never exceeds mmap_base.
 *  - A page-aligned requested address is tested first: it must fit below
 *    TASK_SIZE, must not run into the next VMA, and must not be a
 *    hugepage-only range.
 *  - The scan starts at (free_area_cache - len) rounded with PAGE_MASK.
 *    A candidate touching a low hugepage range is moved to len bytes
 *    below the start of its segment and rechecked; one touching the high
 *    hugepage range is moved to TASK_HPAGE_BASE - len.
 *  - find_vma_prev() then decides whether the candidate fits between
 *    prev_vma->vm_end and vma->vm_start; on success free_area_cache is
 *    set to the chosen address, which is also returned.
 *  - Otherwise the candidate drops to vma->vm_start - len while len is
 *    not larger than vma->vm_start.
 *  - As a last resort free_area_cache is set to TASK_UNMAPPED_BASE,
 *    arch_get_unmapped_area() is called with the original arguments, and
 *    free_area_cache is then restored to mmap_base.
 *
 * NOTE(review): the return type line, several guards, the labels used by
 * the goto, the opening of the do/while loop and most return statements
 * are missing from this listing.
 */
332 * This mmap-allocator allocates new areas top-down from below the
333 * stack's low limit (the base):
335 * Because we have an exclusive hugepage region which lies within the
336 * normal user address space, we have to take special measures to make
337 * non-huge mmap()s evade the hugepage reserved regions.
340 arch_get_unmapped_area_topdown(struct file
*filp
, const unsigned long addr0
,
341 const unsigned long len
, const unsigned long pgoff
,
342 const unsigned long flags
)
344 struct vm_area_struct
*vma
, *prev_vma
;
345 struct mm_struct
*mm
= current
->mm
;
346 unsigned long base
= mm
->mmap_base
, addr
= addr0
;
349 /* requested length too big for entire address space */
353 /* dont allow allocations above current base */
354 if (mm
->free_area_cache
> base
)
355 mm
->free_area_cache
= base
;
357 /* requesting a specific address */
359 addr
= PAGE_ALIGN(addr
);
360 vma
= find_vma(mm
, addr
);
361 if (TASK_SIZE
- len
>= addr
&&
362 (!vma
|| addr
+ len
<= vma
->vm_start
)
363 && !is_hugepage_only_range(mm
, addr
,len
))
368 /* make sure it can fit in the remaining address space */
369 if (mm
->free_area_cache
< len
)
372 /* either no address requested or cant fit in requested address hole */
373 addr
= (mm
->free_area_cache
- len
) & PAGE_MASK
;
376 if (touches_hugepage_low_range(mm
, addr
, len
)) {
377 addr
= (addr
& ((~0) << SID_SHIFT
)) - len
;
378 goto hugepage_recheck
;
379 } else if (touches_hugepage_high_range(addr
, len
)) {
380 addr
= TASK_HPAGE_BASE
- len
;
384 * Lookup failure means no vma is above this address,
385 * i.e. return with success:
387 if (!(vma
= find_vma_prev(mm
, addr
, &prev_vma
)))
391 * new region fits between prev_vma->vm_end and
392 * vma->vm_start, use it:
394 if (addr
+len
<= vma
->vm_start
&&
395 (!prev_vma
|| (addr
>= prev_vma
->vm_end
)))
396 /* remember the address as a hint for next time */
397 return (mm
->free_area_cache
= addr
);
399 /* pull free_area_cache down to the first hole */
400 if (mm
->free_area_cache
== vma
->vm_end
)
401 mm
->free_area_cache
= vma
->vm_start
;
403 /* try just below the current vma->vm_start */
404 addr
= vma
->vm_start
-len
;
405 } while (len
<= vma
->vm_start
);
409 * if hint left us with no space for the requested
410 * mapping then try again:
413 mm
->free_area_cache
= base
;
418 * A failed mmap() very likely causes application failure,
419 * so fall back to the bottom-up function here. This scenario
420 * can happen with large stack limits and large mmap()
423 mm
->free_area_cache
= TASK_UNMAPPED_BASE
;
424 addr
= arch_get_unmapped_area(filp
, addr0
, len
, pgoff
, flags
);
426 * Restore the topdown base:
428 mm
->free_area_cache
= base
;
/*
 * Search the address range below 0x100000000UL, starting at 0, for a
 * hole of len bytes in current->mm that lies inside the low hugepage
 * segments selected by segmask.
 *
 * Visible steps: a candidate that is not within the low hugepage range
 * for segmask is moved up to the next (1 << SID_SHIFT) boundary;
 * otherwise, unless the candidate fits before the next VMA, it is moved
 * to that VMA end rounded up to HPAGE_SIZE.  find_vma() is repeated
 * after every move.
 *
 * NOTE(review): the continue/return statements are missing from this
 * listing.  A caller compares the result against -ENOMEM, so that is
 * presumably the failure value - confirm against the upstream file.
 */
433 static unsigned long htlb_get_low_area(unsigned long len
, u16 segmask
)
435 unsigned long addr
= 0;
436 struct vm_area_struct
*vma
;
438 vma
= find_vma(current
->mm
, addr
);
439 while (addr
+ len
<= 0x100000000UL
) {
440 BUG_ON(vma
&& (addr
>= vma
->vm_end
)); /* invariant */
442 if (! __within_hugepage_low_range(addr
, len
, segmask
)) {
443 addr
= ALIGN(addr
+1, 1<<SID_SHIFT
);
444 vma
= find_vma(current
->mm
, addr
);
448 if (!vma
|| (addr
+ len
) <= vma
->vm_start
)
450 addr
= ALIGN(vma
->vm_end
, HPAGE_SIZE
);
451 /* Depending on segmask this might not be a confirmed
452 * hugepage region, so the ALIGN could have skipped
454 vma
= find_vma(current
->mm
, addr
);
/*
 * Search [TASK_HPAGE_BASE, TASK_HPAGE_END) for a hole of len bytes in
 * current->mm.
 *
 * The loop walks the VMA list through vm_next.  Each candidate is
 * asserted to lie within the high hugepage range; unless it fits before
 * the current VMA it is moved to that VMA end rounded up to HPAGE_SIZE.
 *
 * NOTE(review): vma is assigned by find_vma() immediately before the
 * for statement and again by its init clause with the same arguments;
 * the first lookup looks redundant.
 * NOTE(review): the return statements are missing from this listing.
 */
460 static unsigned long htlb_get_high_area(unsigned long len
)
462 unsigned long addr
= TASK_HPAGE_BASE
;
463 struct vm_area_struct
*vma
;
465 vma
= find_vma(current
->mm
, addr
);
466 for (vma
= find_vma(current
->mm
, addr
);
467 addr
+ len
<= TASK_HPAGE_END
;
468 vma
= vma
->vm_next
) {
469 BUG_ON(vma
&& (addr
>= vma
->vm_end
)); /* invariant */
470 BUG_ON(! within_hugepage_high_range(addr
, len
));
472 if (!vma
|| (addr
+ len
) <= vma
->vm_start
)
474 addr
= ALIGN(vma
->vm_end
, HPAGE_SIZE
);
475 /* Because we're in a hugepage region, this alignment
476 * should not skip us over any VMAs */
/*
 * Choose an address for a new huge page mapping of len bytes.
 *
 * Visible steps:
 *  - len is tested for bits outside HPAGE_MASK, and the CPU is tested
 *    for CPU_FTR_16M_PAGE.
 *  - For a TIF_32BIT task the already open low segments
 *    (current->mm->context.htlb_segs) are tried first with
 *    htlb_get_low_area().  Then candidate segment masks are tried,
 *    starting from LOW_ESID_MASK(0x100000000UL - len, len) and shifting
 *    right by one each round; a mask is accepted when an area is found
 *    with it and open_low_hpage_segs() succeeds.  A KERN_DEBUG message
 *    is printed after the loop.
 *  - Otherwise the result of htlb_get_high_area(len) is returned.
 *
 * NOTE(review): the tail of the parameter list, the handling of
 * lastshift, and the return statements of the early checks and of the
 * 32-bit path are missing from this listing.
 */
482 unsigned long hugetlb_get_unmapped_area(struct file
*file
, unsigned long addr
,
483 unsigned long len
, unsigned long pgoff
,
486 if (len
& ~HPAGE_MASK
)
489 if (!cpu_has_feature(CPU_FTR_16M_PAGE
))
492 if (test_thread_flag(TIF_32BIT
)) {
494 u16 segmask
, cursegs
= current
->mm
->context
.htlb_segs
;
496 /* First see if we can do the mapping in the existing
497 * low hpage segments */
498 addr
= htlb_get_low_area(len
, cursegs
);
502 for (segmask
= LOW_ESID_MASK(0x100000000UL
-len
, len
);
503 ! lastshift
; segmask
>>=1) {
507 addr
= htlb_get_low_area(len
, cursegs
| segmask
);
508 if ((addr
!= -ENOMEM
)
509 && open_low_hpage_segs(current
->mm
, segmask
) == 0)
512 printk(KERN_DEBUG
"hugetlb_get_unmapped_area() unable to open"
513 " enough segments\n");
516 return htlb_get_high_area(len
);
/*
 * Tear down the huge page directory of mm.
 *
 * Runs under mm->page_table_lock.  The directory pointer is read and
 * mm->context.huge_pgdir is cleared.  For every one of the
 * PTRS_PER_HUGEPGD entries that is not pud_none, the PTE page has its
 * page->mapping cleared, is asserted to compare equal to
 * empty_zero_page, and is returned to zero_cache.  The directory page
 * gets the same assertion and is freed last.
 *
 * NOTE(review): no statement clearing a directory entry is visible
 * before the final memcmp() assertion on pgdir, and any early exit for
 * a missing directory is also absent; lines are missing from this
 * listing, so confirm against the upstream file.
 */
520 void hugetlb_mm_free_pgd(struct mm_struct
*mm
)
525 spin_lock(&mm
->page_table_lock
);
527 pgdir
= mm
->context
.huge_pgdir
;
/* Detach the directory from the mm before freeing its contents. */
531 mm
->context
.huge_pgdir
= NULL
;
533 /* cleanup any hugepte pages leftover */
534 for (i
= 0; i
< PTRS_PER_HUGEPGD
; i
++) {
535 pud_t
*pud
= (pud_t
*)(pgdir
+ i
);
537 if (! pud_none(*pud
)) {
538 pte_t
*pte
= (pte_t
*)pud_page(*pud
);
539 struct page
*ptepage
= virt_to_page(pte
);
541 ptepage
->mapping
= NULL
;
/* Pages go back to zero_cache only if they match empty_zero_page. */
543 BUG_ON(memcmp(pte
, empty_zero_page
, PAGE_SIZE
));
544 kmem_cache_free(zero_cache
, pte
);
549 BUG_ON(memcmp(pgdir
, empty_zero_page
, PAGE_SIZE
));
550 kmem_cache_free(zero_cache
, pgdir
);
553 spin_unlock(&mm
->page_table_lock
);
/*
 * Build or update the hardware hash table entry (HPTE) for the huge
 * page at effective address ea in mm.
 *
 * Visible steps, all under mm->page_table_lock:
 *  - The Linux huge PTE is looked up with huge_pte_offset().
 *  - va is formed as (vsid << 28) | (ea & 0x0fffffff) and vpn as
 *    va >> HPAGE_SHIFT.
 *  - A missing or empty PTE, or an access mask with bits the PTE does
 *    not have, is tested for; the in-code comments say such faults are
 *    passed up to do_page_fault.
 *  - hpteflags starts as 0x2, gains bit 0 when _PAGE_RW is clear, and
 *    gains HW_NO_EXEC when _PAGE_EXEC is clear.
 *  - If the old PTE has _PAGE_HASHPTE, the slot is recomputed from the
 *    hash and the _PAGE_GROUP_IX bits and ppc_md.hpte_updatepp() is
 *    called; a result of -1 clears _PAGE_HPTEFLAGS in old_pte.
 *  - If the old PTE (now) lacks _PAGE_HASHPTE, a new HPTE is inserted
 *    with ppc_md.hpte_insert(), first in the primary group and, when
 *    that returns -1, in the secondary group with _PAGE_SECONDARY set.
 *    ppc_md.hpte_remove() is called on the primary group on a path
 *    whose condition is not visible.  A slot of -2 panics.  The slot is
 *    recorded in the _PAGE_GROUP_IX bits of new_pte.
 *
 * NOTE(review): the declarations of ptep, err and slot, the
 * initialisation of old_pte and new_pte, the hash inversion for the
 * secondary group, the trailing arguments of hpte_insert(), the store
 * of new_pte back to *ptep, the error exits and the final return are
 * all missing from this listing.
 */
556 int hash_huge_page(struct mm_struct
*mm
, unsigned long access
,
557 unsigned long ea
, unsigned long vsid
, int local
)
560 unsigned long va
, vpn
;
561 pte_t old_pte
, new_pte
;
562 unsigned long hpteflags
, prpn
;
566 spin_lock(&mm
->page_table_lock
);
568 ptep
= huge_pte_offset(mm
, ea
);
570 /* Search the Linux page table for a match with va */
571 va
= (vsid
<< 28) | (ea
& 0x0fffffff);
572 vpn
= va
>> HPAGE_SHIFT
;
575 * If no pte found or not present, send the problem up to
578 if (unlikely(!ptep
|| pte_none(*ptep
)))
581 /* BUG_ON(pte_bad(*ptep)); */
584 * Check the user's access rights to the page. If access should be
585 * prevented then send the problem up to do_page_fault.
587 if (unlikely(access
& ~pte_val(*ptep
)))
590 * At this point, we have a pte (old_pte) which can be used to build
591 * or update an HPTE. There are 2 cases:
593 * 1. There is a valid (present) pte with no associated HPTE (this is
594 * the most common case)
595 * 2. There is a valid (present) pte with an associated HPTE. The
596 * current values of the pp bits in the HPTE prevent access
597 * because we are doing software DIRTY bit management and the
598 * page is currently not DIRTY.
605 hpteflags
= 0x2 | (! (pte_val(new_pte
) & _PAGE_RW
));
606 /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
607 hpteflags
|= ((pte_val(new_pte
) & _PAGE_EXEC
) ? 0 : HW_NO_EXEC
);
609 /* Check if pte already has an hpte (case 2) */
610 if (unlikely(pte_val(old_pte
) & _PAGE_HASHPTE
)) {
611 /* There MIGHT be an HPTE for this pte */
612 unsigned long hash
, slot
;
614 hash
= hpt_hash(vpn
, 1);
615 if (pte_val(old_pte
) & _PAGE_SECONDARY
)
617 slot
= (hash
& htab_hash_mask
) * HPTES_PER_GROUP
;
618 slot
+= (pte_val(old_pte
) & _PAGE_GROUP_IX
) >> 12;
620 if (ppc_md
.hpte_updatepp(slot
, hpteflags
, va
, 1, local
) == -1)
621 pte_val(old_pte
) &= ~_PAGE_HPTEFLAGS
;
624 if (likely(!(pte_val(old_pte
) & _PAGE_HASHPTE
))) {
625 unsigned long hash
= hpt_hash(vpn
, 1);
626 unsigned long hpte_group
;
628 prpn
= pte_pfn(old_pte
);
631 hpte_group
= ((hash
& htab_hash_mask
) *
632 HPTES_PER_GROUP
) & ~0x7UL
;
634 /* Update the linux pte with the HPTE slot */
635 pte_val(new_pte
) &= ~_PAGE_HPTEFLAGS
;
636 pte_val(new_pte
) |= _PAGE_HASHPTE
;
638 /* Add in WIMG bits */
639 /* XXX We should store these in the pte */
640 hpteflags
|= _PAGE_COHERENT
;
642 slot
= ppc_md
.hpte_insert(hpte_group
, va
, prpn
, 0,
645 /* Primary is full, try the secondary */
646 if (unlikely(slot
== -1)) {
647 pte_val(new_pte
) |= _PAGE_SECONDARY
;
648 hpte_group
= ((~hash
& htab_hash_mask
) *
649 HPTES_PER_GROUP
) & ~0x7UL
;
650 slot
= ppc_md
.hpte_insert(hpte_group
, va
, prpn
,
654 hpte_group
= ((hash
& htab_hash_mask
) * HPTES_PER_GROUP
) & ~0x7UL
;
656 ppc_md
.hpte_remove(hpte_group
);
661 if (unlikely(slot
== -2))
662 panic("hash_huge_page: pte_insert failed\n");
664 pte_val(new_pte
) |= (slot
<<12) & _PAGE_GROUP_IX
;
667 * No need to use ldarx/stdcx here because all who
668 * might be updating the pte will hold the
677 spin_unlock(&mm
->page_table_lock
);