/*
 * This file contains ioremap and related functions for 64-bit machines.
 *
 * Derived from arch/ppc64/mm/init.c
 *   Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *   Modifications by Paul Mackerras (PowerMac) (paulus@samba.org)
 *   and Cort Dougan (PReP) (cort@cs.nmt.edu)
 *   Copyright (C) 1996 Paul Mackerras
 *
 * Derived from "arch/i386/mm/init.c"
 *   Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 *
 * Dave Engebretsen <engebret@us.ibm.com>
 *   Rework for PPC64 port.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/stddef.h>
#include <linux/vmalloc.h>
#include <linux/memblock.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>

#include <asm/pgalloc.h>
#include <asm/page.h>
#include <asm/prom.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/smp.h>
#include <asm/machdep.h>
#include <asm/tlb.h>
#include <asm/processor.h>
#include <asm/cputable.h>
#include <asm/sections.h>
#include <asm/firmware.h>
#include <asm/dma.h>

#include "mmu_decl.h"

#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

#ifdef CONFIG_PPC_STD_MMU_64
#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
#error TASK_SIZE_USER64 exceeds user VSID range
#endif
#endif

#ifdef CONFIG_PPC_BOOK3S_64
/*
 * Partition table and process table for ISA 3.0.
 */
struct prtb_entry *process_tb;
struct patb_entry *partition_tb;
/*
 * Page table geometry. These are variables rather than constants so
 * that a single kernel image can be set up at boot for either the
 * hash or the radix MMU page table layout.
 */
unsigned long __pte_index_size;
EXPORT_SYMBOL(__pte_index_size);
unsigned long __pmd_index_size;
EXPORT_SYMBOL(__pmd_index_size);
unsigned long __pud_index_size;
EXPORT_SYMBOL(__pud_index_size);
unsigned long __pgd_index_size;
EXPORT_SYMBOL(__pgd_index_size);
unsigned long __pmd_cache_index;
EXPORT_SYMBOL(__pmd_cache_index);
unsigned long __pte_table_size;
EXPORT_SYMBOL(__pte_table_size);
unsigned long __pmd_table_size;
EXPORT_SYMBOL(__pmd_table_size);
unsigned long __pud_table_size;
EXPORT_SYMBOL(__pud_table_size);
unsigned long __pgd_table_size;
EXPORT_SYMBOL(__pgd_table_size);
unsigned long __pmd_val_bits;
EXPORT_SYMBOL(__pmd_val_bits);
unsigned long __pud_val_bits;
EXPORT_SYMBOL(__pud_val_bits);
unsigned long __pgd_val_bits;
EXPORT_SYMBOL(__pgd_val_bits);
unsigned long __kernel_virt_start;
EXPORT_SYMBOL(__kernel_virt_start);
unsigned long __kernel_virt_size;
EXPORT_SYMBOL(__kernel_virt_size);
unsigned long __vmalloc_start;
EXPORT_SYMBOL(__vmalloc_start);
unsigned long __vmalloc_end;
EXPORT_SYMBOL(__vmalloc_end);
struct page *vmemmap;
EXPORT_SYMBOL(vmemmap);
unsigned long ioremap_bot;
#else /* !CONFIG_PPC_BOOK3S_64 */
unsigned long ioremap_bot = IOREMAP_BASE;
#endif

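/*
 * ioremap_bot tracks the bottom of the ioremap region. Before the
 * vmalloc allocator is up, early mappings are bolted at increasing
 * addresses starting here; afterwards, virtual space between
 * ioremap_bot and IOREMAP_END is handed out by __get_vm_area_caller().
 */
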
/**
 * __ioremap_at - Low level function to establish the page tables
 *                for an IO mapping
 */
void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
                            unsigned long flags)
{
        unsigned long i;

        /* Make sure we have the base flags */
        if ((flags & _PAGE_PRESENT) == 0)
                flags |= pgprot_val(PAGE_KERNEL);

        /* We don't support the 4K PFN hack with ioremap */
        if (flags & H_PAGE_4K_PFN)
                return NULL;

        WARN_ON(pa & ~PAGE_MASK);
        WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
        WARN_ON(size & ~PAGE_MASK);

        for (i = 0; i < size; i += PAGE_SIZE)
                if (map_kernel_page((unsigned long)ea + i, pa + i, flags))
                        return NULL;

        return (void __iomem *)ea;
}

/**
 * __iounmap_at - Low level function to tear down the page tables
 *                for an IO mapping. This is used for mappings that
 *                are manipulated manually, like partial unmapping of
 *                PCI IOs or ISA space.
 */
void __iounmap_at(void *ea, unsigned long size)
{
        WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
        WARN_ON(size & ~PAGE_MASK);

        unmap_kernel_range((unsigned long)ea, size);
}

void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
                                unsigned long flags, void *caller)
{
        phys_addr_t paligned;
        void __iomem *ret;

        /*
         * Choose an address to map it to. Once the vmalloc system is
         * running, we use it. Before that, we map using addresses
         * going up from ioremap_bot. The vmalloc system will allocate
         * addresses from ioremap_bot through IOREMAP_END.
         */
        paligned = addr & PAGE_MASK;
        size = PAGE_ALIGN(addr + size) - paligned;

        if ((size == 0) || (paligned == 0))
                return NULL;

        if (slab_is_available()) {
                struct vm_struct *area;

                area = __get_vm_area_caller(size, VM_IOREMAP,
                                            ioremap_bot, IOREMAP_END,
                                            caller);
                if (area == NULL)
                        return NULL;

                area->phys_addr = paligned;
                ret = __ioremap_at(paligned, area->addr, size, flags);
                if (!ret)
                        vunmap(area->addr);
        } else {
                ret = __ioremap_at(paligned, (void *)ioremap_bot, size, flags);
                if (ret)
                        ioremap_bot += size;
        }

        if (ret)
                ret += addr & ~PAGE_MASK;
        return ret;
}

void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
                         unsigned long flags)
{
        return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
}

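/*
 * ioremap() creates a strictly non-cached, guarded mapping suitable
 * for MMIO registers, while ioremap_wc() below asks for the
 * write-combining variant. Both honour the platform's ppc_md.ioremap
 * hook when one is provided.
 */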
void __iomem * ioremap(phys_addr_t addr, unsigned long size)
{
        unsigned long flags = pgprot_val(pgprot_noncached(__pgprot(0)));
        void *caller = __builtin_return_address(0);

        if (ppc_md.ioremap)
                return ppc_md.ioremap(addr, size, flags, caller);
        return __ioremap_caller(addr, size, flags, caller);
}

void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size)
{
        unsigned long flags = pgprot_val(pgprot_noncached_wc(__pgprot(0)));
        void *caller = __builtin_return_address(0);

        if (ppc_md.ioremap)
                return ppc_md.ioremap(addr, size, flags, caller);
        return __ioremap_caller(addr, size, flags, caller);
}

void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
                            unsigned long flags)
{
        void *caller = __builtin_return_address(0);

        /* writeable implies dirty for kernel addresses */
        if (flags & _PAGE_WRITE)
                flags |= _PAGE_DIRTY;

        /* we don't want to let _PAGE_EXEC leak out */
        flags &= ~_PAGE_EXEC;
        /*
         * Force kernel mapping.
         */
#if defined(CONFIG_PPC_BOOK3S_64)
        flags |= _PAGE_PRIVILEGED;
#else
        flags &= ~_PAGE_USER;
#endif

#ifdef _PAGE_BAP_SR
        /*
         * _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE
         * format, which means that we just cleared supervisor
         * access... oops ;-) This restores it.
         */
        flags |= _PAGE_BAP_SR;
#endif

        if (ppc_md.ioremap)
                return ppc_md.ioremap(addr, size, flags, caller);
        return __ioremap_caller(addr, size, flags, caller);
}

/*
 * Unmap an IO region and remove it from the vmalloc'd list.
 * Access to IO memory should be serialized by the driver.
 */
void __iounmap(volatile void __iomem *token)
{
        void *addr;

        if (!slab_is_available())
                return;

        addr = (void *) ((unsigned long __force)
                         PCI_FIX_ADDR(token) & PAGE_MASK);
        if ((unsigned long)addr < ioremap_bot) {
                printk(KERN_WARNING "Attempt to iounmap early bolted mapping"
                       " at 0x%p\n", addr);
                return;
        }
        vunmap(addr);
}

void iounmap(volatile void __iomem *token)
{
        if (ppc_md.iounmap)
                ppc_md.iounmap(token);
        else
                __iounmap(token);
}

EXPORT_SYMBOL(ioremap);
EXPORT_SYMBOL(ioremap_wc);
EXPORT_SYMBOL(ioremap_prot);
EXPORT_SYMBOL(__ioremap);
EXPORT_SYMBOL(__ioremap_at);
EXPORT_SYMBOL(iounmap);
EXPORT_SYMBOL(__iounmap);
EXPORT_SYMBOL(__iounmap_at);

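/*
 * Helpers to find the struct page backing a page table entry: huge
 * entries embed the pfn directly, while regular entries hold the
 * virtual address of the next-level table.
 */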
#ifndef __PAGETABLE_PUD_FOLDED
/* 4 level page table */
struct page *pgd_page(pgd_t pgd)
{
        if (pgd_huge(pgd))
                return pte_page(pgd_pte(pgd));
        return virt_to_page(pgd_page_vaddr(pgd));
}
#endif

struct page *pud_page(pud_t pud)
{
        if (pud_huge(pud))
                return pte_page(pud_pte(pud));
        return virt_to_page(pud_page_vaddr(pud));
}

/*
 * For a hugepage we have the pfn in the pmd; we use PTE_RPN_SHIFT bits
 * for flags. For a PTE page, we have a PTE_FRAG_SIZE (4K) aligned
 * virtual address.
 */
struct page *pmd_page(pmd_t pmd)
{
        if (pmd_trans_huge(pmd) || pmd_huge(pmd))
                return pte_page(pmd_pte(pmd));
        return virt_to_page(pmd_page_vaddr(pmd));
}

#ifdef CONFIG_PPC_64K_PAGES
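/*
 * PTE fragment allocator: with 64K pages a full page is much larger
 * than a page table needs to be, so each backing page is carved into
 * PTE_FRAG_NR fragments of PTE_FRAG_SIZE (4K) bytes.
 * mm->context.pte_frag points at the next free fragment, and the
 * backing page's refcount counts the fragments still in use (see
 * pte_fragment_free() below).
 */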
static pte_t *get_from_cache(struct mm_struct *mm)
{
        void *pte_frag, *ret;

        spin_lock(&mm->page_table_lock);
        ret = mm->context.pte_frag;
        if (ret) {
                pte_frag = ret + PTE_FRAG_SIZE;
                /*
                 * If we have taken up all the fragments, mark the PTE
                 * page NULL.
                 */
                if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
                        pte_frag = NULL;
                mm->context.pte_frag = pte_frag;
        }
        spin_unlock(&mm->page_table_lock);
        return (pte_t *)ret;
}

static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
{
        void *ret = NULL;
        struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
                                       __GFP_REPEAT | __GFP_ZERO);
        if (!page)
                return NULL;
        if (!kernel && !pgtable_page_ctor(page)) {
                __free_page(page);
                return NULL;
        }

        ret = page_address(page);
        spin_lock(&mm->page_table_lock);
        /*
         * If someone else installed a fragment page meanwhile, hand
         * out our whole page as a single fragment: its refcount stays
         * 1, so it is freed as soon as that fragment is released.
         */
        if (likely(!mm->context.pte_frag)) {
                set_page_count(page, PTE_FRAG_NR);
                mm->context.pte_frag = ret + PTE_FRAG_SIZE;
        }
        spin_unlock(&mm->page_table_lock);

        return (pte_t *)ret;
}

pte_t *pte_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
{
        pte_t *pte;

        pte = get_from_cache(mm);
        if (pte)
                return pte;

        return __alloc_for_cache(mm, kernel);
}
#endif /* CONFIG_PPC_64K_PAGES */

void pte_fragment_free(unsigned long *table, int kernel)
{
        struct page *page = virt_to_page(table);

        if (put_page_testzero(page)) {
                if (!kernel)
                        pgtable_page_dtor(page);
                free_hot_cold_page(page, 0);
        }
}

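/*
 * On SMP the index size ("shift") is packed into the low bits of the
 * table pointer (free thanks to the tables' alignment) and the page is
 * queued through tlb_remove_table(), so lockless walkers such as
 * find_linux_pte_or_hugepte see it disappear only once they are done;
 * on UP it can be freed immediately. Shift 0 denotes a PTE fragment,
 * which needs special freeing.
 */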
#ifdef CONFIG_SMP
void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
{
        unsigned long pgf = (unsigned long)table;

        BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
        pgf |= shift;
        tlb_remove_table(tlb, (void *)pgf);
}

void __tlb_remove_table(void *_table)
{
        void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
        unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;

        if (!shift)
                /* PTE page needs special handling */
                pte_fragment_free(table, 0);
        else {
                BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
                kmem_cache_free(PGT_CACHE(shift), table);
        }
}
#else
void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
{
        if (!shift) {
                /* PTE page needs special handling */
                pte_fragment_free(table, 0);
        } else {
                BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
                kmem_cache_free(PGT_CACHE(shift), table);
        }
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

/*
 * This is called when relaxing access to a hugepage. It's also called
 * in the page fault path when we don't hit any of the major fault
 * cases, i.e., a minor update of _PAGE_ACCESSED, _PAGE_DIRTY, etc...
 * The generic code will have handled those two for us; we additionally
 * deal with missing execute permission here on some processors.
 */
int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
                          pmd_t *pmdp, pmd_t entry, int dirty)
{
        int changed;
#ifdef CONFIG_DEBUG_VM
        WARN_ON(!pmd_trans_huge(*pmdp));
        assert_spin_locked(&vma->vm_mm->page_table_lock);
#endif
        changed = !pmd_same(*(pmdp), entry);
        if (changed) {
                __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
                /*
                 * Since we are not supporting SW TLB systems, we don't
                 * have anything similar to flush_tlb_page_nohash().
                 */
        }
        return changed;
}

unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
                                  pmd_t *pmdp, unsigned long clr,
                                  unsigned long set)
{
        __be64 old_be, tmp;
        unsigned long old;

#ifdef CONFIG_DEBUG_VM
        WARN_ON(!pmd_trans_huge(*pmdp));
        assert_spin_locked(&mm->page_table_lock);
#endif

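        /*
         * ldarx/stdcx. loop: atomically clear the 'clr' bits and set
         * the 'set' bits in the PMD, spinning while H_PAGE_BUSY is
         * set. Operands go through cpu_to_be64() because the Linux
         * page table entries are stored big-endian.
         */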
        __asm__ __volatile__(
        "1:     ldarx   %0,0,%3\n\
                and.    %1,%0,%6\n\
                bne-    1b \n\
                andc    %1,%0,%4 \n\
                or      %1,%1,%7\n\
                stdcx.  %1,0,%3 \n\
                bne-    1b"
        : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
        : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
          "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
        : "cc" );

        old = be64_to_cpu(old_be);

        trace_hugepage_update(addr, old, clr, set);
        if (old & H_PAGE_HASHPTE)
                hpte_do_hugepage_flush(mm, addr, pmdp, old);
        return old;
}

pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
                          pmd_t *pmdp)
{
        pmd_t pmd;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        VM_BUG_ON(pmd_trans_huge(*pmdp));

        pmd = *pmdp;
        pmd_clear(pmdp);
        /*
         * Wait for all pending hash_page to finish. This is needed
         * in case of subpage collapse. When we collapse normal pages
         * to hugepage, we first clear the pmd, then invalidate all
         * the PTE entries. The assumption here is that any low level
         * page fault will see a none pmd and take the slow path that
         * will wait on mmap_sem. But we could very well be in a
         * hash_page with local ptep pointer value. Such a hash_page
         * can result in adding new HPTE entries for normal subpages.
         * That means we could be modifying the page content as we
         * copy them to a huge page. So wait for parallel hash_page
         * to finish before invalidating HPTE entries. We can do this
         * by sending an IPI to all the cpus and executing a dummy
         * function there.
         */
        kick_all_cpus_sync();
        /*
         * Now invalidate the hpte entries in the range
         * covered by pmd. This makes sure we take a
         * fault and will find the pmd as none, which will
         * result in a major fault which takes mmap_sem and
         * hence waits for collapse to complete. Without this
         * the __collapse_huge_page_copy can result in copying
         * the old content.
         */
        flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
        return pmd;
}

/*
 * We currently remove entries from the hashtable regardless of whether
 * the entry was young or dirty.
 *
 * We should be more intelligent about this but for the moment we
 * override these functions and force a tlb flush unconditionally.
 */
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long address, pmd_t *pmdp)
{
        return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
}

/*
 * We want to put the pgtable in pmd and use pgtable for tracking
 * the base page size hptes.
 */
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                pgtable_t pgtable)
{
        pgtable_t *pgtable_slot;

        assert_spin_locked(&mm->page_table_lock);
        /*
         * we store the pgtable in the second half of PMD
         */
        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
        *pgtable_slot = pgtable;
        /*
         * Expose the deposited pgtable to other cpus before we set
         * the hugepage PTE at pmd level; the hash fault code looks at
         * the deposited pgtable to store hash index values.
         */
        smp_wmb();
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
        pgtable_t pgtable;
        pgtable_t *pgtable_slot;

        assert_spin_locked(&mm->page_table_lock);
        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
        pgtable = *pgtable_slot;
        /*
         * Once we withdraw, mark the entry NULL.
         */
        *pgtable_slot = NULL;
        /*
         * We store HPTE information in the deposited PTE fragment;
         * zero out the content on withdraw.
         */
        memset(pgtable, 0, PTE_FRAG_SIZE);
        return pgtable;
}

void pmdp_huge_split_prepare(struct vm_area_struct *vma,
                             unsigned long address, pmd_t *pmdp)
{
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);

        /*
         * We can't mark the pmd none here, because that will cause a
         * race against exit_mmap. We need to continue marking the pmd
         * TRANS HUGE while we split, but at the same time we want the
         * rest of the ppc64 code not to insert a hash pte on this,
         * because we will be modifying the deposited pgtable in the
         * caller of this function. Hence set _PAGE_PRIVILEGED (removing
         * user access) so that we move the fault handling to a higher
         * level function, which will serialize against the ptl. We
         * need to flush the existing hash pte entries here even though
         * the translation is still valid, because we will withdraw the
         * pgtable_t after this.
         */
        pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED);
}

/*
 * Set a new huge pmd. We should not be called for updating
 * an existing pmd entry. That should go via pmd_hugepage_update.
 */
void set_pmd_at(struct mm_struct *mm, unsigned long addr,
                pmd_t *pmdp, pmd_t pmd)
{
#ifdef CONFIG_DEBUG_VM
        WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
        assert_spin_locked(&mm->page_table_lock);
        WARN_ON(!pmd_trans_huge(pmd));
#endif
        trace_hugepage_set_pmd(addr, pmd_val(pmd));
        return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
}

/*
 * We use this to invalidate a pmdp entry before switching from a
 * hugepte to regular pmd entry.
 */
void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                     pmd_t *pmdp)
{
        pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);

        /*
         * This ensures that generic code that relies on IRQ disabling
         * to prevent a parallel THP split works as expected.
         */
        kick_all_cpus_sync();
}

/*
 * A linux hugepage PMD was changed and the corresponding hash table
 * entries need to be flushed.
 */
void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
                            pmd_t *pmdp, unsigned long old_pmd)
{
        int ssize;
        unsigned int psize;
        unsigned long vsid;
        unsigned long flags = 0;
        const struct cpumask *tmp;

        /* get the base page size, vsid and segment size */
#ifdef CONFIG_DEBUG_VM
        psize = get_slice_psize(mm, addr);
        BUG_ON(psize == MMU_PAGE_16M);
#endif
        if (old_pmd & H_PAGE_COMBO)
                psize = MMU_PAGE_4K;
        else
                psize = MMU_PAGE_64K;

        if (!is_kernel_addr(addr)) {
                ssize = user_segment_size(addr);
                vsid = get_vsid(mm->context.id, addr, ssize);
                WARN_ON(vsid == 0);
        } else {
                vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
                ssize = mmu_kernel_ssize;
        }

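        /*
         * If this mm has only ever run on the current CPU, the stale
         * HPTEs can be flushed with a local (non-broadcast)
         * invalidate.
         */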
        tmp = cpumask_of(smp_processor_id());
        if (cpumask_equal(mm_cpumask(mm), tmp))
                flags |= HPTE_LOCAL_UPDATE;

        return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
}

static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
{
        return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
}

pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
{
        unsigned long pmdv;

        pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
        return pmd_set_protbits(__pmd(pmdv), pgprot);
}

pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
{
        return pfn_pmd(page_to_pfn(page), pgprot);
}

pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
        unsigned long pmdv;

        pmdv = pmd_val(pmd);
        pmdv &= _HPAGE_CHG_MASK;
        return pmd_set_protbits(__pmd(pmdv), newprot);
}

/*
 * This is called at the end of handling a user page fault, when the
 * fault has been handled by updating a HUGE PMD entry in the linux
 * page tables. We use it to preload an HPTE into the hash table
 * corresponding to the updated linux HUGE PMD entry.
 */
void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
                          pmd_t *pmd)
{
        return;
}

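/*
 * Clear a huge PMD and return its old value. We also scrub the
 * deposited PTE fragment, which caches hash slot information, and
 * synchronize with lockless page table walkers before returning.
 */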
pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
                              unsigned long addr, pmd_t *pmdp)
{
        pmd_t old_pmd;
        pgtable_t pgtable;
        unsigned long old;
        pgtable_t *pgtable_slot;

        old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
        old_pmd = __pmd(old);
        /*
         * We have pmd == none and we are holding page_table_lock.
         * So we can safely go and clear the pgtable hash
         * index info.
         */
        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
        pgtable = *pgtable_slot;
        /*
         * Zero out the old valid bit and hash index details; the
         * hash fault path looks at them.
         */
        memset(pgtable, 0, PTE_FRAG_SIZE);
        /*
         * Serialize against find_linux_pte_or_hugepte which does
         * lock-less lookup in page tables with local interrupts
         * disabled. For huge pages it casts pmd_t to pte_t. Since
         * format of pte_t is different from pmd_t we want to prevent
         * transit from pmd pointing to page table to pmd pointing to
         * huge page (and back) while interrupts are disabled. We
         * clear pmd to possibly replace it with page table pointer
         * in different code paths. So make sure we wait for the
         * parallel find_linux_pte_or_hugepte to finish.
         */
        kick_all_cpus_sync();
        return old_pmd;
}

int has_transparent_hugepage(void)
{
        if (!mmu_has_feature(MMU_FTR_16M_PAGE))
                return 0;
        /*
         * We support THP only if PMD_SIZE is 16MB.
         */
        if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
                return 0;
        /*
         * We need to make sure that we support 16MB hugepage in a
         * segment with base page size 64K or 4K. We only enable THP
         * with a PAGE_SIZE of 64K.
         */
        /*
         * If we have 64K HPTE, we will be using that by default
         */
        if (mmu_psize_defs[MMU_PAGE_64K].shift &&
            (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
                return 0;
        /*
         * Ok we only have 4K HPTE
         */
        if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
                return 0;

        return 1;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */