]>
Commit | Line | Data |
---|---|---|
14cf11af PM |
1 | /* |
2 | * This file contains ioremap and related functions for 64-bit machines. | |
3 | * | |
4 | * Derived from arch/ppc64/mm/init.c | |
5 | * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) | |
6 | * | |
7 | * Modifications by Paul Mackerras (PowerMac) (paulus@samba.org) | |
8 | * and Cort Dougan (PReP) (cort@cs.nmt.edu) | |
9 | * Copyright (C) 1996 Paul Mackerras | |
14cf11af PM |
10 | * |
11 | * Derived from "arch/i386/mm/init.c" | |
12 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | |
13 | * | |
14 | * Dave Engebretsen <engebret@us.ibm.com> | |
15 | * Rework for PPC64 port. | |
16 | * | |
17 | * This program is free software; you can redistribute it and/or | |
18 | * modify it under the terms of the GNU General Public License | |
19 | * as published by the Free Software Foundation; either version | |
20 | * 2 of the License, or (at your option) any later version. | |
21 | * | |
22 | */ | |
23 | ||
14cf11af PM |
24 | #include <linux/signal.h> |
25 | #include <linux/sched.h> | |
26 | #include <linux/kernel.h> | |
27 | #include <linux/errno.h> | |
28 | #include <linux/string.h> | |
66b15db6 | 29 | #include <linux/export.h> |
14cf11af PM |
30 | #include <linux/types.h> |
31 | #include <linux/mman.h> | |
32 | #include <linux/mm.h> | |
33 | #include <linux/swap.h> | |
34 | #include <linux/stddef.h> | |
35 | #include <linux/vmalloc.h> | |
36 | #include <linux/init.h> | |
a245067e | 37 | #include <linux/bootmem.h> |
95f72d1e | 38 | #include <linux/memblock.h> |
5a0e3ad6 | 39 | #include <linux/slab.h> |
14cf11af PM |
40 | |
41 | #include <asm/pgalloc.h> | |
42 | #include <asm/page.h> | |
43 | #include <asm/prom.h> | |
14cf11af PM |
44 | #include <asm/io.h> |
45 | #include <asm/mmu_context.h> | |
46 | #include <asm/pgtable.h> | |
47 | #include <asm/mmu.h> | |
14cf11af PM |
48 | #include <asm/smp.h> |
49 | #include <asm/machdep.h> | |
50 | #include <asm/tlb.h> | |
14cf11af | 51 | #include <asm/processor.h> |
14cf11af | 52 | #include <asm/cputable.h> |
14cf11af | 53 | #include <asm/sections.h> |
5e203d68 | 54 | #include <asm/firmware.h> |
800fc3ee DG |
55 | |
56 | #include "mmu_decl.h" | |
14cf11af | 57 | |
78f1dbde AK |
/*
 * Some sanity checking, done at build time: the 64-bit user address space
 * must fit in the range covered by the page tables and, when
 * CONFIG_PPC_STD_MMU_64 is set, in the range covered by
 * ESID_BITS + SID_SHIFT address bits (the user VSID range).
 */
#if TASK_SIZE_USER64 > PGTABLE_RANGE
#error TASK_SIZE_USER64 exceeds pagetable range
#endif

#ifdef CONFIG_PPC_STD_MMU_64
#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
#error TASK_SIZE_USER64 exceeds user VSID range
#endif
#endif
14cf11af | 68 | |
/*
 * Virtual address used for the next early ioremap mapping. It starts at
 * IOREMAP_BASE and __ioremap_caller() advances it for every mapping set up
 * before mem_init_done. Once mem_init_done is set it is the lower bound of
 * the range handed to __get_vm_area_caller(), and __iounmap() refuses to
 * unmap anything below it.
 */
unsigned long ioremap_bot = IOREMAP_BASE;
a245067e BH |
70 | |
71 | #ifdef CONFIG_PPC_MMU_NOHASH | |
72 | static void *early_alloc_pgtable(unsigned long size) | |
73 | { | |
74 | void *pt; | |
75 | ||
76 | if (init_bootmem_done) | |
77 | pt = __alloc_bootmem(size, size, __pa(MAX_DMA_ADDRESS)); | |
78 | else | |
95f72d1e | 79 | pt = __va(memblock_alloc_base(size, size, |
a245067e BH |
80 | __pa(MAX_DMA_ADDRESS))); |
81 | memset(pt, 0, size); | |
82 | ||
83 | return pt; | |
84 | } | |
85 | #endif /* CONFIG_PPC_MMU_NOHASH */ | |
86 | ||
14cf11af | 87 | /* |
a245067e BH |
88 | * map_kernel_page currently only called by __ioremap |
89 | * map_kernel_page adds an entry to the ioremap page table | |
14cf11af PM |
90 | * and adds an entry to the HPT, possibly bolting it |
91 | */ | |
32a74949 | 92 | int map_kernel_page(unsigned long ea, unsigned long pa, int flags) |
14cf11af PM |
93 | { |
94 | pgd_t *pgdp; | |
95 | pud_t *pudp; | |
96 | pmd_t *pmdp; | |
97 | pte_t *ptep; | |
14cf11af | 98 | |
a245067e | 99 | if (slab_is_available()) { |
14cf11af PM |
100 | pgdp = pgd_offset_k(ea); |
101 | pudp = pud_alloc(&init_mm, pgdp, ea); | |
102 | if (!pudp) | |
103 | return -ENOMEM; | |
104 | pmdp = pmd_alloc(&init_mm, pudp, ea); | |
105 | if (!pmdp) | |
106 | return -ENOMEM; | |
23fd0775 | 107 | ptep = pte_alloc_kernel(pmdp, ea); |
14cf11af PM |
108 | if (!ptep) |
109 | return -ENOMEM; | |
110 | set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, | |
111 | __pgprot(flags))); | |
14cf11af | 112 | } else { |
a245067e BH |
113 | #ifdef CONFIG_PPC_MMU_NOHASH |
114 | /* Warning ! This will blow up if bootmem is not initialized | |
115 | * which our ppc64 code is keen to do that, we'll need to | |
116 | * fix it and/or be more careful | |
117 | */ | |
118 | pgdp = pgd_offset_k(ea); | |
119 | #ifdef PUD_TABLE_SIZE | |
120 | if (pgd_none(*pgdp)) { | |
121 | pudp = early_alloc_pgtable(PUD_TABLE_SIZE); | |
122 | BUG_ON(pudp == NULL); | |
123 | pgd_populate(&init_mm, pgdp, pudp); | |
124 | } | |
125 | #endif /* PUD_TABLE_SIZE */ | |
126 | pudp = pud_offset(pgdp, ea); | |
127 | if (pud_none(*pudp)) { | |
128 | pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); | |
129 | BUG_ON(pmdp == NULL); | |
130 | pud_populate(&init_mm, pudp, pmdp); | |
131 | } | |
132 | pmdp = pmd_offset(pudp, ea); | |
133 | if (!pmd_present(*pmdp)) { | |
134 | ptep = early_alloc_pgtable(PAGE_SIZE); | |
135 | BUG_ON(ptep == NULL); | |
136 | pmd_populate_kernel(&init_mm, pmdp, ptep); | |
137 | } | |
138 | ptep = pte_offset_kernel(pmdp, ea); | |
139 | set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, | |
140 | __pgprot(flags))); | |
141 | #else /* CONFIG_PPC_MMU_NOHASH */ | |
14cf11af PM |
142 | /* |
143 | * If the mm subsystem is not fully up, we cannot create a | |
144 | * linux page table entry for this mapping. Simply bolt an | |
145 | * entry in the hardware page table. | |
3c726f8d | 146 | * |
14cf11af | 147 | */ |
1189be65 PM |
148 | if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags, |
149 | mmu_io_psize, mmu_kernel_ssize)) { | |
77ac166f BH |
150 | printk(KERN_ERR "Failed to do bolted mapping IO " |
151 | "memory at %016lx !\n", pa); | |
152 | return -ENOMEM; | |
153 | } | |
a245067e | 154 | #endif /* !CONFIG_PPC_MMU_NOHASH */ |
14cf11af PM |
155 | } |
156 | return 0; | |
157 | } | |
158 | ||
159 | ||
3d5134ee BH |
160 | /** |
161 | * __ioremap_at - Low level function to establish the page tables | |
162 | * for an IO mapping | |
163 | */ | |
164 | void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size, | |
14cf11af PM |
165 | unsigned long flags) |
166 | { | |
167 | unsigned long i; | |
168 | ||
a1f242ff | 169 | /* Make sure we have the base flags */ |
14cf11af PM |
170 | if ((flags & _PAGE_PRESENT) == 0) |
171 | flags |= pgprot_val(PAGE_KERNEL); | |
172 | ||
a1f242ff BH |
173 | /* Non-cacheable page cannot be coherent */ |
174 | if (flags & _PAGE_NO_CACHE) | |
175 | flags &= ~_PAGE_COHERENT; | |
176 | ||
177 | /* We don't support the 4K PFN hack with ioremap */ | |
178 | if (flags & _PAGE_4K_PFN) | |
179 | return NULL; | |
180 | ||
3d5134ee BH |
181 | WARN_ON(pa & ~PAGE_MASK); |
182 | WARN_ON(((unsigned long)ea) & ~PAGE_MASK); | |
183 | WARN_ON(size & ~PAGE_MASK); | |
184 | ||
14cf11af | 185 | for (i = 0; i < size; i += PAGE_SIZE) |
a245067e | 186 | if (map_kernel_page((unsigned long)ea+i, pa+i, flags)) |
14cf11af PM |
187 | return NULL; |
188 | ||
3d5134ee BH |
189 | return (void __iomem *)ea; |
190 | } | |
191 | ||
/**
 * __iounmap_at - Low level function to tear down the page tables
 *                for an IO mapping. This is used for mappings that
 *                are manipulated manually, like partial unmapping of
 *                PCI IOs or ISA space.
 * @ea:   virtual start address of the range to unmap
 * @size: length of the range in bytes
 */
void __iounmap_at(void *ea, unsigned long size)
{
	/* Warn if the address or the size is not page aligned */
	WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
	WARN_ON(size & ~PAGE_MASK);

	unmap_kernel_range((unsigned long)ea, size);
}
205 | ||
1cdab55d BH |
206 | void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size, |
207 | unsigned long flags, void *caller) | |
14cf11af | 208 | { |
3d5134ee | 209 | phys_addr_t paligned; |
14cf11af PM |
210 | void __iomem *ret; |
211 | ||
212 | /* | |
213 | * Choose an address to map it to. | |
214 | * Once the imalloc system is running, we use it. | |
215 | * Before that, we map using addresses going | |
216 | * up from ioremap_bot. imalloc will use | |
217 | * the addresses from ioremap_bot through | |
218 | * IMALLOC_END | |
219 | * | |
220 | */ | |
3d5134ee BH |
221 | paligned = addr & PAGE_MASK; |
222 | size = PAGE_ALIGN(addr + size) - paligned; | |
14cf11af | 223 | |
3d5134ee | 224 | if ((size == 0) || (paligned == 0)) |
14cf11af PM |
225 | return NULL; |
226 | ||
227 | if (mem_init_done) { | |
228 | struct vm_struct *area; | |
3d5134ee | 229 | |
1cdab55d BH |
230 | area = __get_vm_area_caller(size, VM_IOREMAP, |
231 | ioremap_bot, IOREMAP_END, | |
232 | caller); | |
14cf11af PM |
233 | if (area == NULL) |
234 | return NULL; | |
7a9d1256 ME |
235 | |
236 | area->phys_addr = paligned; | |
3d5134ee | 237 | ret = __ioremap_at(paligned, area->addr, size, flags); |
14cf11af | 238 | if (!ret) |
3d5134ee | 239 | vunmap(area->addr); |
14cf11af | 240 | } else { |
3d5134ee | 241 | ret = __ioremap_at(paligned, (void *)ioremap_bot, size, flags); |
14cf11af PM |
242 | if (ret) |
243 | ioremap_bot += size; | |
244 | } | |
3d5134ee BH |
245 | |
246 | if (ret) | |
247 | ret += addr & ~PAGE_MASK; | |
14cf11af PM |
248 | return ret; |
249 | } | |
250 | ||
1cdab55d BH |
251 | void __iomem * __ioremap(phys_addr_t addr, unsigned long size, |
252 | unsigned long flags) | |
253 | { | |
254 | return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); | |
255 | } | |
4cb3cee0 | 256 | |
68a64357 | 257 | void __iomem * ioremap(phys_addr_t addr, unsigned long size) |
4cb3cee0 BH |
258 | { |
259 | unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED; | |
1cdab55d | 260 | void *caller = __builtin_return_address(0); |
4cb3cee0 BH |
261 | |
262 | if (ppc_md.ioremap) | |
1cdab55d BH |
263 | return ppc_md.ioremap(addr, size, flags, caller); |
264 | return __ioremap_caller(addr, size, flags, caller); | |
4cb3cee0 BH |
265 | } |
266 | ||
be135f40 AB |
267 | void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size) |
268 | { | |
269 | unsigned long flags = _PAGE_NO_CACHE; | |
270 | void *caller = __builtin_return_address(0); | |
271 | ||
272 | if (ppc_md.ioremap) | |
273 | return ppc_md.ioremap(addr, size, flags, caller); | |
274 | return __ioremap_caller(addr, size, flags, caller); | |
275 | } | |
276 | ||
40f1ce7f | 277 | void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, |
4cb3cee0 BH |
278 | unsigned long flags) |
279 | { | |
1cdab55d BH |
280 | void *caller = __builtin_return_address(0); |
281 | ||
a1f242ff BH |
282 | /* writeable implies dirty for kernel addresses */ |
283 | if (flags & _PAGE_RW) | |
284 | flags |= _PAGE_DIRTY; | |
285 | ||
286 | /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ | |
287 | flags &= ~(_PAGE_USER | _PAGE_EXEC); | |
288 | ||
55052eec BH |
289 | #ifdef _PAGE_BAP_SR |
290 | /* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format | |
291 | * which means that we just cleared supervisor access... oops ;-) This | |
292 | * restores it | |
293 | */ | |
294 | flags |= _PAGE_BAP_SR; | |
295 | #endif | |
296 | ||
4cb3cee0 | 297 | if (ppc_md.ioremap) |
1cdab55d BH |
298 | return ppc_md.ioremap(addr, size, flags, caller); |
299 | return __ioremap_caller(addr, size, flags, caller); | |
4cb3cee0 BH |
300 | } |
301 | ||
302 | ||
14cf11af PM |
303 | /* |
304 | * Unmap an IO region and remove it from imalloc'd list. | |
305 | * Access to IO memory should be serialized by driver. | |
14cf11af | 306 | */ |
68a64357 | 307 | void __iounmap(volatile void __iomem *token) |
14cf11af PM |
308 | { |
309 | void *addr; | |
310 | ||
311 | if (!mem_init_done) | |
312 | return; | |
313 | ||
3d5134ee BH |
314 | addr = (void *) ((unsigned long __force) |
315 | PCI_FIX_ADDR(token) & PAGE_MASK); | |
316 | if ((unsigned long)addr < ioremap_bot) { | |
317 | printk(KERN_WARNING "Attempt to iounmap early bolted mapping" | |
318 | " at 0x%p\n", addr); | |
319 | return; | |
320 | } | |
321 | vunmap(addr); | |
14cf11af PM |
322 | } |
323 | ||
68a64357 | 324 | void iounmap(volatile void __iomem *token) |
4cb3cee0 BH |
325 | { |
326 | if (ppc_md.iounmap) | |
327 | ppc_md.iounmap(token); | |
328 | else | |
329 | __iounmap(token); | |
330 | } | |
331 | ||
/* The ioremap/iounmap entry points defined above, exported for module use */
EXPORT_SYMBOL(ioremap);
EXPORT_SYMBOL(ioremap_wc);
EXPORT_SYMBOL(ioremap_prot);
EXPORT_SYMBOL(__ioremap);
EXPORT_SYMBOL(__ioremap_at);
EXPORT_SYMBOL(iounmap);
EXPORT_SYMBOL(__iounmap);
EXPORT_SYMBOL(__iounmap_at);
5c1f6ee9 | 340 | |
074c2eae AK |
341 | /* |
342 | * For hugepage we have pfn in the pmd, we use PTE_RPN_SHIFT bits for flags | |
343 | * For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address. | |
344 | */ | |
345 | struct page *pmd_page(pmd_t pmd) | |
346 | { | |
347 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | |
348 | if (pmd_trans_huge(pmd)) | |
349 | return pfn_to_page(pmd_pfn(pmd)); | |
350 | #endif | |
351 | return virt_to_page(pmd_page_vaddr(pmd)); | |
352 | } | |
353 | ||
5c1f6ee9 AK |
354 | #ifdef CONFIG_PPC_64K_PAGES |
355 | static pte_t *get_from_cache(struct mm_struct *mm) | |
356 | { | |
357 | void *pte_frag, *ret; | |
358 | ||
359 | spin_lock(&mm->page_table_lock); | |
360 | ret = mm->context.pte_frag; | |
361 | if (ret) { | |
362 | pte_frag = ret + PTE_FRAG_SIZE; | |
363 | /* | |
364 | * If we have taken up all the fragments mark PTE page NULL | |
365 | */ | |
366 | if (((unsigned long)pte_frag & ~PAGE_MASK) == 0) | |
367 | pte_frag = NULL; | |
368 | mm->context.pte_frag = pte_frag; | |
369 | } | |
370 | spin_unlock(&mm->page_table_lock); | |
371 | return (pte_t *)ret; | |
372 | } | |
373 | ||
374 | static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel) | |
375 | { | |
376 | void *ret = NULL; | |
377 | struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | | |
378 | __GFP_REPEAT | __GFP_ZERO); | |
379 | if (!page) | |
380 | return NULL; | |
381 | ||
382 | ret = page_address(page); | |
383 | spin_lock(&mm->page_table_lock); | |
384 | /* | |
385 | * If we find pgtable_page set, we return | |
386 | * the allocated page with single fragement | |
387 | * count. | |
388 | */ | |
389 | if (likely(!mm->context.pte_frag)) { | |
390 | atomic_set(&page->_count, PTE_FRAG_NR); | |
391 | mm->context.pte_frag = ret + PTE_FRAG_SIZE; | |
392 | } | |
393 | spin_unlock(&mm->page_table_lock); | |
394 | ||
395 | if (!kernel) | |
396 | pgtable_page_ctor(page); | |
397 | ||
398 | return (pte_t *)ret; | |
399 | } | |
400 | ||
401 | pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel) | |
402 | { | |
403 | pte_t *pte; | |
404 | ||
405 | pte = get_from_cache(mm); | |
406 | if (pte) | |
407 | return pte; | |
408 | ||
409 | return __alloc_for_cache(mm, kernel); | |
410 | } | |
411 | ||
/*
 * Drop one reference on the page backing the PTE fragment @table. When that
 * was the last reference the page is freed, after pgtable_page_dtor() for
 * non-kernel page tables.
 */
void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
{
	struct page *pg = virt_to_page(table);

	if (!put_page_testzero(pg))
		return;

	if (!kernel)
		pgtable_page_dtor(pg);
	free_hot_cold_page(pg, 0);
}
421 | ||
422 | #ifdef CONFIG_SMP | |
/*
 * Drop one reference on the page backing the PTE fragment @table; on the
 * last reference run pgtable_page_dtor() and free the page.
 */
static void page_table_free_rcu(void *table)
{
	struct page *pg = virt_to_page(table);

	if (!put_page_testzero(pg))
		return;

	pgtable_page_dtor(pg);
	free_hot_cold_page(pg, 0);
}
431 | ||
432 | void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) | |
433 | { | |
434 | unsigned long pgf = (unsigned long)table; | |
435 | ||
436 | BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); | |
437 | pgf |= shift; | |
438 | tlb_remove_table(tlb, (void *)pgf); | |
439 | } | |
440 | ||
441 | void __tlb_remove_table(void *_table) | |
442 | { | |
443 | void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); | |
444 | unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; | |
445 | ||
446 | if (!shift) | |
447 | /* PTE page needs special handling */ | |
448 | page_table_free_rcu(table); | |
449 | else { | |
450 | BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); | |
451 | kmem_cache_free(PGT_CACHE(shift), table); | |
452 | } | |
453 | } | |
454 | #else | |
455 | void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) | |
456 | { | |
457 | if (!shift) { | |
458 | /* PTE page needs special handling */ | |
459 | struct page *page = virt_to_page(table); | |
460 | if (put_page_testzero(page)) { | |
461 | pgtable_page_dtor(page); | |
462 | free_hot_cold_page(page, 0); | |
463 | } | |
464 | } else { | |
465 | BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); | |
466 | kmem_cache_free(PGT_CACHE(shift), table); | |
467 | } | |
468 | } | |
469 | #endif | |
470 | #endif /* CONFIG_PPC_64K_PAGES */ | |
074c2eae AK |
471 | |
472 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | |
473 | ||
474 | /* | |
475 | * This is called when relaxing access to a hugepage. It's also called in the page | |
476 | * fault path when we don't hit any of the major fault cases, ie, a minor | |
477 | * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have | |
478 | * handled those two for us, we additionally deal with missing execute | |
479 | * permission here on some processors | |
480 | */ | |
481 | int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, | |
482 | pmd_t *pmdp, pmd_t entry, int dirty) | |
483 | { | |
484 | int changed; | |
485 | #ifdef CONFIG_DEBUG_VM | |
486 | WARN_ON(!pmd_trans_huge(*pmdp)); | |
487 | assert_spin_locked(&vma->vm_mm->page_table_lock); | |
488 | #endif | |
489 | changed = !pmd_same(*(pmdp), entry); | |
490 | if (changed) { | |
491 | __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry)); | |
492 | /* | |
493 | * Since we are not supporting SW TLB systems, we don't | |
494 | * have any thing similar to flush_tlb_page_nohash() | |
495 | */ | |
496 | } | |
497 | return changed; | |
498 | } | |
499 | ||
/*
 * Clear the bits given in @clr from the huge pmd at @pmdp and return the
 * value the pmd had before the update. If the old value had _PAGE_HASHPTE
 * set, the hash page table entries for the hugepage are flushed through
 * hpte_do_hugepage_flush().
 */
unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
				  pmd_t *pmdp, unsigned long clr)
{

	unsigned long old, tmp;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pmd_trans_huge(*pmdp));
	assert_spin_locked(&mm->page_table_lock);
#endif

#ifdef PTE_ATOMIC_UPDATES
	/*
	 * ldarx/stdcx. loop: reload while _PAGE_BUSY is set in the loaded
	 * value, then store (old & ~clr); start over if the store fails.
	 */
	__asm__ __volatile__(
	"1:	ldarx	%0,0,%3\n\
		andi.	%1,%0,%6\n\
		bne-	1b \n\
		andc	%1,%0,%4 \n\
		stdcx.	%1,0,%3 \n\
		bne-	1b"
	: "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
	: "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY)
	: "cc" );
#else
	/* Plain read-modify-write of the pmd */
	old = pmd_val(*pmdp);
	*pmdp = __pmd(old & ~clr);
#endif
	if (old & _PAGE_HASHPTE)
		hpte_do_hugepage_flush(mm, addr, pmdp);
	return old;
}
530 | ||
/*
 * Clear the pmd at @pmdp and return its previous value. @address must be
 * aligned to HPAGE_PMD_SIZE.
 *
 * A transparent hugepage pmd goes through pmdp_get_and_clear(). A normal
 * pmd (the khugepaged collapse case) is cleared with pmd_clear(), all CPUs
 * are kicked, and the range covered by the old pmd is flushed.
 */
pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
		       pmd_t *pmdp)
{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (pmd_trans_huge(*pmdp)) {
		pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
	} else {
		/*
		 * khugepaged calls this for normal pmd
		 */
		pmd = *pmdp;
		pmd_clear(pmdp);
		/*
		 * Wait for all pending hash_page to finish. This is needed
		 * in case of subpage collapse. When we collapse normal pages
		 * to hugepage, we first clear the pmd, then invalidate all
		 * the PTE entries. The assumption here is that any low level
		 * page fault will see a none pmd and take the slow path that
		 * will wait on mmap_sem. But we could very well be in a
		 * hash_page with local ptep pointer value. Such a hash page
		 * can result in adding new HPTE entries for normal subpages.
		 * That means we could be modifying the page content as we
		 * copy them to a huge page. So wait for parallel hash_page
		 * to finish before invalidating HPTE entries. We can do this
		 * by sending an IPI to all the cpus and executing a dummy
		 * function there.
		 */
		kick_all_cpus_sync();
		/*
		 * Now invalidate the hpte entries in the range
		 * covered by pmd. This makes sure we take a
		 * fault and will find the pmd as none, which will
		 * result in a major fault which takes mmap_sem and
		 * hence waits for collapse to complete. Without this
		 * the __collapse_huge_page_copy can result in copying
		 * the old content.
		 */
		flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
	}
	return pmd;
}
574 | ||
575 | int pmdp_test_and_clear_young(struct vm_area_struct *vma, | |
576 | unsigned long address, pmd_t *pmdp) | |
577 | { | |
578 | return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); | |
579 | } | |
580 | ||
581 | /* | |
582 | * We currently remove entries from the hashtable regardless of whether | |
583 | * the entry was young or dirty. The generic routines only flush if the | |
584 | * entry was young or dirty which is not good enough. | |
585 | * | |
586 | * We should be more intelligent about this but for the moment we override | |
587 | * these functions and force a tlb flush unconditionally | |
588 | */ | |
589 | int pmdp_clear_flush_young(struct vm_area_struct *vma, | |
590 | unsigned long address, pmd_t *pmdp) | |
591 | { | |
592 | return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); | |
593 | } | |
594 | ||
/*
 * We mark the pmd splitting and invalidate all the hpte
 * entries for this hugepage.
 *
 * @address must be aligned to HPAGE_PMD_SIZE. The flush is only done when
 * _PAGE_SPLITTING was not already set and the old pmd had _PAGE_HASHPTE.
 */
void pmdp_splitting_flush(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp)
{
	unsigned long old, tmp;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pmd_trans_huge(*pmdp));
	assert_spin_locked(&vma->vm_mm->page_table_lock);
#endif

#ifdef PTE_ATOMIC_UPDATES

	/*
	 * ldarx/stdcx. loop: reload while _PAGE_BUSY is set in the loaded
	 * value, then store (old | _PAGE_SPLITTING); start over if the
	 * store fails.
	 */
	__asm__ __volatile__(
	"1:	ldarx	%0,0,%3\n\
		andi.	%1,%0,%6\n\
		bne-	1b \n\
		ori	%1,%0,%4 \n\
		stdcx.	%1,0,%3 \n\
		bne-	1b"
	: "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
	: "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
	: "cc" );
#else
	/* Plain read-modify-write of the pmd */
	old = pmd_val(*pmdp);
	*pmdp = __pmd(old | _PAGE_SPLITTING);
#endif
	/*
	 * If we didn't have the splitting flag set, go and flush the
	 * HPTE entries.
	 */
	if (!(old & _PAGE_SPLITTING)) {
		/* We need to flush the hpte */
		if (old & _PAGE_HASHPTE)
			hpte_do_hugepage_flush(vma->vm_mm, address, pmdp);
	}
}
637 | ||
638 | /* | |
639 | * We want to put the pgtable in pmd and use pgtable for tracking | |
640 | * the base page size hptes | |
641 | */ | |
642 | void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, | |
643 | pgtable_t pgtable) | |
644 | { | |
645 | pgtable_t *pgtable_slot; | |
646 | assert_spin_locked(&mm->page_table_lock); | |
647 | /* | |
648 | * we store the pgtable in the second half of PMD | |
649 | */ | |
650 | pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; | |
651 | *pgtable_slot = pgtable; | |
652 | /* | |
653 | * expose the deposited pgtable to other cpus. | |
654 | * before we set the hugepage PTE at pmd level | |
655 | * hash fault code looks at the deposted pgtable | |
656 | * to store hash index values. | |
657 | */ | |
658 | smp_wmb(); | |
659 | } | |
660 | ||
661 | pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) | |
662 | { | |
663 | pgtable_t pgtable; | |
664 | pgtable_t *pgtable_slot; | |
665 | ||
666 | assert_spin_locked(&mm->page_table_lock); | |
667 | pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; | |
668 | pgtable = *pgtable_slot; | |
669 | /* | |
670 | * Once we withdraw, mark the entry NULL. | |
671 | */ | |
672 | *pgtable_slot = NULL; | |
673 | /* | |
674 | * We store HPTE information in the deposited PTE fragment. | |
675 | * zero out the content on withdraw. | |
676 | */ | |
677 | memset(pgtable, 0, PTE_FRAG_SIZE); | |
678 | return pgtable; | |
679 | } | |
680 | ||
681 | /* | |
682 | * set a new huge pmd. We should not be called for updating | |
683 | * an existing pmd entry. That should go via pmd_hugepage_update. | |
684 | */ | |
685 | void set_pmd_at(struct mm_struct *mm, unsigned long addr, | |
686 | pmd_t *pmdp, pmd_t pmd) | |
687 | { | |
688 | #ifdef CONFIG_DEBUG_VM | |
689 | WARN_ON(!pmd_none(*pmdp)); | |
690 | assert_spin_locked(&mm->page_table_lock); | |
691 | WARN_ON(!pmd_trans_huge(pmd)); | |
692 | #endif | |
693 | return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); | |
694 | } | |
695 | ||
696 | void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, | |
697 | pmd_t *pmdp) | |
698 | { | |
699 | pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT); | |
700 | } | |
701 | ||
702 | /* | |
703 | * A linux hugepage PMD was changed and the corresponding hash table entries | |
704 | * neesd to be flushed. | |
705 | */ | |
706 | void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, | |
707 | pmd_t *pmdp) | |
708 | { | |
709 | int ssize, i; | |
710 | unsigned long s_addr; | |
1a527286 | 711 | int max_hpte_count; |
074c2eae AK |
712 | unsigned int psize, valid; |
713 | unsigned char *hpte_slot_array; | |
714 | unsigned long hidx, vpn, vsid, hash, shift, slot; | |
715 | ||
716 | /* | |
717 | * Flush all the hptes mapping this hugepage | |
718 | */ | |
719 | s_addr = addr & HPAGE_PMD_MASK; | |
720 | hpte_slot_array = get_hpte_slot_array(pmdp); | |
721 | /* | |
722 | * IF we try to do a HUGE PTE update after a withdraw is done. | |
723 | * we will find the below NULL. This happens when we do | |
724 | * split_huge_page_pmd | |
725 | */ | |
726 | if (!hpte_slot_array) | |
727 | return; | |
728 | ||
729 | /* get the base page size */ | |
730 | psize = get_slice_psize(mm, s_addr); | |
074c2eae | 731 | |
1a527286 AK |
732 | if (ppc_md.hugepage_invalidate) |
733 | return ppc_md.hugepage_invalidate(mm, hpte_slot_array, | |
734 | s_addr, psize); | |
735 | /* | |
736 | * No bluk hpte removal support, invalidate each entry | |
737 | */ | |
738 | shift = mmu_psize_defs[psize].shift; | |
739 | max_hpte_count = HPAGE_PMD_SIZE >> shift; | |
740 | for (i = 0; i < max_hpte_count; i++) { | |
074c2eae AK |
741 | /* |
742 | * 8 bits per each hpte entries | |
743 | * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit] | |
744 | */ | |
745 | valid = hpte_valid(hpte_slot_array, i); | |
746 | if (!valid) | |
747 | continue; | |
748 | hidx = hpte_hash_index(hpte_slot_array, i); | |
749 | ||
750 | /* get the vpn */ | |
751 | addr = s_addr + (i * (1ul << shift)); | |
752 | if (!is_kernel_addr(addr)) { | |
753 | ssize = user_segment_size(addr); | |
754 | vsid = get_vsid(mm->context.id, addr, ssize); | |
755 | WARN_ON(vsid == 0); | |
756 | } else { | |
757 | vsid = get_kernel_vsid(addr, mmu_kernel_ssize); | |
758 | ssize = mmu_kernel_ssize; | |
759 | } | |
760 | ||
761 | vpn = hpt_vpn(addr, vsid, ssize); | |
762 | hash = hpt_hash(vpn, shift, ssize); | |
763 | if (hidx & _PTEIDX_SECONDARY) | |
764 | hash = ~hash; | |
765 | ||
766 | slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; | |
767 | slot += hidx & _PTEIDX_GROUP_IX; | |
768 | ppc_md.hpte_invalidate(slot, vpn, psize, | |
769 | MMU_PAGE_16M, ssize, 0); | |
770 | } | |
771 | } | |
772 | ||
773 | static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) | |
774 | { | |
775 | pmd_val(pmd) |= pgprot_val(pgprot); | |
776 | return pmd; | |
777 | } | |
778 | ||
779 | pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) | |
780 | { | |
781 | pmd_t pmd; | |
782 | /* | |
783 | * For a valid pte, we would have _PAGE_PRESENT or _PAGE_FILE always | |
784 | * set. We use this to check THP page at pmd level. | |
785 | * leaf pte for huge page, bottom two bits != 00 | |
786 | */ | |
787 | pmd_val(pmd) = pfn << PTE_RPN_SHIFT; | |
788 | pmd_val(pmd) |= _PAGE_THP_HUGE; | |
789 | pmd = pmd_set_protbits(pmd, pgprot); | |
790 | return pmd; | |
791 | } | |
792 | ||
793 | pmd_t mk_pmd(struct page *page, pgprot_t pgprot) | |
794 | { | |
795 | return pfn_pmd(page_to_pfn(page), pgprot); | |
796 | } | |
797 | ||
798 | pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) | |
799 | { | |
800 | ||
801 | pmd_val(pmd) &= _HPAGE_CHG_MASK; | |
802 | pmd = pmd_set_protbits(pmd, newprot); | |
803 | return pmd; | |
804 | } | |
805 | ||
/*
 * This is called at the end of handling a user page fault, when the
 * fault has been handled by updating a HUGE PMD entry in the linux page tables.
 * It is the hook where an HPTE corresponding to the updated linux HUGE PMD
 * entry could be preloaded into the hash table.
 *
 * NOTE: the implementation below is empty; no preload is performed.
 */
void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
			  pmd_t *pmd)
{
	return;
}
817 | ||
818 | pmd_t pmdp_get_and_clear(struct mm_struct *mm, | |
819 | unsigned long addr, pmd_t *pmdp) | |
820 | { | |
821 | pmd_t old_pmd; | |
822 | pgtable_t pgtable; | |
823 | unsigned long old; | |
824 | pgtable_t *pgtable_slot; | |
825 | ||
826 | old = pmd_hugepage_update(mm, addr, pmdp, ~0UL); | |
827 | old_pmd = __pmd(old); | |
828 | /* | |
829 | * We have pmd == none and we are holding page_table_lock. | |
830 | * So we can safely go and clear the pgtable hash | |
831 | * index info. | |
832 | */ | |
833 | pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; | |
834 | pgtable = *pgtable_slot; | |
835 | /* | |
836 | * Let's zero out old valid and hash index details | |
837 | * hash fault look at them. | |
838 | */ | |
839 | memset(pgtable, 0, PTE_FRAG_SIZE); | |
840 | return old_pmd; | |
841 | } | |
437d4964 AK |
842 | |
843 | int has_transparent_hugepage(void) | |
844 | { | |
845 | if (!mmu_has_feature(MMU_FTR_16M_PAGE)) | |
846 | return 0; | |
847 | /* | |
848 | * We support THP only if PMD_SIZE is 16MB. | |
849 | */ | |
850 | if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) | |
851 | return 0; | |
852 | /* | |
853 | * We need to make sure that we support 16MB hugepage in a segement | |
854 | * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE | |
855 | * of 64K. | |
856 | */ | |
857 | /* | |
858 | * If we have 64K HPTE, we will be using that by default | |
859 | */ | |
860 | if (mmu_psize_defs[MMU_PAGE_64K].shift && | |
861 | (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1)) | |
862 | return 0; | |
863 | /* | |
864 | * Ok we only have 4K HPTE | |
865 | */ | |
866 | if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1) | |
867 | return 0; | |
868 | ||
869 | return 1; | |
870 | } | |
074c2eae | 871 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |