/*
 * This file contains ioremap and related functions for 64-bit machines.
 *
 * Derived from arch/ppc64/mm/init.c
 *   Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * Modifications by Paul Mackerras (PowerMac) (paulus@samba.org)
 * and Cort Dougan (PReP) (cort@cs.nmt.edu)
 *   Copyright (C) 1996 Paul Mackerras
 *
 * Derived from "arch/i386/mm/init.c"
 *   Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 *
 * Dave Engebretsen <engebret@us.ibm.com>
 *   Rework for PPC64 port.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/stddef.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/slab.h>

#include <asm/pgalloc.h>
#include <asm/page.h>
#include <asm/prom.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/smp.h>
#include <asm/machdep.h>
#include <asm/tlb.h>
#include <asm/processor.h>
#include <asm/cputable.h>
#include <asm/sections.h>
#include <asm/firmware.h>

#include "mmu_decl.h"

/* Some sanity checking */
#if TASK_SIZE_USER64 > PGTABLE_RANGE
#error TASK_SIZE_USER64 exceeds pagetable range
#endif

#ifdef CONFIG_PPC_STD_MMU_64
#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
#error TASK_SIZE_USER64 exceeds user VSID range
#endif
#endif

unsigned long ioremap_bot = IOREMAP_BASE;

#ifdef CONFIG_PPC_MMU_NOHASH
static void *early_alloc_pgtable(unsigned long size)
{
        void *pt;

        if (init_bootmem_done)
                pt = __alloc_bootmem(size, size, __pa(MAX_DMA_ADDRESS));
        else
                pt = __va(memblock_alloc_base(size, size,
                                              __pa(MAX_DMA_ADDRESS)));
        memset(pt, 0, size);

        return pt;
}
#endif /* CONFIG_PPC_MMU_NOHASH */

/*
 * map_kernel_page currently only called by __ioremap
 * map_kernel_page adds an entry to the ioremap page table
 * and adds an entry to the HPT, possibly bolting it
 */
int map_kernel_page(unsigned long ea, unsigned long pa, int flags)
{
        pgd_t *pgdp;
        pud_t *pudp;
        pmd_t *pmdp;
        pte_t *ptep;

        if (slab_is_available()) {
                pgdp = pgd_offset_k(ea);
                pudp = pud_alloc(&init_mm, pgdp, ea);
                if (!pudp)
                        return -ENOMEM;
                pmdp = pmd_alloc(&init_mm, pudp, ea);
                if (!pmdp)
                        return -ENOMEM;
                ptep = pte_alloc_kernel(pmdp, ea);
                if (!ptep)
                        return -ENOMEM;
                set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
                                                       __pgprot(flags)));
        } else {
#ifdef CONFIG_PPC_MMU_NOHASH
                /* Warning! This will blow up if bootmem is not initialized,
                 * which our ppc64 code is keen to do. We'll need to fix it
                 * and/or be more careful.
                 */
                pgdp = pgd_offset_k(ea);
#ifdef PUD_TABLE_SIZE
                if (pgd_none(*pgdp)) {
                        pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
                        BUG_ON(pudp == NULL);
                        pgd_populate(&init_mm, pgdp, pudp);
                }
#endif /* PUD_TABLE_SIZE */
                pudp = pud_offset(pgdp, ea);
                if (pud_none(*pudp)) {
                        pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
                        BUG_ON(pmdp == NULL);
                        pud_populate(&init_mm, pudp, pmdp);
                }
                pmdp = pmd_offset(pudp, ea);
                if (!pmd_present(*pmdp)) {
                        ptep = early_alloc_pgtable(PAGE_SIZE);
                        BUG_ON(ptep == NULL);
                        pmd_populate_kernel(&init_mm, pmdp, ptep);
                }
                ptep = pte_offset_kernel(pmdp, ea);
                set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
                                                       __pgprot(flags)));
#else /* CONFIG_PPC_MMU_NOHASH */
                /*
                 * If the mm subsystem is not fully up, we cannot create a
                 * linux page table entry for this mapping.  Simply bolt an
                 * entry in the hardware page table.
                 */
                if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
                                      mmu_io_psize, mmu_kernel_ssize)) {
                        printk(KERN_ERR "Failed to do bolted mapping IO "
                               "memory at %016lx !\n", pa);
                        return -ENOMEM;
                }
#endif /* !CONFIG_PPC_MMU_NOHASH */
        }
        return 0;
}


/**
 * __ioremap_at - Low level function to establish the page tables
 *                for an IO mapping
 */
void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
                            unsigned long flags)
{
        unsigned long i;

        /* Make sure we have the base flags */
        if ((flags & _PAGE_PRESENT) == 0)
                flags |= pgprot_val(PAGE_KERNEL);

        /* Non-cacheable page cannot be coherent */
        if (flags & _PAGE_NO_CACHE)
                flags &= ~_PAGE_COHERENT;

        /* We don't support the 4K PFN hack with ioremap */
        if (flags & _PAGE_4K_PFN)
                return NULL;

        WARN_ON(pa & ~PAGE_MASK);
        WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
        WARN_ON(size & ~PAGE_MASK);

        for (i = 0; i < size; i += PAGE_SIZE)
                if (map_kernel_page((unsigned long)ea+i, pa+i, flags))
                        return NULL;

        return (void __iomem *)ea;
}

/**
 * __iounmap_at - Low level function to tear down the page tables
 *                for an IO mapping. This is used for mappings that
 *                are manipulated manually, like partial unmapping of
 *                PCI IOs or ISA space.
 */
void __iounmap_at(void *ea, unsigned long size)
{
        WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
        WARN_ON(size & ~PAGE_MASK);

        unmap_kernel_range((unsigned long)ea, size);
}

void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
                                unsigned long flags, void *caller)
{
        phys_addr_t paligned;
        void __iomem *ret;

        /*
         * Choose an address to map it to.
         * Once the imalloc system is running, we use it.
         * Before that, we map using addresses going
         * up from ioremap_bot.  imalloc will use
         * the addresses from ioremap_bot through
         * IMALLOC_END
         */
        paligned = addr & PAGE_MASK;
        size = PAGE_ALIGN(addr + size) - paligned;

        if ((size == 0) || (paligned == 0))
                return NULL;

        if (mem_init_done) {
                struct vm_struct *area;

                area = __get_vm_area_caller(size, VM_IOREMAP,
                                            ioremap_bot, IOREMAP_END,
                                            caller);
                if (area == NULL)
                        return NULL;

                area->phys_addr = paligned;
                ret = __ioremap_at(paligned, area->addr, size, flags);
                if (!ret)
                        vunmap(area->addr);
        } else {
                ret = __ioremap_at(paligned, (void *)ioremap_bot, size, flags);
                if (ret)
                        ioremap_bot += size;
        }

        if (ret)
                ret += addr & ~PAGE_MASK;
        return ret;
}
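
/*
 * Worked example of the alignment handling in __ioremap_caller() above
 * (illustrative values, assuming 4K pages): for addr = 0x3fe00004 and
 * size = 0x10, paligned = 0x3fe00000 and size becomes
 * PAGE_ALIGN(0x3fe00014) - 0x3fe00000 = 0x1000, so one full page is
 * mapped; the sub-page offset is then added back (ret += 0x4) so the
 * returned cookie points at the exact physical address requested.
 */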

void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
                         unsigned long flags)
{
        return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
}

void __iomem * ioremap(phys_addr_t addr, unsigned long size)
{
        unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED;
        void *caller = __builtin_return_address(0);

        if (ppc_md.ioremap)
                return ppc_md.ioremap(addr, size, flags, caller);
        return __ioremap_caller(addr, size, flags, caller);
}

void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size)
{
        unsigned long flags = _PAGE_NO_CACHE;
        void *caller = __builtin_return_address(0);

        if (ppc_md.ioremap)
                return ppc_md.ioremap(addr, size, flags, caller);
        return __ioremap_caller(addr, size, flags, caller);
}

void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
                            unsigned long flags)
{
        void *caller = __builtin_return_address(0);

        /* writeable implies dirty for kernel addresses */
        if (flags & _PAGE_RW)
                flags |= _PAGE_DIRTY;

        /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
        flags &= ~(_PAGE_USER | _PAGE_EXEC);

#ifdef _PAGE_BAP_SR
        /* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format
         * which means that we just cleared supervisor access... oops ;-) This
         * restores it
         */
        flags |= _PAGE_BAP_SR;
#endif

        if (ppc_md.ioremap)
                return ppc_md.ioremap(addr, size, flags, caller);
        return __ioremap_caller(addr, size, flags, caller);
}


/*
 * Unmap an IO region and remove it from imalloc'd list.
 * Access to IO memory should be serialized by driver.
 */
void __iounmap(volatile void __iomem *token)
{
        void *addr;

        if (!mem_init_done)
                return;

        addr = (void *) ((unsigned long __force)
                         PCI_FIX_ADDR(token) & PAGE_MASK);
        if ((unsigned long)addr < ioremap_bot) {
                printk(KERN_WARNING "Attempt to iounmap early bolted mapping"
                       " at 0x%p\n", addr);
                return;
        }
        vunmap(addr);
}

void iounmap(volatile void __iomem *token)
{
        if (ppc_md.iounmap)
                ppc_md.iounmap(token);
        else
                __iounmap(token);
}

EXPORT_SYMBOL(ioremap);
EXPORT_SYMBOL(ioremap_wc);
EXPORT_SYMBOL(ioremap_prot);
EXPORT_SYMBOL(__ioremap);
EXPORT_SYMBOL(__ioremap_at);
EXPORT_SYMBOL(iounmap);
EXPORT_SYMBOL(__iounmap);
EXPORT_SYMBOL(__iounmap_at);
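
/*
 * A minimal sketch of typical driver usage of the API above (illustrative
 * only, not part of the original file; the resource and register offset
 * are made up):
 *
 *      void __iomem *regs = ioremap(res->start, resource_size(res));
 *      if (!regs)
 *              return -ENOMEM;
 *      val = in_be32(regs + 0x10);
 *      ...
 *      iounmap(regs);
 *
 * ioremap() supplies the cache-inhibited, guarded attributes itself;
 * callers that need other attributes use ioremap_wc() or ioremap_prot().
 */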

/*
 * For hugepage we have pfn in the pmd, we use PTE_RPN_SHIFT bits for flags
 * For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
 */
struct page *pmd_page(pmd_t pmd)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        if (pmd_trans_huge(pmd))
                return pfn_to_page(pmd_pfn(pmd));
#endif
        return virt_to_page(pmd_page_vaddr(pmd));
}

#ifdef CONFIG_PPC_64K_PAGES
static pte_t *get_from_cache(struct mm_struct *mm)
{
        void *pte_frag, *ret;

        spin_lock(&mm->page_table_lock);
        ret = mm->context.pte_frag;
        if (ret) {
                pte_frag = ret + PTE_FRAG_SIZE;
                /*
                 * If we have taken up all the fragments mark PTE page NULL
                 */
                if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
                        pte_frag = NULL;
                mm->context.pte_frag = pte_frag;
        }
        spin_unlock(&mm->page_table_lock);
        return (pte_t *)ret;
}

static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
{
        void *ret = NULL;
        struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
                                       __GFP_REPEAT | __GFP_ZERO);
        if (!page)
                return NULL;
        if (!kernel && !pgtable_page_ctor(page)) {
                __free_page(page);
                return NULL;
        }

        ret = page_address(page);
        spin_lock(&mm->page_table_lock);
        /*
         * If we find pgtable_page set, we return
         * the allocated page with single fragment
         * count.
         */
        if (likely(!mm->context.pte_frag)) {
                atomic_set(&page->_count, PTE_FRAG_NR);
                mm->context.pte_frag = ret + PTE_FRAG_SIZE;
        }
        spin_unlock(&mm->page_table_lock);

        return (pte_t *)ret;
}
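
/*
 * A note on the fragment arithmetic above (assuming PTE_FRAG_NR ==
 * PAGE_SIZE / PTE_FRAG_SIZE, i.e. 16 fragments of 4K in a 64K page):
 * __alloc_for_cache() sets the page count to PTE_FRAG_NR and publishes
 * ret + PTE_FRAG_SIZE as the next free fragment.  get_from_cache() then
 * hands out consecutive PTE_FRAG_SIZE slices; once the next-fragment
 * pointer wraps to a page boundary ((pte_frag & ~PAGE_MASK) == 0) the
 * page is fully carved up and context.pte_frag is reset to NULL so the
 * next allocation grabs a fresh page.
 */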

pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
{
        pte_t *pte;

        pte = get_from_cache(mm);
        if (pte)
                return pte;

        return __alloc_for_cache(mm, kernel);
}

void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
{
        struct page *page = virt_to_page(table);
        if (put_page_testzero(page)) {
                if (!kernel)
                        pgtable_page_dtor(page);
                free_hot_cold_page(page, 0);
        }
}

#ifdef CONFIG_SMP
static void page_table_free_rcu(void *table)
{
        struct page *page = virt_to_page(table);
        if (put_page_testzero(page)) {
                pgtable_page_dtor(page);
                free_hot_cold_page(page, 0);
        }
}

void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
{
        unsigned long pgf = (unsigned long)table;

        BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
        pgf |= shift;
        tlb_remove_table(tlb, (void *)pgf);
}

void __tlb_remove_table(void *_table)
{
        void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
        unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;

        if (!shift)
                /* PTE page needs special handling */
                page_table_free_rcu(table);
        else {
                BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
                kmem_cache_free(PGT_CACHE(shift), table);
        }
}
#else
void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
{
        if (!shift) {
                /* PTE page needs special handling */
                struct page *page = virt_to_page(table);
                if (put_page_testzero(page)) {
                        pgtable_page_dtor(page);
                        free_hot_cold_page(page, 0);
                }
        } else {
                BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
                kmem_cache_free(PGT_CACHE(shift), table);
        }
}
#endif
#endif /* CONFIG_PPC_64K_PAGES */
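
/*
 * In the SMP pgtable_free_tlb() above, the table's index size (shift) is
 * smuggled into the low bits of the table pointer: page table pages are
 * aligned well beyond MAX_PGTABLE_INDEX_SIZE, so those bits are known to
 * be zero.  For example (illustrative address), freeing a table at
 * 0xc000000012340000 with shift 7 passes 0xc000000012340007 to
 * tlb_remove_table(); __tlb_remove_table() masks the low bits back off
 * to recover both the pointer and the shift.
 */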

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

/*
 * This is called when relaxing access to a hugepage. It's also called in the page
 * fault path when we don't hit any of the major fault cases, ie, a minor
 * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
 * handled those two for us, we additionally deal with missing execute
 * permission here on some processors
 */
int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
                          pmd_t *pmdp, pmd_t entry, int dirty)
{
        int changed;
#ifdef CONFIG_DEBUG_VM
        WARN_ON(!pmd_trans_huge(*pmdp));
        assert_spin_locked(&vma->vm_mm->page_table_lock);
#endif
        changed = !pmd_same(*(pmdp), entry);
        if (changed) {
                __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
                /*
                 * Since we are not supporting SW TLB systems, we don't
                 * have any thing similar to flush_tlb_page_nohash()
                 */
        }
        return changed;
}

unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
                                  pmd_t *pmdp, unsigned long clr)
{

        unsigned long old, tmp;

#ifdef CONFIG_DEBUG_VM
        WARN_ON(!pmd_trans_huge(*pmdp));
        assert_spin_locked(&mm->page_table_lock);
#endif

#ifdef PTE_ATOMIC_UPDATES
        __asm__ __volatile__(
        "1:     ldarx   %0,0,%3\n\
                andi.   %1,%0,%6\n\
                bne-    1b \n\
                andc    %1,%0,%4 \n\
                stdcx.  %1,0,%3 \n\
                bne-    1b"
        : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
        : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY)
        : "cc" );
#else
        old = pmd_val(*pmdp);
        *pmdp = __pmd(old & ~clr);
#endif
        if (old & _PAGE_HASHPTE)
                hpte_do_hugepage_flush(mm, addr, pmdp);
        return old;
}

pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
                       pmd_t *pmdp)
{
        pmd_t pmd;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        if (pmd_trans_huge(*pmdp)) {
                pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
        } else {
                /*
                 * khugepaged calls this for normal pmd
                 */
                pmd = *pmdp;
                pmd_clear(pmdp);
                /*
                 * Wait for all pending hash_page to finish. This is needed
                 * in case of subpage collapse.  When we collapse normal pages
                 * to hugepage, we first clear the pmd, then invalidate all
                 * the PTE entries. The assumption here is that any low level
                 * page fault will see a none pmd and take the slow path that
                 * will wait on mmap_sem. But we could very well be in a
                 * hash_page with local ptep pointer value. Such a hash page
                 * can result in adding new HPTE entries for normal subpages.
                 * That means we could be modifying the page content as we
                 * copy them to a huge page. So wait for parallel hash_page
                 * to finish before invalidating HPTE entries. We can do this
                 * by sending an IPI to all the cpus and executing a dummy
                 * function there.
                 */
                kick_all_cpus_sync();
                /*
                 * Now invalidate the hpte entries in the range
                 * covered by pmd. This makes sure we take a
                 * fault and will find the pmd as none, which will
                 * result in a major fault which takes mmap_sem and
                 * hence wait for collapse to complete. Without this
                 * the __collapse_huge_page_copy can result in copying
                 * the old content.
                 */
                flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
        }
        return pmd;
}

int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long address, pmd_t *pmdp)
{
        return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
}

/*
 * We currently remove entries from the hashtable regardless of whether
 * the entry was young or dirty. The generic routines only flush if the
 * entry was young or dirty which is not good enough.
 *
 * We should be more intelligent about this but for the moment we override
 * these functions and force a tlb flush unconditionally
 */
int pmdp_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pmd_t *pmdp)
{
        return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
}

/*
 * We mark the pmd splitting and invalidate all the hpte
 * entries for this hugepage.
 */
void pmdp_splitting_flush(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp)
{
        unsigned long old, tmp;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

#ifdef CONFIG_DEBUG_VM
        WARN_ON(!pmd_trans_huge(*pmdp));
        assert_spin_locked(&vma->vm_mm->page_table_lock);
#endif

#ifdef PTE_ATOMIC_UPDATES

        __asm__ __volatile__(
        "1:     ldarx   %0,0,%3\n\
                andi.   %1,%0,%6\n\
                bne-    1b \n\
                ori     %1,%0,%4 \n\
                stdcx.  %1,0,%3 \n\
                bne-    1b"
        : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
        : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
        : "cc" );
#else
        old = pmd_val(*pmdp);
        *pmdp = __pmd(old | _PAGE_SPLITTING);
#endif
        /*
         * If we didn't have the splitting flag set, go and flush the
         * HPTE entries.
         */
        if (!(old & _PAGE_SPLITTING)) {
                /* We need to flush the hpte */
                if (old & _PAGE_HASHPTE)
                        hpte_do_hugepage_flush(vma->vm_mm, address, pmdp);
        }
}

/*
 * We want to put the pgtable in pmd and use pgtable for tracking
 * the base page size hptes
 */
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                pgtable_t pgtable)
{
        pgtable_t *pgtable_slot;
        assert_spin_locked(&mm->page_table_lock);
        /*
         * we store the pgtable in the second half of PMD
         */
        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
        *pgtable_slot = pgtable;
        /*
         * expose the deposited pgtable to other cpus.
         * before we set the hugepage PTE at pmd level
         * hash fault code looks at the deposited pgtable
         * to store hash index values.
         */
        smp_wmb();
}
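
/*
 * Layout note, derived from the deposit/withdraw code (and assuming pmd
 * entries and pgtable pointers are both pointer sized): the deposited
 * pgtable slot lives PTRS_PER_PMD entries past the pmd entry it belongs
 * to, i.e. the PMD table's second half mirrors its first half, with
 * entry i of the table owning slot PTRS_PER_PMD + i.
 */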

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
        pgtable_t pgtable;
        pgtable_t *pgtable_slot;

        assert_spin_locked(&mm->page_table_lock);
        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
        pgtable = *pgtable_slot;
        /*
         * Once we withdraw, mark the entry NULL.
         */
        *pgtable_slot = NULL;
        /*
         * We store HPTE information in the deposited PTE fragment.
         * zero out the content on withdraw.
         */
        memset(pgtable, 0, PTE_FRAG_SIZE);
        return pgtable;
}

/*
 * set a new huge pmd. We should not be called for updating
 * an existing pmd entry. That should go via pmd_hugepage_update.
 */
void set_pmd_at(struct mm_struct *mm, unsigned long addr,
                pmd_t *pmdp, pmd_t pmd)
{
#ifdef CONFIG_DEBUG_VM
        WARN_ON(!pmd_none(*pmdp));
        assert_spin_locked(&mm->page_table_lock);
        WARN_ON(!pmd_trans_huge(pmd));
#endif
        return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
}

void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                     pmd_t *pmdp)
{
        pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT);
}

/*
 * A linux hugepage PMD was changed and the corresponding hash table entries
 * need to be flushed.
 */
void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
                            pmd_t *pmdp)
{
        int ssize, i;
        unsigned long s_addr;
        int max_hpte_count;
        unsigned int psize, valid;
        unsigned char *hpte_slot_array;
        unsigned long hidx, vpn, vsid, hash, shift, slot;

        /*
         * Flush all the hptes mapping this hugepage
         */
        s_addr = addr & HPAGE_PMD_MASK;
        hpte_slot_array = get_hpte_slot_array(pmdp);
        /*
         * If we try to do a HUGE PTE update after a withdraw is done,
         * we will find the below NULL. This happens when we do
         * split_huge_page_pmd
         */
        if (!hpte_slot_array)
                return;

        /* get the base page size */
        psize = get_slice_psize(mm, s_addr);

        if (ppc_md.hugepage_invalidate)
                return ppc_md.hugepage_invalidate(mm, hpte_slot_array,
                                                  s_addr, psize);
        /*
         * No bulk hpte removal support, invalidate each entry
         */
        shift = mmu_psize_defs[psize].shift;
        max_hpte_count = HPAGE_PMD_SIZE >> shift;
        for (i = 0; i < max_hpte_count; i++) {
                /*
                 * 8 bits per each hpte entry
                 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
                 */
                valid = hpte_valid(hpte_slot_array, i);
                if (!valid)
                        continue;
                hidx = hpte_hash_index(hpte_slot_array, i);

                /* get the vpn */
                addr = s_addr + (i * (1ul << shift));
                if (!is_kernel_addr(addr)) {
                        ssize = user_segment_size(addr);
                        vsid = get_vsid(mm->context.id, addr, ssize);
                        WARN_ON(vsid == 0);
                } else {
                        vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
                        ssize = mmu_kernel_ssize;
                }

                vpn = hpt_vpn(addr, vsid, ssize);
                hash = hpt_hash(vpn, shift, ssize);
                if (hidx & _PTEIDX_SECONDARY)
                        hash = ~hash;

                slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
                slot += hidx & _PTEIDX_GROUP_IX;
                ppc_md.hpte_invalidate(slot, vpn, psize,
                                       MMU_PAGE_16M, ssize, 0);
        }
}
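
/*
 * Worked example of the hpte_slot_array encoding used above, assuming the
 * per-entry byte layout described in the loop comment (the value itself
 * is made up): a byte of 0b00001011 decodes as valid = 1 (bit 0) and
 * hidx = 0b0101 = 5, i.e. _PTEIDX_SECONDARY (0x8) clear and a group
 * index of 5, so the HPTE sits in slot 5 of its primary hash group.
 */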

static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
{
        pmd_val(pmd) |= pgprot_val(pgprot);
        return pmd;
}

pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
{
        pmd_t pmd;
        /*
         * For a valid pte, we would have _PAGE_PRESENT or _PAGE_FILE always
         * set. We use this to check THP page at pmd level.
         * leaf pte for huge page, bottom two bits != 00
         */
        pmd_val(pmd) = pfn << PTE_RPN_SHIFT;
        pmd_val(pmd) |= _PAGE_THP_HUGE;
        pmd = pmd_set_protbits(pmd, pgprot);
        return pmd;
}

pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
{
        return pfn_pmd(page_to_pfn(page), pgprot);
}

pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{

        pmd_val(pmd) &= _HPAGE_CHG_MASK;
        pmd = pmd_set_protbits(pmd, newprot);
        return pmd;
}

/*
 * This is called at the end of handling a user page fault, when the
 * fault has been handled by updating a HUGE PMD entry in the linux page tables.
 * We use it to preload an HPTE into the hash table corresponding to
 * the updated linux HUGE PMD entry.
 */
void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
                          pmd_t *pmd)
{
        return;
}

pmd_t pmdp_get_and_clear(struct mm_struct *mm,
                         unsigned long addr, pmd_t *pmdp)
{
        pmd_t old_pmd;
        pgtable_t pgtable;
        unsigned long old;
        pgtable_t *pgtable_slot;

        old = pmd_hugepage_update(mm, addr, pmdp, ~0UL);
        old_pmd = __pmd(old);
        /*
         * We have pmd == none and we are holding page_table_lock.
         * So we can safely go and clear the pgtable hash
         * index info.
         */
        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
        pgtable = *pgtable_slot;
        /*
         * Let's zero out old valid and hash index details;
         * the hash fault code looks at them.
         */
        memset(pgtable, 0, PTE_FRAG_SIZE);
        return old_pmd;
}

int has_transparent_hugepage(void)
{
        if (!mmu_has_feature(MMU_FTR_16M_PAGE))
                return 0;
        /*
         * We support THP only if PMD_SIZE is 16MB.
         */
        if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
                return 0;
        /*
         * We need to make sure that we support 16MB hugepage in a segment
         * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
         * of 64K.
         */
        /*
         * If we have 64K HPTE, we will be using that by default
         */
        if (mmu_psize_defs[MMU_PAGE_64K].shift &&
            (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
                return 0;
        /*
         * Ok we only have 4K HPTE
         */
        if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
                return 0;

        return 1;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */