]>
Commit | Line | Data |
---|---|---|
9f4c815c IM |
1 | /* |
2 | * Copyright 2002 Andi Kleen, SuSE Labs. | |
1da177e4 | 3 | * Thanks to Ben LaHaise for precious feedback. |
9f4c815c | 4 | */ |
1da177e4 | 5 | #include <linux/highmem.h> |
8192206d | 6 | #include <linux/bootmem.h> |
1da177e4 | 7 | #include <linux/module.h> |
9f4c815c | 8 | #include <linux/sched.h> |
1da177e4 | 9 | #include <linux/slab.h> |
9f4c815c | 10 | #include <linux/mm.h> |
76ebd054 | 11 | #include <linux/interrupt.h> |
9f4c815c | 12 | |
950f9d95 | 13 | #include <asm/e820.h> |
1da177e4 LT |
14 | #include <asm/processor.h> |
15 | #include <asm/tlbflush.h> | |
f8af095d | 16 | #include <asm/sections.h> |
9f4c815c IM |
17 | #include <asm/uaccess.h> |
18 | #include <asm/pgalloc.h> | |
c31c7d48 | 19 | #include <asm/proto.h> |
1da177e4 | 20 | |
9df84993 IM |
21 | /* |
22 | * The current flushing context - we pass it instead of 5 arguments: | |
23 | */ | |
72e458df TG |
24 | struct cpa_data { |
25 | unsigned long vaddr; | |
72e458df TG |
26 | pgprot_t mask_set; |
27 | pgprot_t mask_clr; | |
65e074df | 28 | int numpages; |
f4ae5da0 | 29 | int flushtlb; |
c31c7d48 | 30 | unsigned long pfn; |
72e458df TG |
31 | }; |
32 | ||
#ifdef CONFIG_X86_64

/* First pfn of the kernel text in the direct (physical) map. */
static inline unsigned long highmap_start_pfn(void)
{
	return __pa(_text) >> PAGE_SHIFT;
}

/* One past the last pfn of the kernel image, rounded up to a PMD boundary. */
static inline unsigned long highmap_end_pfn(void)
{
	return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
}

#endif
/* Half-open interval membership test: true iff addr is in [start, end). */
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
	return start <= addr && addr < end;
}
d7c8f21a TG |
53 | /* |
54 | * Flushing functions | |
55 | */ | |
cd8ddf1a | 56 | |
cd8ddf1a TG |
57 | /** |
58 | * clflush_cache_range - flush a cache range with clflush | |
59 | * @addr: virtual start address | |
60 | * @size: number of bytes to flush | |
61 | * | |
62 | * clflush is an unordered instruction which needs fencing with mfence | |
63 | * to avoid ordering issues. | |
64 | */ | |
4c61afcd | 65 | void clflush_cache_range(void *vaddr, unsigned int size) |
d7c8f21a | 66 | { |
4c61afcd | 67 | void *vend = vaddr + size - 1; |
d7c8f21a | 68 | |
cd8ddf1a | 69 | mb(); |
4c61afcd IM |
70 | |
71 | for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size) | |
72 | clflush(vaddr); | |
73 | /* | |
74 | * Flush any possible final partial cacheline: | |
75 | */ | |
76 | clflush(vend); | |
77 | ||
cd8ddf1a | 78 | mb(); |
d7c8f21a TG |
79 | } |
80 | ||
af1e6844 | 81 | static void __cpa_flush_all(void *arg) |
d7c8f21a | 82 | { |
6bb8383b AK |
83 | unsigned long cache = (unsigned long)arg; |
84 | ||
d7c8f21a TG |
85 | /* |
86 | * Flush all to work around Errata in early athlons regarding | |
87 | * large page flushing. | |
88 | */ | |
89 | __flush_tlb_all(); | |
90 | ||
6bb8383b | 91 | if (cache && boot_cpu_data.x86_model >= 4) |
d7c8f21a TG |
92 | wbinvd(); |
93 | } | |
94 | ||
6bb8383b | 95 | static void cpa_flush_all(unsigned long cache) |
d7c8f21a TG |
96 | { |
97 | BUG_ON(irqs_disabled()); | |
98 | ||
6bb8383b | 99 | on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1); |
d7c8f21a TG |
100 | } |
101 | ||
/* IPI callback for the range flush: a full TLB flush on each CPU. */
static void __cpa_flush_range(void *arg)
{
	/*
	 * We could optimize that further and do individual per page
	 * tlb invalidates for a low number of pages. Caveat: we must
	 * flush the high aliases on 64bit as well.
	 */
	__flush_tlb_all();
}
6bb8383b | 112 | static void cpa_flush_range(unsigned long start, int numpages, int cache) |
57a6a46a | 113 | { |
4c61afcd IM |
114 | unsigned int i, level; |
115 | unsigned long addr; | |
116 | ||
57a6a46a | 117 | BUG_ON(irqs_disabled()); |
4c61afcd | 118 | WARN_ON(PAGE_ALIGN(start) != start); |
57a6a46a | 119 | |
3b233e52 | 120 | on_each_cpu(__cpa_flush_range, NULL, 1, 1); |
57a6a46a | 121 | |
6bb8383b AK |
122 | if (!cache) |
123 | return; | |
124 | ||
3b233e52 TG |
125 | /* |
126 | * We only need to flush on one CPU, | |
127 | * clflush is a MESI-coherent instruction that | |
128 | * will cause all other CPUs to flush the same | |
129 | * cachelines: | |
130 | */ | |
4c61afcd IM |
131 | for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) { |
132 | pte_t *pte = lookup_address(addr, &level); | |
133 | ||
134 | /* | |
135 | * Only flush present addresses: | |
136 | */ | |
7bfb72e8 | 137 | if (pte && (pte_val(*pte) & _PAGE_PRESENT)) |
4c61afcd IM |
138 | clflush_cache_range((void *) addr, PAGE_SIZE); |
139 | } | |
57a6a46a TG |
140 | } |
141 | ||
ed724be6 AV |
142 | /* |
143 | * Certain areas of memory on x86 require very specific protection flags, | |
144 | * for example the BIOS area or kernel text. Callers don't always get this | |
145 | * right (again, ioremap() on BIOS memory is not uncommon) so this function | |
146 | * checks and fixes these known static required protection bits. | |
147 | */ | |
c31c7d48 TG |
148 | static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, |
149 | unsigned long pfn) | |
ed724be6 AV |
150 | { |
151 | pgprot_t forbidden = __pgprot(0); | |
152 | ||
687c4825 | 153 | /* |
ed724be6 AV |
154 | * The BIOS area between 640k and 1Mb needs to be executable for |
155 | * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. | |
687c4825 | 156 | */ |
c31c7d48 | 157 | if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) |
ed724be6 AV |
158 | pgprot_val(forbidden) |= _PAGE_NX; |
159 | ||
160 | /* | |
161 | * The kernel text needs to be executable for obvious reasons | |
c31c7d48 TG |
162 | * Does not cover __inittext since that is gone later on. On |
163 | * 64bit we do not enforce !NX on the low mapping | |
ed724be6 AV |
164 | */ |
165 | if (within(address, (unsigned long)_text, (unsigned long)_etext)) | |
166 | pgprot_val(forbidden) |= _PAGE_NX; | |
cc0f21bb | 167 | |
cc0f21bb | 168 | /* |
c31c7d48 TG |
169 | * The .rodata section needs to be read-only. Using the pfn |
170 | * catches all aliases. | |
cc0f21bb | 171 | */ |
c31c7d48 TG |
172 | if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, |
173 | __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) | |
cc0f21bb | 174 | pgprot_val(forbidden) |= _PAGE_RW; |
ed724be6 AV |
175 | |
176 | prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); | |
687c4825 IM |
177 | |
178 | return prot; | |
179 | } | |
180 | ||
9a14aefc TG |
181 | /* |
182 | * Lookup the page table entry for a virtual address. Return a pointer | |
183 | * to the entry and the level of the mapping. | |
184 | * | |
185 | * Note: We return pud and pmd either when the entry is marked large | |
186 | * or when the present bit is not set. Otherwise we would return a | |
187 | * pointer to a nonexisting mapping. | |
188 | */ | |
da7bfc50 | 189 | pte_t *lookup_address(unsigned long address, unsigned int *level) |
9f4c815c | 190 | { |
1da177e4 LT |
191 | pgd_t *pgd = pgd_offset_k(address); |
192 | pud_t *pud; | |
193 | pmd_t *pmd; | |
9f4c815c | 194 | |
30551bb3 TG |
195 | *level = PG_LEVEL_NONE; |
196 | ||
1da177e4 LT |
197 | if (pgd_none(*pgd)) |
198 | return NULL; | |
9df84993 | 199 | |
1da177e4 LT |
200 | pud = pud_offset(pgd, address); |
201 | if (pud_none(*pud)) | |
202 | return NULL; | |
c2f71ee2 AK |
203 | |
204 | *level = PG_LEVEL_1G; | |
205 | if (pud_large(*pud) || !pud_present(*pud)) | |
206 | return (pte_t *)pud; | |
207 | ||
1da177e4 LT |
208 | pmd = pmd_offset(pud, address); |
209 | if (pmd_none(*pmd)) | |
210 | return NULL; | |
30551bb3 TG |
211 | |
212 | *level = PG_LEVEL_2M; | |
9a14aefc | 213 | if (pmd_large(*pmd) || !pmd_present(*pmd)) |
1da177e4 | 214 | return (pte_t *)pmd; |
1da177e4 | 215 | |
30551bb3 | 216 | *level = PG_LEVEL_4K; |
9df84993 | 217 | |
9f4c815c IM |
218 | return pte_offset_kernel(pmd, address); |
219 | } | |
220 | ||
9df84993 IM |
221 | /* |
222 | * Set the new pmd in all the pgds we know about: | |
223 | */ | |
9a3dc780 | 224 | static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) |
9f4c815c | 225 | { |
9f4c815c IM |
226 | /* change init_mm */ |
227 | set_pte_atomic(kpte, pte); | |
44af6c41 | 228 | #ifdef CONFIG_X86_32 |
e4b71dcf | 229 | if (!SHARED_KERNEL_PMD) { |
44af6c41 IM |
230 | struct page *page; |
231 | ||
e3ed910d | 232 | list_for_each_entry(page, &pgd_list, lru) { |
44af6c41 IM |
233 | pgd_t *pgd; |
234 | pud_t *pud; | |
235 | pmd_t *pmd; | |
236 | ||
237 | pgd = (pgd_t *)page_address(page) + pgd_index(address); | |
238 | pud = pud_offset(pgd, address); | |
239 | pmd = pmd_offset(pud, address); | |
240 | set_pte_atomic((pte_t *)pmd, pte); | |
241 | } | |
1da177e4 | 242 | } |
44af6c41 | 243 | #endif |
1da177e4 LT |
244 | } |
245 | ||
9df84993 IM |
246 | static int |
247 | try_preserve_large_page(pte_t *kpte, unsigned long address, | |
248 | struct cpa_data *cpa) | |
65e074df | 249 | { |
c31c7d48 | 250 | unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; |
65e074df TG |
251 | pte_t new_pte, old_pte, *tmp; |
252 | pgprot_t old_prot, new_prot; | |
fac84939 | 253 | int i, do_split = 1; |
da7bfc50 | 254 | unsigned int level; |
65e074df TG |
255 | |
256 | spin_lock_irqsave(&pgd_lock, flags); | |
257 | /* | |
258 | * Check for races, another CPU might have split this page | |
259 | * up already: | |
260 | */ | |
261 | tmp = lookup_address(address, &level); | |
262 | if (tmp != kpte) | |
263 | goto out_unlock; | |
264 | ||
265 | switch (level) { | |
266 | case PG_LEVEL_2M: | |
31422c51 AK |
267 | psize = PMD_PAGE_SIZE; |
268 | pmask = PMD_PAGE_MASK; | |
65e074df | 269 | break; |
f07333fd | 270 | #ifdef CONFIG_X86_64 |
65e074df | 271 | case PG_LEVEL_1G: |
5d3c8b21 AK |
272 | psize = PUD_PAGE_SIZE; |
273 | pmask = PUD_PAGE_MASK; | |
f07333fd AK |
274 | break; |
275 | #endif | |
65e074df | 276 | default: |
beaff633 | 277 | do_split = -EINVAL; |
65e074df TG |
278 | goto out_unlock; |
279 | } | |
280 | ||
281 | /* | |
282 | * Calculate the number of pages, which fit into this large | |
283 | * page starting at address: | |
284 | */ | |
285 | nextpage_addr = (address + psize) & pmask; | |
286 | numpages = (nextpage_addr - address) >> PAGE_SHIFT; | |
287 | if (numpages < cpa->numpages) | |
288 | cpa->numpages = numpages; | |
289 | ||
290 | /* | |
291 | * We are safe now. Check whether the new pgprot is the same: | |
292 | */ | |
293 | old_pte = *kpte; | |
294 | old_prot = new_prot = pte_pgprot(old_pte); | |
295 | ||
296 | pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); | |
297 | pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); | |
c31c7d48 TG |
298 | |
299 | /* | |
300 | * old_pte points to the large page base address. So we need | |
301 | * to add the offset of the virtual address: | |
302 | */ | |
303 | pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); | |
304 | cpa->pfn = pfn; | |
305 | ||
306 | new_prot = static_protections(new_prot, address, pfn); | |
65e074df | 307 | |
fac84939 TG |
308 | /* |
309 | * We need to check the full range, whether | |
310 | * static_protection() requires a different pgprot for one of | |
311 | * the pages in the range we try to preserve: | |
312 | */ | |
313 | addr = address + PAGE_SIZE; | |
c31c7d48 TG |
314 | pfn++; |
315 | for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) { | |
316 | pgprot_t chk_prot = static_protections(new_prot, addr, pfn); | |
fac84939 TG |
317 | |
318 | if (pgprot_val(chk_prot) != pgprot_val(new_prot)) | |
319 | goto out_unlock; | |
320 | } | |
321 | ||
65e074df TG |
322 | /* |
323 | * If there are no changes, return. maxpages has been updated | |
324 | * above: | |
325 | */ | |
326 | if (pgprot_val(new_prot) == pgprot_val(old_prot)) { | |
beaff633 | 327 | do_split = 0; |
65e074df TG |
328 | goto out_unlock; |
329 | } | |
330 | ||
331 | /* | |
332 | * We need to change the attributes. Check, whether we can | |
333 | * change the large page in one go. We request a split, when | |
334 | * the address is not aligned and the number of pages is | |
335 | * smaller than the number of pages in the large page. Note | |
336 | * that we limited the number of possible pages already to | |
337 | * the number of pages in the large page. | |
338 | */ | |
339 | if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { | |
340 | /* | |
341 | * The address is aligned and the number of pages | |
342 | * covers the full page. | |
343 | */ | |
344 | new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); | |
345 | __set_pmd_pte(kpte, address, new_pte); | |
346 | cpa->flushtlb = 1; | |
beaff633 | 347 | do_split = 0; |
65e074df TG |
348 | } |
349 | ||
350 | out_unlock: | |
351 | spin_unlock_irqrestore(&pgd_lock, flags); | |
9df84993 | 352 | |
beaff633 | 353 | return do_split; |
65e074df TG |
354 | } |
355 | ||
76ebd054 TG |
356 | static LIST_HEAD(page_pool); |
357 | static unsigned long pool_size, pool_pages, pool_low; | |
358 | static unsigned long pool_used, pool_failed, pool_refill; | |
359 | ||
360 | static void cpa_fill_pool(void) | |
361 | { | |
362 | struct page *p; | |
363 | gfp_t gfp = GFP_KERNEL; | |
364 | ||
365 | /* Do not allocate from interrupt context */ | |
366 | if (in_irq() || irqs_disabled()) | |
367 | return; | |
368 | /* | |
369 | * Check unlocked. I does not matter when we have one more | |
370 | * page in the pool. The bit lock avoids recursive pool | |
371 | * allocations: | |
372 | */ | |
373 | if (pool_pages >= pool_size || test_and_set_bit_lock(0, &pool_refill)) | |
374 | return; | |
375 | ||
376 | #ifdef CONFIG_DEBUG_PAGEALLOC | |
377 | /* | |
378 | * We could do: | |
379 | * gfp = in_atomic() ? GFP_ATOMIC : GFP_KERNEL; | |
380 | * but this fails on !PREEMPT kernels | |
381 | */ | |
382 | gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; | |
383 | #endif | |
384 | ||
385 | while (pool_pages < pool_size) { | |
386 | p = alloc_pages(gfp, 0); | |
387 | if (!p) { | |
388 | pool_failed++; | |
389 | break; | |
390 | } | |
391 | spin_lock_irq(&pgd_lock); | |
392 | list_add(&p->lru, &page_pool); | |
393 | pool_pages++; | |
394 | spin_unlock_irq(&pgd_lock); | |
395 | } | |
396 | clear_bit_unlock(0, &pool_refill); | |
397 | } | |
398 | ||
399 | #define SHIFT_MB (20 - PAGE_SHIFT) | |
400 | #define ROUND_MB_GB ((1 << 10) - 1) | |
401 | #define SHIFT_MB_GB 10 | |
402 | #define POOL_PAGES_PER_GB 16 | |
403 | ||
404 | void __init cpa_init(void) | |
405 | { | |
406 | struct sysinfo si; | |
407 | unsigned long gb; | |
408 | ||
409 | si_meminfo(&si); | |
410 | /* | |
411 | * Calculate the number of pool pages: | |
412 | * | |
413 | * Convert totalram (nr of pages) to MiB and round to the next | |
414 | * GiB. Shift MiB to Gib and multiply the result by | |
415 | * POOL_PAGES_PER_GB: | |
416 | */ | |
417 | gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; | |
418 | pool_size = POOL_PAGES_PER_GB * gb; | |
419 | pool_low = pool_size; | |
420 | ||
421 | cpa_fill_pool(); | |
422 | printk(KERN_DEBUG | |
423 | "CPA: page pool initialized %lu of %lu pages preallocated\n", | |
424 | pool_pages, pool_size); | |
425 | } | |
426 | ||
7afe15b9 | 427 | static int split_large_page(pte_t *kpte, unsigned long address) |
bb5c2dbd | 428 | { |
7b610eec | 429 | unsigned long flags, pfn, pfninc = 1; |
9df84993 | 430 | unsigned int i, level; |
bb5c2dbd | 431 | pte_t *pbase, *tmp; |
9df84993 | 432 | pgprot_t ref_prot; |
bb5c2dbd IM |
433 | struct page *base; |
434 | ||
eb5b5f02 TG |
435 | /* |
436 | * Get a page from the pool. The pool list is protected by the | |
437 | * pgd_lock, which we have to take anyway for the split | |
438 | * operation: | |
439 | */ | |
440 | spin_lock_irqsave(&pgd_lock, flags); | |
441 | if (list_empty(&page_pool)) { | |
442 | spin_unlock_irqrestore(&pgd_lock, flags); | |
bb5c2dbd | 443 | return -ENOMEM; |
eb5b5f02 TG |
444 | } |
445 | ||
446 | base = list_first_entry(&page_pool, struct page, lru); | |
447 | list_del(&base->lru); | |
448 | pool_pages--; | |
449 | ||
450 | if (pool_pages < pool_low) | |
451 | pool_low = pool_pages; | |
bb5c2dbd | 452 | |
bb5c2dbd IM |
453 | /* |
454 | * Check for races, another CPU might have split this page | |
455 | * up for us already: | |
456 | */ | |
457 | tmp = lookup_address(address, &level); | |
6ce9fc17 | 458 | if (tmp != kpte) |
bb5c2dbd IM |
459 | goto out_unlock; |
460 | ||
bb5c2dbd | 461 | pbase = (pte_t *)page_address(base); |
44af6c41 | 462 | #ifdef CONFIG_X86_32 |
bb5c2dbd | 463 | paravirt_alloc_pt(&init_mm, page_to_pfn(base)); |
44af6c41 | 464 | #endif |
07cf89c0 | 465 | ref_prot = pte_pgprot(pte_clrhuge(*kpte)); |
bb5c2dbd | 466 | |
f07333fd AK |
467 | #ifdef CONFIG_X86_64 |
468 | if (level == PG_LEVEL_1G) { | |
469 | pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; | |
470 | pgprot_val(ref_prot) |= _PAGE_PSE; | |
f07333fd AK |
471 | } |
472 | #endif | |
473 | ||
63c1dcf4 TG |
474 | /* |
475 | * Get the target pfn from the original entry: | |
476 | */ | |
477 | pfn = pte_pfn(*kpte); | |
f07333fd | 478 | for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) |
63c1dcf4 | 479 | set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); |
bb5c2dbd IM |
480 | |
481 | /* | |
07cf89c0 | 482 | * Install the new, split up pagetable. Important details here: |
4c881ca1 HY |
483 | * |
484 | * On Intel the NX bit of all levels must be cleared to make a | |
485 | * page executable. See section 4.13.2 of Intel 64 and IA-32 | |
486 | * Architectures Software Developer's Manual). | |
07cf89c0 TG |
487 | * |
488 | * Mark the entry present. The current mapping might be | |
489 | * set to not present, which we preserved above. | |
bb5c2dbd | 490 | */ |
4c881ca1 | 491 | ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte))); |
07cf89c0 | 492 | pgprot_val(ref_prot) |= _PAGE_PRESENT; |
9a3dc780 | 493 | __set_pmd_pte(kpte, address, mk_pte(base, ref_prot)); |
bb5c2dbd IM |
494 | base = NULL; |
495 | ||
496 | out_unlock: | |
eb5b5f02 TG |
497 | /* |
498 | * If we dropped out via the lookup_address check under | |
499 | * pgd_lock then stick the page back into the pool: | |
500 | */ | |
501 | if (base) { | |
502 | list_add(&base->lru, &page_pool); | |
503 | pool_pages++; | |
504 | } else | |
505 | pool_used++; | |
9a3dc780 | 506 | spin_unlock_irqrestore(&pgd_lock, flags); |
bb5c2dbd | 507 | |
bb5c2dbd IM |
508 | return 0; |
509 | } | |
510 | ||
c31c7d48 | 511 | static int __change_page_attr(struct cpa_data *cpa, int primary) |
9f4c815c | 512 | { |
c31c7d48 | 513 | unsigned long address = cpa->vaddr; |
da7bfc50 HH |
514 | int do_split, err; |
515 | unsigned int level; | |
c31c7d48 | 516 | pte_t *kpte, old_pte; |
1da177e4 | 517 | |
97f99fed | 518 | repeat: |
f0646e43 | 519 | kpte = lookup_address(address, &level); |
1da177e4 | 520 | if (!kpte) |
c31c7d48 TG |
521 | return primary ? -EINVAL : 0; |
522 | ||
523 | old_pte = *kpte; | |
524 | if (!pte_val(old_pte)) { | |
525 | if (!primary) | |
526 | return 0; | |
527 | printk(KERN_WARNING "CPA: called for zero pte. " | |
528 | "vaddr = %lx cpa->vaddr = %lx\n", address, | |
529 | cpa->vaddr); | |
530 | WARN_ON(1); | |
1da177e4 | 531 | return -EINVAL; |
c31c7d48 | 532 | } |
9f4c815c | 533 | |
30551bb3 | 534 | if (level == PG_LEVEL_4K) { |
c31c7d48 | 535 | pte_t new_pte; |
626c2c9d | 536 | pgprot_t new_prot = pte_pgprot(old_pte); |
c31c7d48 | 537 | unsigned long pfn = pte_pfn(old_pte); |
86f03989 | 538 | |
72e458df TG |
539 | pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); |
540 | pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); | |
86f03989 | 541 | |
c31c7d48 | 542 | new_prot = static_protections(new_prot, address, pfn); |
86f03989 | 543 | |
626c2c9d AV |
544 | /* |
545 | * We need to keep the pfn from the existing PTE, | |
546 | * after all we're only going to change it's attributes | |
547 | * not the memory it points to | |
548 | */ | |
c31c7d48 TG |
549 | new_pte = pfn_pte(pfn, canon_pgprot(new_prot)); |
550 | cpa->pfn = pfn; | |
f4ae5da0 TG |
551 | /* |
552 | * Do we really change anything ? | |
553 | */ | |
554 | if (pte_val(old_pte) != pte_val(new_pte)) { | |
555 | set_pte_atomic(kpte, new_pte); | |
556 | cpa->flushtlb = 1; | |
557 | } | |
65e074df TG |
558 | cpa->numpages = 1; |
559 | return 0; | |
1da177e4 | 560 | } |
65e074df TG |
561 | |
562 | /* | |
563 | * Check, whether we can keep the large page intact | |
564 | * and just change the pte: | |
565 | */ | |
beaff633 | 566 | do_split = try_preserve_large_page(kpte, address, cpa); |
65e074df TG |
567 | /* |
568 | * When the range fits into the existing large page, | |
569 | * return. cp->numpages and cpa->tlbflush have been updated in | |
570 | * try_large_page: | |
571 | */ | |
87f7f8fe IM |
572 | if (do_split <= 0) |
573 | return do_split; | |
65e074df TG |
574 | |
575 | /* | |
576 | * We have to split the large page: | |
577 | */ | |
87f7f8fe IM |
578 | err = split_large_page(kpte, address); |
579 | if (!err) { | |
580 | cpa->flushtlb = 1; | |
581 | goto repeat; | |
582 | } | |
beaff633 | 583 | |
87f7f8fe | 584 | return err; |
9f4c815c | 585 | } |
1da177e4 | 586 | |
c31c7d48 TG |
587 | static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias); |
588 | ||
589 | static int cpa_process_alias(struct cpa_data *cpa) | |
1da177e4 | 590 | { |
c31c7d48 | 591 | struct cpa_data alias_cpa; |
f34b439f | 592 | int ret = 0; |
44af6c41 | 593 | |
c31c7d48 TG |
594 | if (cpa->pfn > max_pfn_mapped) |
595 | return 0; | |
626c2c9d | 596 | |
f34b439f TG |
597 | /* |
598 | * No need to redo, when the primary call touched the direct | |
599 | * mapping already: | |
600 | */ | |
601 | if (!within(cpa->vaddr, PAGE_OFFSET, | |
602 | PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { | |
44af6c41 | 603 | |
f34b439f TG |
604 | alias_cpa = *cpa; |
605 | alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); | |
606 | ||
607 | ret = __change_page_attr_set_clr(&alias_cpa, 0); | |
608 | } | |
44af6c41 | 609 | |
44af6c41 | 610 | #ifdef CONFIG_X86_64 |
c31c7d48 TG |
611 | if (ret) |
612 | return ret; | |
f34b439f TG |
613 | /* |
614 | * No need to redo, when the primary call touched the high | |
615 | * mapping already: | |
616 | */ | |
617 | if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end)) | |
618 | return 0; | |
619 | ||
488fd995 | 620 | /* |
0879750f TG |
621 | * If the physical address is inside the kernel map, we need |
622 | * to touch the high mapped kernel as well: | |
488fd995 | 623 | */ |
c31c7d48 TG |
624 | if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) |
625 | return 0; | |
0879750f | 626 | |
c31c7d48 TG |
627 | alias_cpa = *cpa; |
628 | alias_cpa.vaddr = | |
629 | (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; | |
630 | ||
631 | /* | |
632 | * The high mapping range is imprecise, so ignore the return value. | |
633 | */ | |
634 | __change_page_attr_set_clr(&alias_cpa, 0); | |
488fd995 | 635 | #endif |
c31c7d48 | 636 | return ret; |
1da177e4 LT |
637 | } |
638 | ||
c31c7d48 | 639 | static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) |
ff31452b | 640 | { |
65e074df | 641 | int ret, numpages = cpa->numpages; |
ff31452b | 642 | |
65e074df TG |
643 | while (numpages) { |
644 | /* | |
645 | * Store the remaining nr of pages for the large page | |
646 | * preservation check. | |
647 | */ | |
648 | cpa->numpages = numpages; | |
c31c7d48 TG |
649 | |
650 | ret = __change_page_attr(cpa, checkalias); | |
ff31452b TG |
651 | if (ret) |
652 | return ret; | |
ff31452b | 653 | |
c31c7d48 TG |
654 | if (checkalias) { |
655 | ret = cpa_process_alias(cpa); | |
656 | if (ret) | |
657 | return ret; | |
658 | } | |
659 | ||
65e074df TG |
660 | /* |
661 | * Adjust the number of pages with the result of the | |
662 | * CPA operation. Either a large page has been | |
663 | * preserved or a single page update happened. | |
664 | */ | |
665 | BUG_ON(cpa->numpages > numpages); | |
666 | numpages -= cpa->numpages; | |
667 | cpa->vaddr += cpa->numpages * PAGE_SIZE; | |
668 | } | |
ff31452b TG |
669 | return 0; |
670 | } | |
671 | ||
6bb8383b AK |
672 | static inline int cache_attr(pgprot_t attr) |
673 | { | |
674 | return pgprot_val(attr) & | |
675 | (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); | |
676 | } | |
677 | ||
ff31452b TG |
678 | static int change_page_attr_set_clr(unsigned long addr, int numpages, |
679 | pgprot_t mask_set, pgprot_t mask_clr) | |
680 | { | |
72e458df | 681 | struct cpa_data cpa; |
af96e443 | 682 | int ret, cache, checkalias; |
331e4065 TG |
683 | |
684 | /* | |
685 | * Check, if we are requested to change a not supported | |
686 | * feature: | |
687 | */ | |
688 | mask_set = canon_pgprot(mask_set); | |
689 | mask_clr = canon_pgprot(mask_clr); | |
690 | if (!pgprot_val(mask_set) && !pgprot_val(mask_clr)) | |
691 | return 0; | |
692 | ||
69b1415e TG |
693 | /* Ensure we are PAGE_SIZE aligned */ |
694 | if (addr & ~PAGE_MASK) { | |
695 | addr &= PAGE_MASK; | |
696 | /* | |
697 | * People should not be passing in unaligned addresses: | |
698 | */ | |
699 | WARN_ON_ONCE(1); | |
700 | } | |
701 | ||
72e458df TG |
702 | cpa.vaddr = addr; |
703 | cpa.numpages = numpages; | |
704 | cpa.mask_set = mask_set; | |
705 | cpa.mask_clr = mask_clr; | |
f4ae5da0 | 706 | cpa.flushtlb = 0; |
72e458df | 707 | |
af96e443 TG |
708 | /* No alias checking for _NX bit modifications */ |
709 | checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; | |
710 | ||
711 | ret = __change_page_attr_set_clr(&cpa, checkalias); | |
ff31452b | 712 | |
f4ae5da0 TG |
713 | /* |
714 | * Check whether we really changed something: | |
715 | */ | |
716 | if (!cpa.flushtlb) | |
76ebd054 | 717 | goto out; |
f4ae5da0 | 718 | |
6bb8383b AK |
719 | /* |
720 | * No need to flush, when we did not set any of the caching | |
721 | * attributes: | |
722 | */ | |
723 | cache = cache_attr(mask_set); | |
724 | ||
57a6a46a TG |
725 | /* |
726 | * On success we use clflush, when the CPU supports it to | |
727 | * avoid the wbindv. If the CPU does not support it and in the | |
af1e6844 | 728 | * error case we fall back to cpa_flush_all (which uses |
57a6a46a TG |
729 | * wbindv): |
730 | */ | |
731 | if (!ret && cpu_has_clflush) | |
6bb8383b | 732 | cpa_flush_range(addr, numpages, cache); |
57a6a46a | 733 | else |
6bb8383b | 734 | cpa_flush_all(cache); |
ff31452b | 735 | |
76ebd054 TG |
736 | out: |
737 | cpa_fill_pool(); | |
ff31452b TG |
738 | return ret; |
739 | } | |
740 | ||
56744546 TG |
741 | static inline int change_page_attr_set(unsigned long addr, int numpages, |
742 | pgprot_t mask) | |
75cbade8 | 743 | { |
56744546 | 744 | return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0)); |
75cbade8 AV |
745 | } |
746 | ||
56744546 TG |
747 | static inline int change_page_attr_clear(unsigned long addr, int numpages, |
748 | pgprot_t mask) | |
72932c7a | 749 | { |
5827040d | 750 | return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask); |
72932c7a TG |
751 | } |
752 | ||
753 | int set_memory_uc(unsigned long addr, int numpages) | |
754 | { | |
755 | return change_page_attr_set(addr, numpages, | |
756 | __pgprot(_PAGE_PCD | _PAGE_PWT)); | |
75cbade8 AV |
757 | } |
758 | EXPORT_SYMBOL(set_memory_uc); | |
759 | ||
760 | int set_memory_wb(unsigned long addr, int numpages) | |
761 | { | |
72932c7a TG |
762 | return change_page_attr_clear(addr, numpages, |
763 | __pgprot(_PAGE_PCD | _PAGE_PWT)); | |
75cbade8 AV |
764 | } |
765 | EXPORT_SYMBOL(set_memory_wb); | |
766 | ||
767 | int set_memory_x(unsigned long addr, int numpages) | |
768 | { | |
72932c7a | 769 | return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX)); |
75cbade8 AV |
770 | } |
771 | EXPORT_SYMBOL(set_memory_x); | |
772 | ||
773 | int set_memory_nx(unsigned long addr, int numpages) | |
774 | { | |
72932c7a | 775 | return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX)); |
75cbade8 AV |
776 | } |
777 | EXPORT_SYMBOL(set_memory_nx); | |
778 | ||
779 | int set_memory_ro(unsigned long addr, int numpages) | |
780 | { | |
72932c7a | 781 | return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); |
75cbade8 | 782 | } |
75cbade8 AV |
783 | |
784 | int set_memory_rw(unsigned long addr, int numpages) | |
785 | { | |
72932c7a | 786 | return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); |
75cbade8 | 787 | } |
f62d0f00 IM |
788 | |
789 | int set_memory_np(unsigned long addr, int numpages) | |
790 | { | |
72932c7a | 791 | return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); |
f62d0f00 | 792 | } |
75cbade8 AV |
793 | |
794 | int set_pages_uc(struct page *page, int numpages) | |
795 | { | |
796 | unsigned long addr = (unsigned long)page_address(page); | |
75cbade8 | 797 | |
d7c8f21a | 798 | return set_memory_uc(addr, numpages); |
75cbade8 AV |
799 | } |
800 | EXPORT_SYMBOL(set_pages_uc); | |
801 | ||
802 | int set_pages_wb(struct page *page, int numpages) | |
803 | { | |
804 | unsigned long addr = (unsigned long)page_address(page); | |
75cbade8 | 805 | |
d7c8f21a | 806 | return set_memory_wb(addr, numpages); |
75cbade8 AV |
807 | } |
808 | EXPORT_SYMBOL(set_pages_wb); | |
809 | ||
810 | int set_pages_x(struct page *page, int numpages) | |
811 | { | |
812 | unsigned long addr = (unsigned long)page_address(page); | |
75cbade8 | 813 | |
d7c8f21a | 814 | return set_memory_x(addr, numpages); |
75cbade8 AV |
815 | } |
816 | EXPORT_SYMBOL(set_pages_x); | |
817 | ||
818 | int set_pages_nx(struct page *page, int numpages) | |
819 | { | |
820 | unsigned long addr = (unsigned long)page_address(page); | |
75cbade8 | 821 | |
d7c8f21a | 822 | return set_memory_nx(addr, numpages); |
75cbade8 AV |
823 | } |
824 | EXPORT_SYMBOL(set_pages_nx); | |
825 | ||
826 | int set_pages_ro(struct page *page, int numpages) | |
827 | { | |
828 | unsigned long addr = (unsigned long)page_address(page); | |
75cbade8 | 829 | |
d7c8f21a | 830 | return set_memory_ro(addr, numpages); |
75cbade8 | 831 | } |
75cbade8 AV |
832 | |
833 | int set_pages_rw(struct page *page, int numpages) | |
834 | { | |
835 | unsigned long addr = (unsigned long)page_address(page); | |
e81d5dc4 | 836 | |
d7c8f21a | 837 | return set_memory_rw(addr, numpages); |
78c94aba IM |
838 | } |
839 | ||
#ifdef CONFIG_DEBUG_PAGEALLOC

/* Map pages present+RW for DEBUG_PAGEALLOC. */
static int __set_pages_p(struct page *page, int numpages)
{
	struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
				.numpages = numpages,
				.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
				.mask_clr = __pgprot(0)};

	return __change_page_attr_set_clr(&cpa, 1);
}

/* Unmap pages (clear present+RW) for DEBUG_PAGEALLOC. */
static int __set_pages_np(struct page *page, int numpages)
{
	struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
				.numpages = numpages,
				.mask_set = __pgprot(0),
				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};

	return __change_page_attr_set_clr(&cpa, 1);
}

void kernel_map_pages(struct page *page, int numpages, int enable)
{
	if (PageHighMem(page))
		return;
	if (!enable) {
		debug_check_no_locks_freed(page_address(page),
					   numpages * PAGE_SIZE);
	}

	/*
	 * If page allocator is not up yet then do not call c_p_a():
	 */
	if (!debug_pagealloc_enabled)
		return;

	/*
	 * The return value is ignored as the calls cannot fail.
	 * Large pages are kept enabled at boot time, and are
	 * split up quickly with DEBUG_PAGEALLOC. If a splitup
	 * fails here (due to temporary memory shortage) no damage
	 * is done because we just keep the largepage intact up
	 * to the next attempt when it will likely be split up:
	 */
	if (enable)
		__set_pages_p(page, numpages);
	else
		__set_pages_np(page, numpages);

	/*
	 * We should perform an IPI and flush all tlbs,
	 * but that can deadlock->flush only current cpu:
	 */
	__flush_tlb_all();

	/*
	 * Try to refill the page pool here. We can do this only after
	 * the tlb flush.
	 */
	cpa_fill_pool();
}
#endif
/*
 * The testcases use internal knowledge of the implementation that shouldn't
 * be exposed to the rest of the kernel. Include these directly here.
 */
#ifdef CONFIG_CPA_DEBUG
#include "pageattr-test.c"
#endif