arch/x86/xen/mmu_pv.c
1 /*
2 * Xen mmu operations
3 *
4 * This file contains the various mmu fetch and update operations.
5 * The most important job they must perform is the mapping between the
6 * domain's pfn and the overall machine mfns.
7 *
8 * Xen allows guests to directly update the pagetable, in a controlled
9 * fashion. In other words, the guest modifies the same pagetable
10 * that the CPU actually uses, which eliminates the overhead of having
11 * a separate shadow pagetable.
12 *
13 * In order to allow this, it falls on the guest domain to map its
14 * notion of a "physical" pfn - which is just a domain-local linear
15 * address - into a real "machine address" which the CPU's MMU can
16 * use.
17 *
18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19 * inserted directly into the pagetable. When creating a new
20 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
21 * when reading the content back with __(pgd|pmd|pte)_val, it converts
22 * the mfn back into a pfn.
23 *
24 * The other constraint is that all pages which make up a pagetable
25 * must be mapped read-only in the guest. This prevents uncontrolled
26 * guest updates to the pagetable. Xen strictly enforces this, and
27 * will disallow any pagetable update which will end up mapping a
28 * pagetable page RW, and will disallow using any writable page as a
29 * pagetable.
30 *
31 * Naively, when loading %cr3 with the base of a new pagetable, Xen
32 * would need to validate the whole pagetable before going on.
33 * Naturally, this is quite slow. The solution is to "pin" a
34 * pagetable, which enforces all the constraints on the pagetable even
35 * when it is not actively in use. This means that Xen can be assured
36 * that it is still valid when you do load it into %cr3, and doesn't
37 * need to revalidate it.
38 *
39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
40 */
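
/*
 * A minimal sketch of the pin/unpin lifecycle implemented below (assuming
 * these hooks end up wired into the pv_mmu_ops as usual):
 *
 *	xen_activate_mm() / xen_dup_mmap()
 *		-> xen_pgd_pin()
 *			-> __xen_pgd_walk(..., xen_pin_page, ...)
 *			   (make every pagetable page RO)
 *			-> xen_do_pin(MMUEXT_PIN_L4_TABLE, ...)
 *			   (MMUEXT_PIN_L3_TABLE on 32-bit PAE)
 *	xen_exit_mmap()
 *		-> xen_pgd_unpin()
 *			-> xen_do_pin(MMUEXT_UNPIN_TABLE, ...)
 *			-> __xen_pgd_walk(..., xen_unpin_page, ...)
 *			   (make the pages RW again)
 */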
41 #include <linux/sched/mm.h>
42 #include <linux/highmem.h>
43 #include <linux/debugfs.h>
44 #include <linux/bug.h>
45 #include <linux/vmalloc.h>
46 #include <linux/export.h>
47 #include <linux/init.h>
48 #include <linux/gfp.h>
49 #include <linux/memblock.h>
50 #include <linux/seq_file.h>
51 #include <linux/crash_dump.h>
52 #ifdef CONFIG_KEXEC_CORE
53 #include <linux/kexec.h>
54 #endif
55
56 #include <trace/events/xen.h>
57
58 #include <asm/pgtable.h>
59 #include <asm/tlbflush.h>
60 #include <asm/fixmap.h>
61 #include <asm/mmu_context.h>
62 #include <asm/setup.h>
63 #include <asm/paravirt.h>
64 #include <asm/e820/api.h>
65 #include <asm/linkage.h>
66 #include <asm/page.h>
67 #include <asm/init.h>
68 #include <asm/pat.h>
69 #include <asm/smp.h>
70
71 #include <asm/xen/hypercall.h>
72 #include <asm/xen/hypervisor.h>
73
74 #include <xen/xen.h>
75 #include <xen/page.h>
76 #include <xen/interface/xen.h>
77 #include <xen/interface/hvm/hvm_op.h>
78 #include <xen/interface/version.h>
79 #include <xen/interface/memory.h>
80 #include <xen/hvc-console.h>
81
82 #include "multicalls.h"
83 #include "mmu.h"
84 #include "debugfs.h"
85
86 #ifdef CONFIG_X86_32
87 /*
88 * Identity map, in addition to the plain kernel map. This needs to be
89 * large enough to allocate the page table pages needed to map the rest.
90 * Each page can map 2MB.
91 */
92 #define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4)
93 static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
94 #endif
95 #ifdef CONFIG_X86_64
96 /* l3 pud for userspace vsyscall mapping */
97 static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
98 #endif /* CONFIG_X86_64 */
99
100 /*
101 * Note about cr3 (pagetable base) values:
102 *
103 * xen_cr3 contains the current logical cr3 value; it contains the
104 * last set cr3. This may not be the current effective cr3, because
105 * its update may still be lazily deferred. However, a vcpu looking
106 * at its own cr3 can use this value knowing that everything will
107 * be self-consistent.
108 *
109 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
110 * hypercall to set the vcpu cr3 is complete (so it may be a little
111 * out of date, but it will never be set early). If one vcpu is
112 * looking at another vcpu's cr3 value, it should use this variable.
113 */
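
/*
 * A minimal sketch of the cross-vcpu case described above (this is
 * essentially what xen_drop_mm_ref() below does when hunting for stale
 * references):
 *
 *	if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
 *		cpumask_set_cpu(cpu, mask);	(that vcpu may still use it)
 */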
114 DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
115 DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
116
117 static phys_addr_t xen_pt_base, xen_pt_size __initdata;
118
119 /*
120 * Just beyond the highest usermode address. STACK_TOP_MAX has a
121 * redzone above it, so round it up to a PGD boundary.
122 */
123 #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
124
125 void make_lowmem_page_readonly(void *vaddr)
126 {
127 pte_t *pte, ptev;
128 unsigned long address = (unsigned long)vaddr;
129 unsigned int level;
130
131 pte = lookup_address(address, &level);
132 if (pte == NULL)
133 return; /* vaddr missing */
134
135 ptev = pte_wrprotect(*pte);
136
137 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
138 BUG();
139 }
140
141 void make_lowmem_page_readwrite(void *vaddr)
142 {
143 pte_t *pte, ptev;
144 unsigned long address = (unsigned long)vaddr;
145 unsigned int level;
146
147 pte = lookup_address(address, &level);
148 if (pte == NULL)
149 return; /* vaddr missing */
150
151 ptev = pte_mkwrite(*pte);
152
153 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
154 BUG();
155 }
156
157
158 static bool xen_page_pinned(void *ptr)
159 {
160 struct page *page = virt_to_page(ptr);
161
162 return PagePinned(page);
163 }
164
165 void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
166 {
167 struct multicall_space mcs;
168 struct mmu_update *u;
169
170 trace_xen_mmu_set_domain_pte(ptep, pteval, domid);
171
172 mcs = xen_mc_entry(sizeof(*u));
173 u = mcs.args;
174
175 /* ptep might be kmapped when using 32-bit HIGHPTE */
176 u->ptr = virt_to_machine(ptep).maddr;
177 u->val = pte_val_ma(pteval);
178
179 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
180
181 xen_mc_issue(PARAVIRT_LAZY_MMU);
182 }
183 EXPORT_SYMBOL_GPL(xen_set_domain_pte);
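
/*
 * A minimal usage sketch (assuming 'mfn' is a machine frame owned by
 * 'domid', e.g. one obtained through a grant mapping): install it into a
 * local pte, recording the foreign frame owner in the hypercall:
 *
 *	xen_set_domain_pte(ptep, mfn_pte(mfn, PAGE_KERNEL), domid);
 */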
184
185 static void xen_extend_mmu_update(const struct mmu_update *update)
186 {
187 struct multicall_space mcs;
188 struct mmu_update *u;
189
190 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
191
192 if (mcs.mc != NULL) {
193 mcs.mc->args[1]++;
194 } else {
195 mcs = __xen_mc_entry(sizeof(*u));
196 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
197 }
198
199 u = mcs.args;
200 *u = *update;
201 }
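
/*
 * A minimal sketch of the batching pattern used by the callers below
 * (xen_set_pmd_hyper(), xen_set_pud_hyper(), ...):
 *
 *	xen_mc_batch();
 *	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 *	u.val = pmd_val_ma(val);
 *	xen_extend_mmu_update(&u);
 *	xen_mc_issue(PARAVIRT_LAZY_MMU);
 *
 * When we are already inside a lazy-MMU section, xen_mc_issue() just leaves
 * the update queued in the multicall batch; otherwise it flushes immediately.
 */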
202
203 static void xen_extend_mmuext_op(const struct mmuext_op *op)
204 {
205 struct multicall_space mcs;
206 struct mmuext_op *u;
207
208 mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u));
209
210 if (mcs.mc != NULL) {
211 mcs.mc->args[1]++;
212 } else {
213 mcs = __xen_mc_entry(sizeof(*u));
214 MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
215 }
216
217 u = mcs.args;
218 *u = *op;
219 }
220
221 static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
222 {
223 struct mmu_update u;
224
225 preempt_disable();
226
227 xen_mc_batch();
228
229 /* ptr may be ioremapped for 64-bit pagetable setup */
230 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
231 u.val = pmd_val_ma(val);
232 xen_extend_mmu_update(&u);
233
234 xen_mc_issue(PARAVIRT_LAZY_MMU);
235
236 preempt_enable();
237 }
238
239 static void xen_set_pmd(pmd_t *ptr, pmd_t val)
240 {
241 trace_xen_mmu_set_pmd(ptr, val);
242
243 /* If page is not pinned, we can just update the entry
244 directly */
245 if (!xen_page_pinned(ptr)) {
246 *ptr = val;
247 return;
248 }
249
250 xen_set_pmd_hyper(ptr, val);
251 }
252
253 /*
254 * Associate a virtual page frame with a given physical page frame
255 * and protection flags for that frame.
256 */
257 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
258 {
259 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
260 }
261
262 static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
263 {
264 struct mmu_update u;
265
266 if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
267 return false;
268
269 xen_mc_batch();
270
271 u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
272 u.val = pte_val_ma(pteval);
273 xen_extend_mmu_update(&u);
274
275 xen_mc_issue(PARAVIRT_LAZY_MMU);
276
277 return true;
278 }
279
280 static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
281 {
282 if (!xen_batched_set_pte(ptep, pteval)) {
283 /*
284 * Could call native_set_pte() here and trap and
285 * emulate the PTE write but with 32-bit guests this
286 * needs two traps (one for each of the two 32-bit
287 * words in the PTE) so do one hypercall directly
288 * instead.
289 */
290 struct mmu_update u;
291
292 u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
293 u.val = pte_val_ma(pteval);
294 HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
295 }
296 }
297
298 static void xen_set_pte(pte_t *ptep, pte_t pteval)
299 {
300 trace_xen_mmu_set_pte(ptep, pteval);
301 __xen_set_pte(ptep, pteval);
302 }
303
304 static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
305 pte_t *ptep, pte_t pteval)
306 {
307 trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval);
308 __xen_set_pte(ptep, pteval);
309 }
310
311 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
312 unsigned long addr, pte_t *ptep)
313 {
314 /* Just return the pte as-is. We preserve the bits on commit */
315 trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
316 return *ptep;
317 }
318
319 void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
320 pte_t *ptep, pte_t pte)
321 {
322 struct mmu_update u;
323
324 trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
325 xen_mc_batch();
326
327 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
328 u.val = pte_val_ma(pte);
329 xen_extend_mmu_update(&u);
330
331 xen_mc_issue(PARAVIRT_LAZY_MMU);
332 }
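
/*
 * A minimal sketch of how generic mm code drives the start/commit pair
 * above (assuming a protection-changing loop along the lines of
 * change_protection()):
 *
 *	pte = xen_ptep_modify_prot_start(mm, addr, ptep);
 *	pte = pte_modify(pte, newprot);
 *	xen_ptep_modify_prot_commit(mm, addr, ptep, pte);
 *
 * The commit uses MMU_PT_UPDATE_PRESERVE_AD, so accessed/dirty bits set by
 * the hypervisor in the meantime are not lost.
 */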
333
334 /* Assume pteval_t is equivalent to all the other *val_t types. */
335 static pteval_t pte_mfn_to_pfn(pteval_t val)
336 {
337 if (val & _PAGE_PRESENT) {
338 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
339 unsigned long pfn = mfn_to_pfn(mfn);
340
341 pteval_t flags = val & PTE_FLAGS_MASK;
342 if (unlikely(pfn == ~0))
343 val = flags & ~_PAGE_PRESENT;
344 else
345 val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
346 }
347
348 return val;
349 }
350
351 static pteval_t pte_pfn_to_mfn(pteval_t val)
352 {
353 if (val & _PAGE_PRESENT) {
354 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
355 pteval_t flags = val & PTE_FLAGS_MASK;
356 unsigned long mfn;
357
358 if (!xen_feature(XENFEAT_auto_translated_physmap))
359 mfn = __pfn_to_mfn(pfn);
360 else
361 mfn = pfn;
362 /*
363 * If there's no mfn for the pfn, then just create an
364 * empty non-present pte. Unfortunately this loses
365 * information about the original pfn, so
366 * pte_mfn_to_pfn is asymmetric.
367 */
368 if (unlikely(mfn == INVALID_P2M_ENTRY)) {
369 mfn = 0;
370 flags = 0;
371 } else
372 mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
373 val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
374 }
375
376 return val;
377 }
378
379 __visible pteval_t xen_pte_val(pte_t pte)
380 {
381 pteval_t pteval = pte.pte;
382
383 return pte_mfn_to_pfn(pteval);
384 }
385 PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
386
387 __visible pgdval_t xen_pgd_val(pgd_t pgd)
388 {
389 return pte_mfn_to_pfn(pgd.pgd);
390 }
391 PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
392
393 __visible pte_t xen_make_pte(pteval_t pte)
394 {
395 pte = pte_pfn_to_mfn(pte);
396
397 return native_make_pte(pte);
398 }
399 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
400
401 __visible pgd_t xen_make_pgd(pgdval_t pgd)
402 {
403 pgd = pte_pfn_to_mfn(pgd);
404 return native_make_pgd(pgd);
405 }
406 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
407
408 __visible pmdval_t xen_pmd_val(pmd_t pmd)
409 {
410 return pte_mfn_to_pfn(pmd.pmd);
411 }
412 PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
413
414 static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
415 {
416 struct mmu_update u;
417
418 preempt_disable();
419
420 xen_mc_batch();
421
422 /* ptr may be ioremapped for 64-bit pagetable setup */
423 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
424 u.val = pud_val_ma(val);
425 xen_extend_mmu_update(&u);
426
427 xen_mc_issue(PARAVIRT_LAZY_MMU);
428
429 preempt_enable();
430 }
431
432 static void xen_set_pud(pud_t *ptr, pud_t val)
433 {
434 trace_xen_mmu_set_pud(ptr, val);
435
436 /* If page is not pinned, we can just update the entry
437 directly */
438 if (!xen_page_pinned(ptr)) {
439 *ptr = val;
440 return;
441 }
442
443 xen_set_pud_hyper(ptr, val);
444 }
445
446 #ifdef CONFIG_X86_PAE
447 static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
448 {
449 trace_xen_mmu_set_pte_atomic(ptep, pte);
450 set_64bit((u64 *)ptep, native_pte_val(pte));
451 }
452
453 static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
454 {
455 trace_xen_mmu_pte_clear(mm, addr, ptep);
456 if (!xen_batched_set_pte(ptep, native_make_pte(0)))
457 native_pte_clear(mm, addr, ptep);
458 }
459
460 static void xen_pmd_clear(pmd_t *pmdp)
461 {
462 trace_xen_mmu_pmd_clear(pmdp);
463 set_pmd(pmdp, __pmd(0));
464 }
465 #endif /* CONFIG_X86_PAE */
466
467 __visible pmd_t xen_make_pmd(pmdval_t pmd)
468 {
469 pmd = pte_pfn_to_mfn(pmd);
470 return native_make_pmd(pmd);
471 }
472 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
473
474 #if CONFIG_PGTABLE_LEVELS == 4
475 __visible pudval_t xen_pud_val(pud_t pud)
476 {
477 return pte_mfn_to_pfn(pud.pud);
478 }
479 PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
480
481 __visible pud_t xen_make_pud(pudval_t pud)
482 {
483 pud = pte_pfn_to_mfn(pud);
484
485 return native_make_pud(pud);
486 }
487 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
488
489 static pgd_t *xen_get_user_pgd(pgd_t *pgd)
490 {
491 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
492 unsigned offset = pgd - pgd_page;
493 pgd_t *user_ptr = NULL;
494
495 if (offset < pgd_index(USER_LIMIT)) {
496 struct page *page = virt_to_page(pgd_page);
497 user_ptr = (pgd_t *)page->private;
498 if (user_ptr)
499 user_ptr += offset;
500 }
501
502 return user_ptr;
503 }
504
505 static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
506 {
507 struct mmu_update u;
508
509 u.ptr = virt_to_machine(ptr).maddr;
510 u.val = p4d_val_ma(val);
511 xen_extend_mmu_update(&u);
512 }
513
514 /*
515 * Raw hypercall-based set_p4d, intended for use in early boot before
516 * there's a page structure. This implies:
517 * 1. The only existing pagetable is the kernel's
518 * 2. It is always pinned
519 * 3. It has no user pagetable attached to it
520 */
521 static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
522 {
523 preempt_disable();
524
525 xen_mc_batch();
526
527 __xen_set_p4d_hyper(ptr, val);
528
529 xen_mc_issue(PARAVIRT_LAZY_MMU);
530
531 preempt_enable();
532 }
533
534 static void xen_set_p4d(p4d_t *ptr, p4d_t val)
535 {
536 pgd_t *user_ptr = xen_get_user_pgd((pgd_t *)ptr);
537 pgd_t pgd_val;
538
539 trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val);
540
541 /* If page is not pinned, we can just update the entry
542 directly */
543 if (!xen_page_pinned(ptr)) {
544 *ptr = val;
545 if (user_ptr) {
546 WARN_ON(xen_page_pinned(user_ptr));
547 pgd_val.pgd = p4d_val_ma(val);
548 *user_ptr = pgd_val;
549 }
550 return;
551 }
552
553 /* If it's pinned, then we can at least batch the kernel and
554 user updates together. */
555 xen_mc_batch();
556
557 __xen_set_p4d_hyper(ptr, val);
558 if (user_ptr)
559 __xen_set_p4d_hyper((p4d_t *)user_ptr, val);
560
561 xen_mc_issue(PARAVIRT_LAZY_MMU);
562 }
563 #endif /* CONFIG_PGTABLE_LEVELS == 4 */
564
565 static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
566 int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
567 bool last, unsigned long limit)
568 {
569 int i, nr, flush = 0;
570
571 nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD;
572 for (i = 0; i < nr; i++) {
573 if (!pmd_none(pmd[i]))
574 flush |= (*func)(mm, pmd_page(pmd[i]), PT_PTE);
575 }
576 return flush;
577 }
578
579 static int xen_pud_walk(struct mm_struct *mm, pud_t *pud,
580 int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
581 bool last, unsigned long limit)
582 {
583 int i, nr, flush = 0;
584
585 nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD;
586 for (i = 0; i < nr; i++) {
587 pmd_t *pmd;
588
589 if (pud_none(pud[i]))
590 continue;
591
592 pmd = pmd_offset(&pud[i], 0);
593 if (PTRS_PER_PMD > 1)
594 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
595 flush |= xen_pmd_walk(mm, pmd, func,
596 last && i == nr - 1, limit);
597 }
598 return flush;
599 }
600
601 static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
602 int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
603 bool last, unsigned long limit)
604 {
605 int i, nr, flush = 0;
606
607 nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D;
608 for (i = 0; i < nr; i++) {
609 pud_t *pud;
610
611 if (p4d_none(p4d[i]))
612 continue;
613
614 pud = pud_offset(&p4d[i], 0);
615 if (PTRS_PER_PUD > 1)
616 flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
617 flush |= xen_pud_walk(mm, pud, func,
618 last && i == nr - 1, limit);
619 }
620 return flush;
621 }
622
623 /*
624 * (Yet another) pagetable walker. This one is intended for pinning a
625 * pagetable. This means that it walks a pagetable and calls the
626 * callback function on each page it finds making up the page table,
627 * at every level. It walks the entire pagetable, but it only bothers
628 * pinning pte pages which are below limit. In the normal case this
629 * will be STACK_TOP_MAX, but at boot we need to pin up to
630 * FIXADDR_TOP.
631 *
632 * For 32-bit the important bit is that we don't pin beyond there,
633 * because then we start getting into Xen's ptes.
634 *
635 * For 64-bit, we must skip the Xen hole in the middle of the address
636 * space, just after the big x86-64 virtual hole.
637 */
638 static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
639 int (*func)(struct mm_struct *mm, struct page *,
640 enum pt_level),
641 unsigned long limit)
642 {
643 int i, nr, flush = 0;
644 unsigned hole_low, hole_high;
645
646 /* The limit is the last byte to be touched */
647 limit--;
648 BUG_ON(limit >= FIXADDR_TOP);
649
650 if (xen_feature(XENFEAT_auto_translated_physmap))
651 return 0;
652
653 /*
654 * 64-bit has a great big hole in the middle of the address
655 * space, which contains the Xen mappings. On 32-bit these
656 * will end up making a zero-sized hole, so this is a no-op.
657 */
658 hole_low = pgd_index(USER_LIMIT);
659 hole_high = pgd_index(PAGE_OFFSET);
660
661 nr = pgd_index(limit) + 1;
662 for (i = 0; i < nr; i++) {
663 p4d_t *p4d;
664
665 if (i >= hole_low && i < hole_high)
666 continue;
667
668 if (pgd_none(pgd[i]))
669 continue;
670
671 p4d = p4d_offset(&pgd[i], 0);
672 if (PTRS_PER_P4D > 1)
673 flush |= (*func)(mm, virt_to_page(p4d), PT_P4D);
674 flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
675 }
676
677 /* Do the top level last, so that the callbacks can use it as
678 a cue to do final things like tlb flushes. */
679 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
680
681 return flush;
682 }
683
684 static int xen_pgd_walk(struct mm_struct *mm,
685 int (*func)(struct mm_struct *mm, struct page *,
686 enum pt_level),
687 unsigned long limit)
688 {
689 return __xen_pgd_walk(mm, mm->pgd, func, limit);
690 }
691
692 /* If we're using split pte locks, then take the page's lock and
693 return a pointer to it. Otherwise return NULL. */
694 static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
695 {
696 spinlock_t *ptl = NULL;
697
698 #if USE_SPLIT_PTE_PTLOCKS
699 ptl = ptlock_ptr(page);
700 spin_lock_nest_lock(ptl, &mm->page_table_lock);
701 #endif
702
703 return ptl;
704 }
705
706 static void xen_pte_unlock(void *v)
707 {
708 spinlock_t *ptl = v;
709 spin_unlock(ptl);
710 }
711
712 static void xen_do_pin(unsigned level, unsigned long pfn)
713 {
714 struct mmuext_op op;
715
716 op.cmd = level;
717 op.arg1.mfn = pfn_to_mfn(pfn);
718
719 xen_extend_mmuext_op(&op);
720 }
721
722 static int xen_pin_page(struct mm_struct *mm, struct page *page,
723 enum pt_level level)
724 {
725 unsigned pgfl = TestSetPagePinned(page);
726 int flush;
727
728 if (pgfl)
729 flush = 0; /* already pinned */
730 else if (PageHighMem(page))
731 /* kmaps need flushing if we found an unpinned
732 highpage */
733 flush = 1;
734 else {
735 void *pt = lowmem_page_address(page);
736 unsigned long pfn = page_to_pfn(page);
737 struct multicall_space mcs = __xen_mc_entry(0);
738 spinlock_t *ptl;
739
740 flush = 0;
741
742 /*
743 * We need to hold the pagetable lock between the time
744 * we make the pagetable RO and when we actually pin
745 * it. If we don't, then other users may come in and
746 * attempt to update the pagetable by writing it,
747 * which will fail because the memory is RO but not
748 * pinned, so Xen won't do the trap'n'emulate.
749 *
750 * If we're using split pte locks, we can't hold the
751 * entire pagetable's worth of locks during the
752 * traverse, because we may wrap the preempt count (8
753 * bits). The solution is to mark RO and pin each PTE
754 * page while holding the lock. This means the number
755 * of locks we end up holding is never more than a
756 * batch size (~32 entries, at present).
757 *
758 * If we're not using split pte locks, we needn't pin
759 * the PTE pages independently, because we're
760 * protected by the overall pagetable lock.
761 */
762 ptl = NULL;
763 if (level == PT_PTE)
764 ptl = xen_pte_lock(page, mm);
765
766 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
767 pfn_pte(pfn, PAGE_KERNEL_RO),
768 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
769
770 if (ptl) {
771 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
772
773 /* Queue a deferred unlock for when this batch
774 is completed. */
775 xen_mc_callback(xen_pte_unlock, ptl);
776 }
777 }
778
779 return flush;
780 }
781
782 /* This is called just after a mm has been created, but it has not
783 been used yet. We need to make sure that its pagetable is all
784 read-only, and can be pinned. */
785 static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
786 {
787 trace_xen_mmu_pgd_pin(mm, pgd);
788
789 xen_mc_batch();
790
791 if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
792 /* re-enable interrupts for flushing */
793 xen_mc_issue(0);
794
795 kmap_flush_unused();
796
797 xen_mc_batch();
798 }
799
800 #ifdef CONFIG_X86_64
801 {
802 pgd_t *user_pgd = xen_get_user_pgd(pgd);
803
804 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
805
806 if (user_pgd) {
807 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
808 xen_do_pin(MMUEXT_PIN_L4_TABLE,
809 PFN_DOWN(__pa(user_pgd)));
810 }
811 }
812 #else /* CONFIG_X86_32 */
813 #ifdef CONFIG_X86_PAE
814 /* Need to make sure unshared kernel PMD is pinnable */
815 xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
816 PT_PMD);
817 #endif
818 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
819 #endif /* CONFIG_X86_64 */
820 xen_mc_issue(0);
821 }
822
823 static void xen_pgd_pin(struct mm_struct *mm)
824 {
825 __xen_pgd_pin(mm, mm->pgd);
826 }
827
828 /*
829 * On save, we need to pin all pagetables to make sure they get their
830 * mfns turned into pfns. Search the list for any unpinned pgds and pin
831 * them (unpinned pgds are not currently in use, probably because the
832 * process is under construction or destruction).
833 *
834 * Expected to be called in stop_machine() ("equivalent to taking
835 * every spinlock in the system"), so the locking doesn't really
836 * matter all that much.
837 */
838 void xen_mm_pin_all(void)
839 {
840 struct page *page;
841
842 spin_lock(&pgd_lock);
843
844 list_for_each_entry(page, &pgd_list, lru) {
845 if (!PagePinned(page)) {
846 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
847 SetPageSavePinned(page);
848 }
849 }
850
851 spin_unlock(&pgd_lock);
852 }
853
854 /*
855 * The init_mm pagetable is really pinned as soon as it's created, but
856 * that's before we have page structures to store the bits. So do all
857 * the book-keeping now.
858 */
859 static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
860 enum pt_level level)
861 {
862 SetPagePinned(page);
863 return 0;
864 }
865
866 static void __init xen_mark_init_mm_pinned(void)
867 {
868 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
869 }
870
871 static int xen_unpin_page(struct mm_struct *mm, struct page *page,
872 enum pt_level level)
873 {
874 unsigned pgfl = TestClearPagePinned(page);
875
876 if (pgfl && !PageHighMem(page)) {
877 void *pt = lowmem_page_address(page);
878 unsigned long pfn = page_to_pfn(page);
879 spinlock_t *ptl = NULL;
880 struct multicall_space mcs;
881
882 /*
883 * Do the converse to pin_page. If we're using split
884 * pte locks, we must be holding the lock while
885 * the pte page is unpinned but still RO to prevent
886 * concurrent updates from seeing it in this
887 * partially-pinned state.
888 */
889 if (level == PT_PTE) {
890 ptl = xen_pte_lock(page, mm);
891
892 if (ptl)
893 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
894 }
895
896 mcs = __xen_mc_entry(0);
897
898 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
899 pfn_pte(pfn, PAGE_KERNEL),
900 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
901
902 if (ptl) {
903 /* unlock when batch completed */
904 xen_mc_callback(xen_pte_unlock, ptl);
905 }
906 }
907
908 return 0; /* never need to flush on unpin */
909 }
910
911 /* Release a pagetable's pages back as normal RW */
912 static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
913 {
914 trace_xen_mmu_pgd_unpin(mm, pgd);
915
916 xen_mc_batch();
917
918 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
919
920 #ifdef CONFIG_X86_64
921 {
922 pgd_t *user_pgd = xen_get_user_pgd(pgd);
923
924 if (user_pgd) {
925 xen_do_pin(MMUEXT_UNPIN_TABLE,
926 PFN_DOWN(__pa(user_pgd)));
927 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
928 }
929 }
930 #endif
931
932 #ifdef CONFIG_X86_PAE
933 /* Need to make sure unshared kernel PMD is unpinned */
934 xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
935 PT_PMD);
936 #endif
937
938 __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
939
940 xen_mc_issue(0);
941 }
942
943 static void xen_pgd_unpin(struct mm_struct *mm)
944 {
945 __xen_pgd_unpin(mm, mm->pgd);
946 }
947
948 /*
949 * On resume, undo any pinning done at save, so that the rest of the
950 * kernel doesn't see any unexpected pinned pagetables.
951 */
952 void xen_mm_unpin_all(void)
953 {
954 struct page *page;
955
956 spin_lock(&pgd_lock);
957
958 list_for_each_entry(page, &pgd_list, lru) {
959 if (PageSavePinned(page)) {
960 BUG_ON(!PagePinned(page));
961 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
962 ClearPageSavePinned(page);
963 }
964 }
965
966 spin_unlock(&pgd_lock);
967 }
968
969 static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
970 {
971 spin_lock(&next->page_table_lock);
972 xen_pgd_pin(next);
973 spin_unlock(&next->page_table_lock);
974 }
975
976 static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
977 {
978 spin_lock(&mm->page_table_lock);
979 xen_pgd_pin(mm);
980 spin_unlock(&mm->page_table_lock);
981 }
982
983
984 #ifdef CONFIG_SMP
985 /* Another cpu may still have its %cr3 pointing at the pagetable, so
986 we need to repoint it somewhere else before we can unpin it. */
987 static void drop_other_mm_ref(void *info)
988 {
989 struct mm_struct *mm = info;
990 struct mm_struct *active_mm;
991
992 active_mm = this_cpu_read(cpu_tlbstate.active_mm);
993
994 if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
995 leave_mm(smp_processor_id());
996
997 /* If this cpu still has a stale cr3 reference, then make sure
998 it has been flushed. */
999 if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
1000 load_cr3(swapper_pg_dir);
1001 }
1002
1003 static void xen_drop_mm_ref(struct mm_struct *mm)
1004 {
1005 cpumask_var_t mask;
1006 unsigned cpu;
1007
1008 if (current->active_mm == mm) {
1009 if (current->mm == mm)
1010 load_cr3(swapper_pg_dir);
1011 else
1012 leave_mm(smp_processor_id());
1013 }
1014
1015 /* Get the "official" set of cpus referring to our pagetable. */
1016 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1017 for_each_online_cpu(cpu) {
1018 if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1019 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1020 continue;
1021 smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1022 }
1023 return;
1024 }
1025 cpumask_copy(mask, mm_cpumask(mm));
1026
1027 /* It's possible that a vcpu may have a stale reference to our
1028 cr3, because it's in lazy mode, and it hasn't flushed
1029 its set of pending hypercalls yet. In this case, we can
1030 look at its actual current cr3 value, and force it to flush
1031 if needed. */
1032 for_each_online_cpu(cpu) {
1033 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1034 cpumask_set_cpu(cpu, mask);
1035 }
1036
1037 if (!cpumask_empty(mask))
1038 smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1039 free_cpumask_var(mask);
1040 }
1041 #else
1042 static void xen_drop_mm_ref(struct mm_struct *mm)
1043 {
1044 if (current->active_mm == mm)
1045 load_cr3(swapper_pg_dir);
1046 }
1047 #endif
1048
1049 /*
1050 * While a process runs, Xen pins its pagetables, which means that the
1051 * hypervisor forces them to be read-only, and it controls all updates
1052 * to them. This means that all pagetable updates have to go via the
1053 * hypervisor, which is moderately expensive.
1054 *
1055 * Since we're pulling the pagetable down, we switch to use init_mm,
1056 * unpin the old process's pagetable and mark it all read-write, which
1057 * allows further operations on it to be simple memory accesses.
1058 *
1059 * The only subtle point is that another CPU may be still using the
1060 * pagetable because of lazy tlb flushing. This means we need to
1061 * switch all CPUs off this pagetable before we can unpin it.
1062 */
1063 static void xen_exit_mmap(struct mm_struct *mm)
1064 {
1065 get_cpu(); /* make sure we don't move around */
1066 xen_drop_mm_ref(mm);
1067 put_cpu();
1068
1069 spin_lock(&mm->page_table_lock);
1070
1071 /* pgd may not be pinned in the error exit path of execve */
1072 if (xen_page_pinned(mm->pgd))
1073 xen_pgd_unpin(mm);
1074
1075 spin_unlock(&mm->page_table_lock);
1076 }
1077
1078 static void xen_post_allocator_init(void);
1079
1080 static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1081 {
1082 struct mmuext_op op;
1083
1084 op.cmd = cmd;
1085 op.arg1.mfn = pfn_to_mfn(pfn);
1086 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1087 BUG();
1088 }
1089
1090 #ifdef CONFIG_X86_64
1091 static void __init xen_cleanhighmap(unsigned long vaddr,
1092 unsigned long vaddr_end)
1093 {
1094 unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
1095 pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr);
1096
1097 /* NOTE: The loop is more greedy than the cleanup_highmap variant.
1098 * We include the PMD passed in on _both_ boundaries. */
1099 for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PTRS_PER_PMD));
1100 pmd++, vaddr += PMD_SIZE) {
1101 if (pmd_none(*pmd))
1102 continue;
1103 if (vaddr < (unsigned long) _text || vaddr > kernel_end)
1104 set_pmd(pmd, __pmd(0));
1105 }
1106 /* In case we did something silly, we should crash in this function
1107 * instead of somewhere later and be confusing. */
1108 xen_mc_flush();
1109 }
1110
1111 /*
1112 * Make a page range writeable and free it.
1113 */
1114 static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
1115 {
1116 void *vaddr = __va(paddr);
1117 void *vaddr_end = vaddr + size;
1118
1119 for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
1120 make_lowmem_page_readwrite(vaddr);
1121
1122 memblock_free(paddr, size);
1123 }
1124
1125 static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
1126 {
1127 unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK;
1128
1129 if (unpin)
1130 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa));
1131 ClearPagePinned(virt_to_page(__va(pa)));
1132 xen_free_ro_pages(pa, PAGE_SIZE);
1133 }
1134
1135 static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin)
1136 {
1137 unsigned long pa;
1138 pte_t *pte_tbl;
1139 int i;
1140
1141 if (pmd_large(*pmd)) {
1142 pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
1143 xen_free_ro_pages(pa, PMD_SIZE);
1144 return;
1145 }
1146
1147 pte_tbl = pte_offset_kernel(pmd, 0);
1148 for (i = 0; i < PTRS_PER_PTE; i++) {
1149 if (pte_none(pte_tbl[i]))
1150 continue;
1151 pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT;
1152 xen_free_ro_pages(pa, PAGE_SIZE);
1153 }
1154 set_pmd(pmd, __pmd(0));
1155 xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin);
1156 }
1157
1158 static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin)
1159 {
1160 unsigned long pa;
1161 pmd_t *pmd_tbl;
1162 int i;
1163
1164 if (pud_large(*pud)) {
1165 pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
1166 xen_free_ro_pages(pa, PUD_SIZE);
1167 return;
1168 }
1169
1170 pmd_tbl = pmd_offset(pud, 0);
1171 for (i = 0; i < PTRS_PER_PMD; i++) {
1172 if (pmd_none(pmd_tbl[i]))
1173 continue;
1174 xen_cleanmfnmap_pmd(pmd_tbl + i, unpin);
1175 }
1176 set_pud(pud, __pud(0));
1177 xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin);
1178 }
1179
1180 static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin)
1181 {
1182 unsigned long pa;
1183 pud_t *pud_tbl;
1184 int i;
1185
1186 if (p4d_large(*p4d)) {
1187 pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK;
1188 xen_free_ro_pages(pa, P4D_SIZE);
1189 return;
1190 }
1191
1192 pud_tbl = pud_offset(p4d, 0);
1193 for (i = 0; i < PTRS_PER_PUD; i++) {
1194 if (pud_none(pud_tbl[i]))
1195 continue;
1196 xen_cleanmfnmap_pud(pud_tbl + i, unpin);
1197 }
1198 set_p4d(p4d, __p4d(0));
1199 xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin);
1200 }
1201
1202 /*
1203 * Since it is well isolated we can (and since it is perhaps large we should)
1204 * also free the page tables mapping the initial P->M table.
1205 */
1206 static void __init xen_cleanmfnmap(unsigned long vaddr)
1207 {
1208 pgd_t *pgd;
1209 p4d_t *p4d;
1210 unsigned int i;
1211 bool unpin;
1212
1213 unpin = (vaddr == 2 * PGDIR_SIZE);
1214 vaddr &= PMD_MASK;
1215 pgd = pgd_offset_k(vaddr);
1216 p4d = p4d_offset(pgd, 0);
1217 for (i = 0; i < PTRS_PER_P4D; i++) {
1218 if (p4d_none(p4d[i]))
1219 continue;
1220 xen_cleanmfnmap_p4d(p4d + i, unpin);
1221 }
1222 if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
1223 set_pgd(pgd, __pgd(0));
1224 xen_cleanmfnmap_free_pgtbl(p4d, unpin);
1225 }
1226 }
1227
1228 static void __init xen_pagetable_p2m_free(void)
1229 {
1230 unsigned long size;
1231 unsigned long addr;
1232
1233 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
1234
1235 /* No memory or already called. */
1236 if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list)
1237 return;
1238
1239 /* using __ka address and sticking INVALID_P2M_ENTRY! */
1240 memset((void *)xen_start_info->mfn_list, 0xff, size);
1241
1242 addr = xen_start_info->mfn_list;
1243 /*
1244 * We could be in __ka space.
1245 * We round up to the PMD, which means that if anybody at this stage is
1246 * using the __ka address of xen_start_info or
1247 * xen_start_info->shared_info they are going to crash. Fortunately
1248 * we have already revectored in xen_setup_kernel_pagetable and in
1249 * xen_setup_shared_info.
1250 */
1251 size = roundup(size, PMD_SIZE);
1252
1253 if (addr >= __START_KERNEL_map) {
1254 xen_cleanhighmap(addr, addr + size);
1255 size = PAGE_ALIGN(xen_start_info->nr_pages *
1256 sizeof(unsigned long));
1257 memblock_free(__pa(addr), size);
1258 } else {
1259 xen_cleanmfnmap(addr);
1260 }
1261 }
1262
1263 static void __init xen_pagetable_cleanhighmap(void)
1264 {
1265 unsigned long size;
1266 unsigned long addr;
1267
1268 /* At this stage, cleanup_highmap has already cleaned __ka space
1269 * from _brk_limit way up to the max_pfn_mapped (which is the end of
1270 * the ramdisk). We continue on, erasing PMD entries that point to page
1271 * tables - do note that they are accessible at this stage via __va.
1272 * For good measure we also round up to the PMD - which means that if
1273 * anybody is using a __ka address for the initial boot-stack - and tries
1274 * to use it - they are going to crash. The xen_start_info has been
1275 * taken care of already in xen_setup_kernel_pagetable. */
1276 addr = xen_start_info->pt_base;
1277 size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE);
1278
1279 xen_cleanhighmap(addr, addr + size);
1280 xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base));
1281 #ifdef DEBUG
1282 /* This is superfluous and is not necessary, but you know what
1283 * let's do it. The MODULES_VADDR -> MODULES_END should be clear of
1284 * anything at this stage. */
1285 xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
1286 #endif
1287 }
1288 #endif
1289
1290 static void __init xen_pagetable_p2m_setup(void)
1291 {
1292 if (xen_feature(XENFEAT_auto_translated_physmap))
1293 return;
1294
1295 xen_vmalloc_p2m_tree();
1296
1297 #ifdef CONFIG_X86_64
1298 xen_pagetable_p2m_free();
1299
1300 xen_pagetable_cleanhighmap();
1301 #endif
1302 /* And revector! Bye bye old array */
1303 xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
1304 }
1305
1306 static void __init xen_pagetable_init(void)
1307 {
1308 paging_init();
1309 xen_post_allocator_init();
1310
1311 xen_pagetable_p2m_setup();
1312
1313 /* Allocate and initialize top and mid mfn levels for p2m structure */
1314 xen_build_mfn_list_list();
1315
1316 /* Remap memory freed due to conflicts with E820 map */
1317 if (!xen_feature(XENFEAT_auto_translated_physmap))
1318 xen_remap_memory();
1319
1320 xen_setup_shared_info();
1321 }
1322 static void xen_write_cr2(unsigned long cr2)
1323 {
1324 this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
1325 }
1326
1327 static unsigned long xen_read_cr2(void)
1328 {
1329 return this_cpu_read(xen_vcpu)->arch.cr2;
1330 }
1331
1332 unsigned long xen_read_cr2_direct(void)
1333 {
1334 return this_cpu_read(xen_vcpu_info.arch.cr2);
1335 }
1336
1337 static void xen_flush_tlb(void)
1338 {
1339 struct mmuext_op *op;
1340 struct multicall_space mcs;
1341
1342 trace_xen_mmu_flush_tlb(0);
1343
1344 preempt_disable();
1345
1346 mcs = xen_mc_entry(sizeof(*op));
1347
1348 op = mcs.args;
1349 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1350 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1351
1352 xen_mc_issue(PARAVIRT_LAZY_MMU);
1353
1354 preempt_enable();
1355 }
1356
1357 static void xen_flush_tlb_single(unsigned long addr)
1358 {
1359 struct mmuext_op *op;
1360 struct multicall_space mcs;
1361
1362 trace_xen_mmu_flush_tlb_single(addr);
1363
1364 preempt_disable();
1365
1366 mcs = xen_mc_entry(sizeof(*op));
1367 op = mcs.args;
1368 op->cmd = MMUEXT_INVLPG_LOCAL;
1369 op->arg1.linear_addr = addr & PAGE_MASK;
1370 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1371
1372 xen_mc_issue(PARAVIRT_LAZY_MMU);
1373
1374 preempt_enable();
1375 }
1376
1377 static void xen_flush_tlb_others(const struct cpumask *cpus,
1378 struct mm_struct *mm, unsigned long start,
1379 unsigned long end)
1380 {
1381 struct {
1382 struct mmuext_op op;
1383 #ifdef CONFIG_SMP
1384 DECLARE_BITMAP(mask, num_processors);
1385 #else
1386 DECLARE_BITMAP(mask, NR_CPUS);
1387 #endif
1388 } *args;
1389 struct multicall_space mcs;
1390
1391 trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
1392
1393 if (cpumask_empty(cpus))
1394 return; /* nothing to do */
1395
1396 mcs = xen_mc_entry(sizeof(*args));
1397 args = mcs.args;
1398 args->op.arg2.vcpumask = to_cpumask(args->mask);
1399
1400 /* Remove us, and any offline CPUs. */
1401 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1402 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1403
1404 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1405 if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
1406 args->op.cmd = MMUEXT_INVLPG_MULTI;
1407 args->op.arg1.linear_addr = start;
1408 }
1409
1410 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1411
1412 xen_mc_issue(PARAVIRT_LAZY_MMU);
1413 }
1414
1415 static unsigned long xen_read_cr3(void)
1416 {
1417 return this_cpu_read(xen_cr3);
1418 }
1419
1420 static void set_current_cr3(void *v)
1421 {
1422 this_cpu_write(xen_current_cr3, (unsigned long)v);
1423 }
1424
1425 static void __xen_write_cr3(bool kernel, unsigned long cr3)
1426 {
1427 struct mmuext_op op;
1428 unsigned long mfn;
1429
1430 trace_xen_mmu_write_cr3(kernel, cr3);
1431
1432 if (cr3)
1433 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1434 else
1435 mfn = 0;
1436
1437 WARN_ON(mfn == 0 && kernel);
1438
1439 op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1440 op.arg1.mfn = mfn;
1441
1442 xen_extend_mmuext_op(&op);
1443
1444 if (kernel) {
1445 this_cpu_write(xen_cr3, cr3);
1446
1447 /* Update xen_current_cr3 once the batch has actually
1448 been submitted. */
1449 xen_mc_callback(set_current_cr3, (void *)cr3);
1450 }
1451 }
1452 static void xen_write_cr3(unsigned long cr3)
1453 {
1454 BUG_ON(preemptible());
1455
1456 xen_mc_batch(); /* disables interrupts */
1457
1458 /* Update while interrupts are disabled, so it's atomic with
1459 respect to ipis */
1460 this_cpu_write(xen_cr3, cr3);
1461
1462 __xen_write_cr3(true, cr3);
1463
1464 #ifdef CONFIG_X86_64
1465 {
1466 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1467 if (user_pgd)
1468 __xen_write_cr3(false, __pa(user_pgd));
1469 else
1470 __xen_write_cr3(false, 0);
1471 }
1472 #endif
1473
1474 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
1475 }
1476
1477 #ifdef CONFIG_X86_64
1478 /*
1479 * At the start of the day - when Xen launches a guest, it has already
1480 * built pagetables for the guest. We diligently look over them
1481 * in xen_setup_kernel_pagetable and graft them as appropriate into the
1482 * init_level4_pgt and its friends. Then when we are happy we load
1483 * the new init_level4_pgt - and continue on.
1484 *
1485 * The generic code starts (start_kernel) and 'init_mem_mapping' sets
1486 * up the rest of the pagetables. When it has completed it loads the cr3.
1487 * N.B. that baremetal would start at 'start_kernel' (and the early
1488 * #PF handler would create bootstrap pagetables) - so we are running
1489 * with the same assumptions as what to do when write_cr3 is executed
1490 * at this point.
1491 *
1492 * Since there are no user-page tables at all, we have two variants
1493 * of xen_write_cr3 - the early bootup (this one), and the late one
1494 * (xen_write_cr3). The reason we have to do that is that in 64-bit
1495 * the Linux kernel and user-space are both in ring 3 while the
1496 * hypervisor is in ring 0.
1497 */
1498 static void __init xen_write_cr3_init(unsigned long cr3)
1499 {
1500 BUG_ON(preemptible());
1501
1502 xen_mc_batch(); /* disables interrupts */
1503
1504 /* Update while interrupts are disabled, so it's atomic with
1505 respect to ipis */
1506 this_cpu_write(xen_cr3, cr3);
1507
1508 __xen_write_cr3(true, cr3);
1509
1510 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
1511 }
1512 #endif
1513
1514 static int xen_pgd_alloc(struct mm_struct *mm)
1515 {
1516 pgd_t *pgd = mm->pgd;
1517 int ret = 0;
1518
1519 BUG_ON(PagePinned(virt_to_page(pgd)));
1520
1521 #ifdef CONFIG_X86_64
1522 {
1523 struct page *page = virt_to_page(pgd);
1524 pgd_t *user_pgd;
1525
1526 BUG_ON(page->private != 0);
1527
1528 ret = -ENOMEM;
1529
1530 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1531 page->private = (unsigned long)user_pgd;
1532
1533 if (user_pgd != NULL) {
1534 #ifdef CONFIG_X86_VSYSCALL_EMULATION
1535 user_pgd[pgd_index(VSYSCALL_ADDR)] =
1536 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1537 #endif
1538 ret = 0;
1539 }
1540
1541 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1542 }
1543 #endif
1544 return ret;
1545 }
1546
1547 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1548 {
1549 #ifdef CONFIG_X86_64
1550 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1551
1552 if (user_pgd)
1553 free_page((unsigned long)user_pgd);
1554 #endif
1555 }
1556
1557 /*
1558 * Init-time set_pte while constructing initial pagetables, which
1559 * doesn't allow RO page table pages to be remapped RW.
1560 *
1561 * If there is no MFN for this PFN then this page is initially
1562 * ballooned out so clear the PTE (as in decrease_reservation() in
1563 * drivers/xen/balloon.c).
1564 *
1565 * Many of these PTE updates are done on unpinned and writable pages
1566 * and doing a hypercall for these is unnecessary and expensive. At
1567 * this point it is not possible to tell if a page is pinned or not,
1568 * so always write the PTE directly and rely on Xen trapping and
1569 * emulating any updates as necessary.
1570 */
1571 __visible pte_t xen_make_pte_init(pteval_t pte)
1572 {
1573 #ifdef CONFIG_X86_64
1574 unsigned long pfn;
1575
1576 /*
1577 * Pages belonging to the initial p2m list mapped outside the default
1578 * address range must be mapped read-only. This region contains the
1579 * page tables for mapping the p2m list, too, and page tables MUST be
1580 * mapped read-only.
1581 */
1582 pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT;
1583 if (xen_start_info->mfn_list < __START_KERNEL_map &&
1584 pfn >= xen_start_info->first_p2m_pfn &&
1585 pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
1586 pte &= ~_PAGE_RW;
1587 #endif
1588 pte = pte_pfn_to_mfn(pte);
1589 return native_make_pte(pte);
1590 }
1591 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init);
1592
1593 static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
1594 {
1595 #ifdef CONFIG_X86_32
1596 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1597 if (pte_mfn(pte) != INVALID_P2M_ENTRY
1598 && pte_val_ma(*ptep) & _PAGE_PRESENT)
1599 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1600 pte_val_ma(pte));
1601 #endif
1602 native_set_pte(ptep, pte);
1603 }
1604
1605 /* Early in boot, while setting up the initial pagetable, assume
1606 everything is pinned. */
1607 static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1608 {
1609 #ifdef CONFIG_FLATMEM
1610 BUG_ON(mem_map); /* should only be used early */
1611 #endif
1612 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1613 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1614 }
1615
1616 /* Used for pmd and pud */
1617 static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1618 {
1619 #ifdef CONFIG_FLATMEM
1620 BUG_ON(mem_map); /* should only be used early */
1621 #endif
1622 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1623 }
1624
1625 /* Early release_pte assumes that all pts are pinned, since there's
1626 only init_mm and anything attached to that is pinned. */
1627 static void __init xen_release_pte_init(unsigned long pfn)
1628 {
1629 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1630 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1631 }
1632
1633 static void __init xen_release_pmd_init(unsigned long pfn)
1634 {
1635 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1636 }
1637
1638 static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1639 {
1640 struct multicall_space mcs;
1641 struct mmuext_op *op;
1642
1643 mcs = __xen_mc_entry(sizeof(*op));
1644 op = mcs.args;
1645 op->cmd = cmd;
1646 op->arg1.mfn = pfn_to_mfn(pfn);
1647
1648 MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
1649 }
1650
1651 static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
1652 {
1653 struct multicall_space mcs;
1654 unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT);
1655
1656 mcs = __xen_mc_entry(0);
1657 MULTI_update_va_mapping(mcs.mc, (unsigned long)addr,
1658 pfn_pte(pfn, prot), 0);
1659 }
1660
1661 /* This needs to make sure the new pte page is pinned iff it's being
1662 attached to a pinned pagetable. */
1663 static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
1664 unsigned level)
1665 {
1666 bool pinned = PagePinned(virt_to_page(mm->pgd));
1667
1668 trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
1669
1670 if (pinned) {
1671 struct page *page = pfn_to_page(pfn);
1672
1673 SetPagePinned(page);
1674
1675 if (!PageHighMem(page)) {
1676 xen_mc_batch();
1677
1678 __set_pfn_prot(pfn, PAGE_KERNEL_RO);
1679
1680 if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
1681 __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1682
1683 xen_mc_issue(PARAVIRT_LAZY_MMU);
1684 } else {
1685 /* make sure there are no stray mappings of
1686 this page */
1687 kmap_flush_unused();
1688 }
1689 }
1690 }
1691
1692 static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1693 {
1694 xen_alloc_ptpage(mm, pfn, PT_PTE);
1695 }
1696
1697 static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1698 {
1699 xen_alloc_ptpage(mm, pfn, PT_PMD);
1700 }
1701
1702 /* This should never happen until we're OK to use struct page */
1703 static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
1704 {
1705 struct page *page = pfn_to_page(pfn);
1706 bool pinned = PagePinned(page);
1707
1708 trace_xen_mmu_release_ptpage(pfn, level, pinned);
1709
1710 if (pinned) {
1711 if (!PageHighMem(page)) {
1712 xen_mc_batch();
1713
1714 if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
1715 __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1716
1717 __set_pfn_prot(pfn, PAGE_KERNEL);
1718
1719 xen_mc_issue(PARAVIRT_LAZY_MMU);
1720 }
1721 ClearPagePinned(page);
1722 }
1723 }
1724
1725 static void xen_release_pte(unsigned long pfn)
1726 {
1727 xen_release_ptpage(pfn, PT_PTE);
1728 }
1729
1730 static void xen_release_pmd(unsigned long pfn)
1731 {
1732 xen_release_ptpage(pfn, PT_PMD);
1733 }
1734
1735 #if CONFIG_PGTABLE_LEVELS >= 4
1736 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1737 {
1738 xen_alloc_ptpage(mm, pfn, PT_PUD);
1739 }
1740
1741 static void xen_release_pud(unsigned long pfn)
1742 {
1743 xen_release_ptpage(pfn, PT_PUD);
1744 }
1745 #endif
1746
1747 void __init xen_reserve_top(void)
1748 {
1749 #ifdef CONFIG_X86_32
1750 unsigned long top = HYPERVISOR_VIRT_START;
1751 struct xen_platform_parameters pp;
1752
1753 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1754 top = pp.virt_start;
1755
1756 reserve_top_address(-top);
1757 #endif /* CONFIG_X86_32 */
1758 }
1759
1760 /*
1761 * Like __va(), but returns address in the kernel mapping (which is
1762 * all we have until the physical memory mapping has been set up).
1763 */
1764 static void * __init __ka(phys_addr_t paddr)
1765 {
1766 #ifdef CONFIG_X86_64
1767 return (void *)(paddr + __START_KERNEL_map);
1768 #else
1769 return __va(paddr);
1770 #endif
1771 }
1772
1773 /* Convert a machine address to physical address */
1774 static unsigned long __init m2p(phys_addr_t maddr)
1775 {
1776 phys_addr_t paddr;
1777
1778 maddr &= PTE_PFN_MASK;
1779 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1780
1781 return paddr;
1782 }
1783
1784 /* Convert a machine address to kernel virtual */
1785 static void * __init m2v(phys_addr_t maddr)
1786 {
1787 return __ka(m2p(maddr));
1788 }
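
/*
 * A minimal sketch (this is how xen_setup_kernel_pagetable() below walks
 * the hypervisor-provided pagetable, whose entries hold machine addresses):
 *
 *	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
 *	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
 */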
1789
1790 /* Set the page permissions on identity-mapped pages */
1791 static void __init set_page_prot_flags(void *addr, pgprot_t prot,
1792 unsigned long flags)
1793 {
1794 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1795 pte_t pte = pfn_pte(pfn, prot);
1796
1797 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
1798 BUG();
1799 }
1800 static void __init set_page_prot(void *addr, pgprot_t prot)
1801 {
1802 return set_page_prot_flags(addr, prot, UVMF_NONE);
1803 }
1804 #ifdef CONFIG_X86_32
1805 static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1806 {
1807 unsigned pmdidx, pteidx;
1808 unsigned ident_pte;
1809 unsigned long pfn;
1810
1811 level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1812 PAGE_SIZE);
1813
1814 ident_pte = 0;
1815 pfn = 0;
1816 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1817 pte_t *pte_page;
1818
1819 /* Reuse or allocate a page of ptes */
1820 if (pmd_present(pmd[pmdidx]))
1821 pte_page = m2v(pmd[pmdidx].pmd);
1822 else {
1823 /* Check for free pte pages */
1824 if (ident_pte == LEVEL1_IDENT_ENTRIES)
1825 break;
1826
1827 pte_page = &level1_ident_pgt[ident_pte];
1828 ident_pte += PTRS_PER_PTE;
1829
1830 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1831 }
1832
1833 /* Install mappings */
1834 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1835 pte_t pte;
1836
1837 if (pfn > max_pfn_mapped)
1838 max_pfn_mapped = pfn;
1839
1840 if (!pte_none(pte_page[pteidx]))
1841 continue;
1842
1843 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1844 pte_page[pteidx] = pte;
1845 }
1846 }
1847
1848 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1849 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1850
1851 set_page_prot(pmd, PAGE_KERNEL_RO);
1852 }
1853 #endif
1854 void __init xen_setup_machphys_mapping(void)
1855 {
1856 struct xen_machphys_mapping mapping;
1857
1858 if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
1859 machine_to_phys_mapping = (unsigned long *)mapping.v_start;
1860 machine_to_phys_nr = mapping.max_mfn + 1;
1861 } else {
1862 machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
1863 }
1864 #ifdef CONFIG_X86_32
1865 WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1))
1866 < machine_to_phys_mapping);
1867 #endif
1868 }
1869
1870 #ifdef CONFIG_X86_64
1871 static void __init convert_pfn_mfn(void *v)
1872 {
1873 pte_t *pte = v;
1874 int i;
1875
1876 /* All levels are converted the same way, so just treat them
1877 as ptes. */
1878 for (i = 0; i < PTRS_PER_PTE; i++)
1879 pte[i] = xen_make_pte(pte[i].pte);
1880 }
1881 static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
1882 unsigned long addr)
1883 {
1884 if (*pt_base == PFN_DOWN(__pa(addr))) {
1885 set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
1886 clear_page((void *)addr);
1887 (*pt_base)++;
1888 }
1889 if (*pt_end == PFN_DOWN(__pa(addr))) {
1890 set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
1891 clear_page((void *)addr);
1892 (*pt_end)--;
1893 }
1894 }
1895 /*
1896 * Set up the initial kernel pagetable.
1897 *
1898 * We can construct this by grafting the Xen-provided pagetable into
1899 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1900 * level2_ident_pgt, and level2_kernel_pgt. This means that only the
1901 * kernel has a physical mapping to start with - but that's enough to
1902 * get __va working. We need to fill in the rest of the physical
1903 * mapping once some sort of allocator has been set up.
1904 */
1905 void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1906 {
1907 pud_t *l3;
1908 pmd_t *l2;
1909 unsigned long addr[3];
1910 unsigned long pt_base, pt_end;
1911 unsigned i;
1912
1913 /* max_pfn_mapped is the last pfn mapped in the initial memory
1914 * mappings. Considering that on Xen after the kernel mappings we
1915 * have the mappings of some pages that don't exist in pfn space, we
1916 * set max_pfn_mapped to the last real pfn mapped. */
1917 if (xen_start_info->mfn_list < __START_KERNEL_map)
1918 max_pfn_mapped = xen_start_info->first_p2m_pfn;
1919 else
1920 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1921
1922 pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
1923 pt_end = pt_base + xen_start_info->nr_pt_frames;
1924
1925 /* Zap identity mapping */
1926 init_level4_pgt[0] = __pgd(0);
1927
1928 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1929 /* Pre-constructed entries are in pfn, so convert to mfn */
1930 /* L4[272] -> level3_ident_pgt
1931 * L4[511] -> level3_kernel_pgt */
1932 convert_pfn_mfn(init_level4_pgt);
1933
1934 /* L3_i[0] -> level2_ident_pgt */
1935 convert_pfn_mfn(level3_ident_pgt);
1936 /* L3_k[510] -> level2_kernel_pgt
1937 * L3_k[511] -> level2_fixmap_pgt */
1938 convert_pfn_mfn(level3_kernel_pgt);
1939
1940 /* L3_k[511][506] -> level1_fixmap_pgt */
1941 convert_pfn_mfn(level2_fixmap_pgt);
1942 }
1943 /* We get [511][511] and have Xen's version of level2_kernel_pgt */
1944 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1945 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1946
1947 addr[0] = (unsigned long)pgd;
1948 addr[1] = (unsigned long)l3;
1949 addr[2] = (unsigned long)l2;
1950 /* Graft it onto L4[272][0]. Note that we are creating an aliasing problem:
1951 * Both L4[272][0] and L4[511][510] have entries that point to the same
1952 * L2 (PMD) tables. Meaning that if you modify it in __va space
1953 * it will also be modified in the __ka space! (But if you just
1954 * modify the PMD table to point to other PTE's or none, then you
1955 * are OK - which is what cleanup_highmap does) */
1956 copy_page(level2_ident_pgt, l2);
1957 /* Graft it onto L4[511][510] */
1958 copy_page(level2_kernel_pgt, l2);
1959
1960 /* Copy the initial P->M table mappings if necessary. */
1961 i = pgd_index(xen_start_info->mfn_list);
1962 if (i && i < pgd_index(__START_KERNEL_map))
1963 init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
1964
1965 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1966 /* Make pagetable pieces RO */
1967 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1968 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1969 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1970 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1971 set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
1972 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1973 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1974 set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO);
1975
1976 /* Pin down new L4 */
1977 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1978 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1979
1980 /* Unpin Xen-provided one */
1981 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1982
1983 /*
1984 * At this stage there can be no user pgd, and no page
1985 * structure to attach it to, so make sure we just set kernel
1986 * pgd.
1987 */
1988 xen_mc_batch();
1989 __xen_write_cr3(true, __pa(init_level4_pgt));
1990 xen_mc_issue(PARAVIRT_LAZY_CPU);
1991 } else
1992 native_write_cr3(__pa(init_level4_pgt));
1993
1994 /* We can't easily rip out the L3 and L2 pages, as the Xen pagetables are
1995 * laid out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for
1996 * the initial domain. For guests built by the toolstack they are in
1997 * [L4], [L3], [L2], [L1], [L1] ... order. So for dom0 we can only
1998 * rip out the [L4] (pgd), but for other guests we shave off three pages.
1999 */
2000 for (i = 0; i < ARRAY_SIZE(addr); i++)
2001 check_pt_base(&pt_base, &pt_end, addr[i]);
2002
2003 /* The (up to three pages smaller) Xen pagetable that we are now using */
2004 xen_pt_base = PFN_PHYS(pt_base);
2005 xen_pt_size = (pt_end - pt_base) * PAGE_SIZE;
2006 memblock_reserve(xen_pt_base, xen_pt_size);
2007
2008 /* Revector the xen_start_info */
2009 xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
2010 }
2011
2012 /*
2013 * Read a value from a physical address.
2014 */
2015 static unsigned long __init xen_read_phys_ulong(phys_addr_t addr)
2016 {
2017 unsigned long *vaddr;
2018 unsigned long val;
2019
2020 vaddr = early_memremap_ro(addr, sizeof(val));
2021 val = *vaddr;
2022 early_memunmap(vaddr, sizeof(val));
2023 return val;
2024 }
2025
2026 /*
2027 * Translate a virtual address to a physical one without relying on mapped
2028 * page tables. Don't rely on big pages being aligned in (guest) physical
2029 * space!
2030 */
2031 static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
2032 {
2033 phys_addr_t pa;
2034 pgd_t pgd;
2035 pud_t pud;
2036 pmd_t pmd;
2037 pte_t pte;
2038
2039 pa = read_cr3();
2040 pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
2041 sizeof(pgd)));
2042 if (!pgd_present(pgd))
2043 return 0;
2044
2045 pa = pgd_val(pgd) & PTE_PFN_MASK;
2046 pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) *
2047 sizeof(pud)));
2048 if (!pud_present(pud))
2049 return 0;
2050 pa = pud_val(pud) & PTE_PFN_MASK;
2051 if (pud_large(pud))
2052 return pa + (vaddr & ~PUD_MASK);
2053
2054 pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
2055 sizeof(pmd)));
2056 if (!pmd_present(pmd))
2057 return 0;
2058 pa = pmd_val(pmd) & PTE_PFN_MASK;
2059 if (pmd_large(pmd))
2060 return pa + (vaddr & ~PMD_MASK);
2061
2062 pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
2063 sizeof(pte)));
2064 if (!pte_present(pte))
2065 return 0;
2066 pa = pte_pfn(pte) << PAGE_SHIFT;
2067
2068 return pa | (vaddr & ~PAGE_MASK);
2069 }
2070
2071 /*
2072 * Find a new area for the hypervisor supplied p2m list and relocate the p2m to
2073 * this area.
2074 */
2075 void __init xen_relocate_p2m(void)
2076 {
2077 phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys;
2078 unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
2079 int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d;
2080 pte_t *pt;
2081 pmd_t *pmd;
2082 pud_t *pud;
2083 p4d_t *p4d = NULL;
2084 pgd_t *pgd;
2085 unsigned long *new_p2m;
2086 int save_pud;
2087
2088 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
2089 n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
2090 n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
2091 n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
2092 n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT;
2093 if (PTRS_PER_P4D > 1)
2094 n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
2095 else
2096 n_p4d = 0;
2097 n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d;
2098
2099 new_area = xen_find_free_area(PFN_PHYS(n_frames));
2100 if (!new_area) {
2101 xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n");
2102 BUG();
2103 }
2104
2105 /*
2106 * Setup the page tables for addressing the new p2m list.
2107 * We have asked the hypervisor to map the p2m list at the user address
2108 * PUD_SIZE. It may have done so, or it may have used a kernel space
2109 * address depending on the Xen version.
2110 * To avoid any possible virtual address collision, just use
2111 * 2 * PGDIR_SIZE for the new area.
2112 */
2113 p4d_phys = new_area;
2114 pud_phys = p4d_phys + PFN_PHYS(n_p4d);
2115 pmd_phys = pud_phys + PFN_PHYS(n_pud);
2116 pt_phys = pmd_phys + PFN_PHYS(n_pmd);
2117 p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
2118
2119 pgd = __va(read_cr3());
2120 new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
2121 idx_p4d = 0;
2122 save_pud = n_pud;
2123 do {
2124 if (n_p4d > 0) {
2125 p4d = early_memremap(p4d_phys, PAGE_SIZE);
2126 clear_page(p4d);
2127 n_pud = min(save_pud, PTRS_PER_P4D);
2128 }
2129 for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
2130 pud = early_memremap(pud_phys, PAGE_SIZE);
2131 clear_page(pud);
2132 for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
2133 idx_pmd++) {
2134 pmd = early_memremap(pmd_phys, PAGE_SIZE);
2135 clear_page(pmd);
2136 for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
2137 idx_pt++) {
2138 pt = early_memremap(pt_phys, PAGE_SIZE);
2139 clear_page(pt);
2140 for (idx_pte = 0;
2141 idx_pte < min(n_pte, PTRS_PER_PTE);
2142 idx_pte++) {
2143 set_pte(pt + idx_pte,
2144 pfn_pte(p2m_pfn, PAGE_KERNEL));
2145 p2m_pfn++;
2146 }
2147 n_pte -= PTRS_PER_PTE;
2148 early_memunmap(pt, PAGE_SIZE);
2149 make_lowmem_page_readonly(__va(pt_phys));
2150 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
2151 PFN_DOWN(pt_phys));
2152 set_pmd(pmd + idx_pt,
2153 __pmd(_PAGE_TABLE | pt_phys));
2154 pt_phys += PAGE_SIZE;
2155 }
2156 n_pt -= PTRS_PER_PMD;
2157 early_memunmap(pmd, PAGE_SIZE);
2158 make_lowmem_page_readonly(__va(pmd_phys));
2159 pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
2160 PFN_DOWN(pmd_phys));
2161 set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
2162 pmd_phys += PAGE_SIZE;
2163 }
2164 n_pmd -= PTRS_PER_PUD;
2165 early_memunmap(pud, PAGE_SIZE);
2166 make_lowmem_page_readonly(__va(pud_phys));
2167 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
2168 if (n_p4d > 0)
2169 set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys));
2170 else
2171 set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
2172 pud_phys += PAGE_SIZE;
2173 }
2174 if (n_p4d > 0) {
2175 save_pud -= PTRS_PER_P4D;
2176 early_memunmap(p4d, PAGE_SIZE);
2177 make_lowmem_page_readonly(__va(p4d_phys));
2178 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys));
2179 set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys));
2180 p4d_phys += PAGE_SIZE;
2181 }
2182 } while (++idx_p4d < n_p4d);
2183
2184 /* Now copy the old p2m info to the new area. */
2185 memcpy(new_p2m, xen_p2m_addr, size);
2186 xen_p2m_addr = new_p2m;
2187
2188 /* Release the old p2m list and set new list info. */
2189 p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list));
2190 BUG_ON(!p2m_pfn);
2191 p2m_pfn_end = p2m_pfn + PFN_DOWN(size);
2192
2193 if (xen_start_info->mfn_list < __START_KERNEL_map) {
2194 pfn = xen_start_info->first_p2m_pfn;
2195 pfn_end = xen_start_info->first_p2m_pfn +
2196 xen_start_info->nr_p2m_frames;
2197 set_pgd(pgd + 1, __pgd(0));
2198 } else {
2199 pfn = p2m_pfn;
2200 pfn_end = p2m_pfn_end;
2201 }
2202
2203 memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
2204 while (pfn < pfn_end) {
2205 if (pfn == p2m_pfn) {
2206 pfn = p2m_pfn_end;
2207 continue;
2208 }
2209 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
2210 pfn++;
2211 }
2212
2213 xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
2214 xen_start_info->first_p2m_pfn = PFN_DOWN(new_area);
2215 xen_start_info->nr_p2m_frames = n_frames;
2216 }
2217
2218 #else /* !CONFIG_X86_64 */
2219 static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
2220 static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
2221
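/*
 * Early write_cr3 used for the one-time switch from initial_page_table
 * to swapper_pg_dir; afterwards the regular xen_write_cr3 takes over.
 */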
2222 static void __init xen_write_cr3_init(unsigned long cr3)
2223 {
2224 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
2225
2226 BUG_ON(read_cr3() != __pa(initial_page_table));
2227 BUG_ON(cr3 != __pa(swapper_pg_dir));
2228
2229 /*
2230 * We are switching to swapper_pg_dir for the first time (from
2231 * initial_page_table) and therefore need to mark that page
2232 * read-only and then pin it.
2233 *
2234 * Xen disallows sharing of kernel PMDs for PAE
2235 * guests. Therefore we must copy the kernel PMD from
2236 * initial_page_table into a new kernel PMD to be used in
2237 * swapper_pg_dir.
2238 */
2239 swapper_kernel_pmd =
2240 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
2241 copy_page(swapper_kernel_pmd, initial_kernel_pmd);
2242 swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
2243 __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
2244 set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
2245
2246 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
2247 xen_write_cr3(cr3);
2248 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
2249
2250 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
2251 PFN_DOWN(__pa(initial_page_table)));
2252 set_page_prot(initial_page_table, PAGE_KERNEL);
2253 set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
2254
2255 pv_mmu_ops.write_cr3 = &xen_write_cr3;
2256 }
2257
2258 /*
2259 * For 32 bit domains xen_start_info->pt_base is the pgd address, which might
2260 * not be the first page table in the page table pool.
2261 * Iterate through the initial page tables to find the real page table base.
2262 */
2263 static phys_addr_t xen_find_pt_base(pmd_t *pmd)
2264 {
2265 phys_addr_t pt_base, paddr;
2266 unsigned pmdidx;
2267
2268 pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd));
2269
2270 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++)
2271 if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) {
2272 paddr = m2p(pmd[pmdidx].pmd);
2273 pt_base = min(pt_base, paddr);
2274 }
2275
2276 return pt_base;
2277 }
2278
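/*
 * Build the initial 32 bit kernel pagetable: copy the Xen-provided
 * kernel PMD and pgd into initial_kernel_pmd/initial_page_table, add
 * the early identity mapping and switch %cr3 to the new pagetable.
 */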
2279 void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
2280 {
2281 pmd_t *kernel_pmd;
2282
2283 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
2284
2285 xen_pt_base = xen_find_pt_base(kernel_pmd);
2286 xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE;
2287
2288 initial_kernel_pmd =
2289 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
2290
2291 max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024);
2292
2293 copy_page(initial_kernel_pmd, kernel_pmd);
2294
2295 xen_map_identity_early(initial_kernel_pmd, max_pfn);
2296
2297 copy_page(initial_page_table, pgd);
2298 initial_page_table[KERNEL_PGD_BOUNDARY] =
2299 __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
2300
2301 set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
2302 set_page_prot(initial_page_table, PAGE_KERNEL_RO);
2303 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
2304
2305 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
2306
2307 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
2308 PFN_DOWN(__pa(initial_page_table)));
2309 xen_write_cr3(__pa(initial_page_table));
2310
2311 memblock_reserve(xen_pt_base, xen_pt_size);
2312 }
2313 #endif /* CONFIG_X86_64 */
2314
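/*
 * Reserve the pages backing the start_info structure, the Xenstore ring
 * and (for unprivileged domains) the console ring, so they are not
 * handed over to the page allocator.
 */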
2315 void __init xen_reserve_special_pages(void)
2316 {
2317 phys_addr_t paddr;
2318
2319 memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
2320 if (xen_start_info->store_mfn) {
2321 paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn));
2322 memblock_reserve(paddr, PAGE_SIZE);
2323 }
2324 if (!xen_initial_domain()) {
2325 paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn));
2326 memblock_reserve(paddr, PAGE_SIZE);
2327 }
2328 }
2329
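/* Bail out early if the Xen-provided page tables overlap an E820 reserved region. */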
2330 void __init xen_pt_check_e820(void)
2331 {
2332 if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) {
2333 xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n");
2334 BUG();
2335 }
2336 }
2337
2338 static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
2339
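/*
 * Install a fixmap entry. Ordinary slots map guest pages (pfn_pte),
 * hardware mappings use machine frames (mfn_pte), and the local and
 * IO APIC slots are pointed at a dummy page since all access to them
 * goes via hypercalls.
 */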
2340 static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
2341 {
2342 pte_t pte;
2343
2344 phys >>= PAGE_SHIFT;
2345
2346 switch (idx) {
2347 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
2348 case FIX_RO_IDT:
2349 #ifdef CONFIG_X86_32
2350 case FIX_WP_TEST:
2351 # ifdef CONFIG_HIGHMEM
2352 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
2353 # endif
2354 #elif defined(CONFIG_X86_VSYSCALL_EMULATION)
2355 case VSYSCALL_PAGE:
2356 #endif
2357 case FIX_TEXT_POKE0:
2358 case FIX_TEXT_POKE1:
2359 case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END:
2360 /* All local page mappings */
2361 pte = pfn_pte(phys, prot);
2362 break;
2363
2364 #ifdef CONFIG_X86_LOCAL_APIC
2365 case FIX_APIC_BASE: /* maps dummy local APIC */
2366 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
2367 break;
2368 #endif
2369
2370 #ifdef CONFIG_X86_IO_APIC
2371 case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
2372 /*
2373 * We just don't map the IO APIC - all access is via
2374 * hypercalls. Keep the address in the pte for reference.
2375 */
2376 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
2377 break;
2378 #endif
2379
2380 case FIX_PARAVIRT_BOOTMAP:
2381 /* This is an MFN, but it isn't an IO mapping from the
2382 IO domain */
2383 pte = mfn_pte(phys, prot);
2384 break;
2385
2386 default:
2387 /* By default, set_fixmap is used for hardware mappings */
2388 pte = mfn_pte(phys, prot);
2389 break;
2390 }
2391
2392 __native_set_fixmap(idx, pte);
2393
2394 #ifdef CONFIG_X86_VSYSCALL_EMULATION
2395 /* Replicate changes to map the vsyscall page into the user
2396 pagetable vsyscall mapping. */
2397 if (idx == VSYSCALL_PAGE) {
2398 unsigned long vaddr = __fix_to_virt(idx);
2399 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
2400 }
2401 #endif
2402 }
2403
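/*
 * Switch from the restricted boot-time mmu operations to the final
 * ones once the kernel memory allocator is available.
 */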
2404 static void __init xen_post_allocator_init(void)
2405 {
2406 if (xen_feature(XENFEAT_auto_translated_physmap))
2407 return;
2408
2409 pv_mmu_ops.set_pte = xen_set_pte;
2410 pv_mmu_ops.set_pmd = xen_set_pmd;
2411 pv_mmu_ops.set_pud = xen_set_pud;
2412 #if CONFIG_PGTABLE_LEVELS >= 4
2413 pv_mmu_ops.set_p4d = xen_set_p4d;
2414 #endif
2415
2416 /* This will work as long as patching hasn't happened yet
2417 (which it hasn't) */
2418 pv_mmu_ops.alloc_pte = xen_alloc_pte;
2419 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
2420 pv_mmu_ops.release_pte = xen_release_pte;
2421 pv_mmu_ops.release_pmd = xen_release_pmd;
2422 #if CONFIG_PGTABLE_LEVELS >= 4
2423 pv_mmu_ops.alloc_pud = xen_alloc_pud;
2424 pv_mmu_ops.release_pud = xen_release_pud;
2425 #endif
2426 pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte);
2427
2428 #ifdef CONFIG_X86_64
2429 pv_mmu_ops.write_cr3 = &xen_write_cr3;
2430 SetPagePinned(virt_to_page(level3_user_vsyscall));
2431 #endif
2432 xen_mark_init_mm_pinned();
2433 }
2434
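/* Flush any batched hypercalls before leaving lazy MMU mode. */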
2435 static void xen_leave_lazy_mmu(void)
2436 {
2437 preempt_disable();
2438 xen_mc_flush();
2439 paravirt_leave_lazy_mmu();
2440 preempt_enable();
2441 }
2442
2443 static const struct pv_mmu_ops xen_mmu_ops __initconst = {
2444 .read_cr2 = xen_read_cr2,
2445 .write_cr2 = xen_write_cr2,
2446
2447 .read_cr3 = xen_read_cr3,
2448 .write_cr3 = xen_write_cr3_init,
2449
2450 .flush_tlb_user = xen_flush_tlb,
2451 .flush_tlb_kernel = xen_flush_tlb,
2452 .flush_tlb_single = xen_flush_tlb_single,
2453 .flush_tlb_others = xen_flush_tlb_others,
2454
2455 .pte_update = paravirt_nop,
2456
2457 .pgd_alloc = xen_pgd_alloc,
2458 .pgd_free = xen_pgd_free,
2459
2460 .alloc_pte = xen_alloc_pte_init,
2461 .release_pte = xen_release_pte_init,
2462 .alloc_pmd = xen_alloc_pmd_init,
2463 .release_pmd = xen_release_pmd_init,
2464
2465 .set_pte = xen_set_pte_init,
2466 .set_pte_at = xen_set_pte_at,
2467 .set_pmd = xen_set_pmd_hyper,
2468
2469 .ptep_modify_prot_start = __ptep_modify_prot_start,
2470 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
2471
2472 .pte_val = PV_CALLEE_SAVE(xen_pte_val),
2473 .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
2474
2475 .make_pte = PV_CALLEE_SAVE(xen_make_pte_init),
2476 .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
2477
2478 #ifdef CONFIG_X86_PAE
2479 .set_pte_atomic = xen_set_pte_atomic,
2480 .pte_clear = xen_pte_clear,
2481 .pmd_clear = xen_pmd_clear,
2482 #endif /* CONFIG_X86_PAE */
2483 .set_pud = xen_set_pud_hyper,
2484
2485 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2486 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
2487
2488 #if CONFIG_PGTABLE_LEVELS >= 4
2489 .pud_val = PV_CALLEE_SAVE(xen_pud_val),
2490 .make_pud = PV_CALLEE_SAVE(xen_make_pud),
2491 .set_p4d = xen_set_p4d_hyper,
2492
2493 .alloc_pud = xen_alloc_pmd_init,
2494 .release_pud = xen_release_pmd_init,
2495 #endif /* CONFIG_PGTABLE_LEVELS >= 4 */
2496
2497 .activate_mm = xen_activate_mm,
2498 .dup_mmap = xen_dup_mmap,
2499 .exit_mmap = xen_exit_mmap,
2500
2501 .lazy_mode = {
2502 .enter = paravirt_enter_lazy_mmu,
2503 .leave = xen_leave_lazy_mmu,
2504 .flush = paravirt_flush_lazy_mmu,
2505 },
2506
2507 .set_fixmap = xen_set_fixmap,
2508 };
2509
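/* Hook up xen_pagetable_init and, for PV guests, install the Xen mmu ops. */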
2510 void __init xen_init_mmu_ops(void)
2511 {
2512 x86_init.paging.pagetable_init = xen_pagetable_init;
2513
2514 if (xen_feature(XENFEAT_auto_translated_physmap))
2515 return;
2516
2517 pv_mmu_ops = xen_mmu_ops;
2518
2519 memset(dummy_mapping, 0xff, PAGE_SIZE);
2520 }
2521
2522 /* Protected by xen_reservation_lock. */
2523 #define MAX_CONTIG_ORDER 9 /* 2MB */
2524 static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2525
2526 #define VOID_PTE (mfn_pte(0, __pgprot(0)))
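/*
 * Invalidate the PTEs covering 2^order pages starting at vaddr,
 * optionally recording the old MFNs (in_frames) and the corresponding
 * PFNs (out_frames), and mark the pfns as INVALID_P2M_ENTRY.
 */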
2527 static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2528 unsigned long *in_frames,
2529 unsigned long *out_frames)
2530 {
2531 int i;
2532 struct multicall_space mcs;
2533
2534 xen_mc_batch();
2535 for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2536 mcs = __xen_mc_entry(0);
2537
2538 if (in_frames)
2539 in_frames[i] = virt_to_mfn(vaddr);
2540
2541 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2542 __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2543
2544 if (out_frames)
2545 out_frames[i] = virt_to_pfn(vaddr);
2546 }
2547 xen_mc_issue(0);
2548 }
2549
2550 /*
2551 * Update the pfn-to-mfn mappings for a virtual address range, either to
2552 * point to an array of mfns, or contiguously from a single starting
2553 * mfn.
2554 */
2555 static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2556 unsigned long *mfns,
2557 unsigned long first_mfn)
2558 {
2559 unsigned i, limit;
2560 unsigned long mfn;
2561
2562 xen_mc_batch();
2563
2564 limit = 1u << order;
2565 for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2566 struct multicall_space mcs;
2567 unsigned flags;
2568
2569 mcs = __xen_mc_entry(0);
2570 if (mfns)
2571 mfn = mfns[i];
2572 else
2573 mfn = first_mfn + i;
2574
2575 if (i < (limit - 1))
2576 flags = 0;
2577 else {
2578 if (order == 0)
2579 flags = UVMF_INVLPG | UVMF_ALL;
2580 else
2581 flags = UVMF_TLB_FLUSH | UVMF_ALL;
2582 }
2583
2584 MULTI_update_va_mapping(mcs.mc, vaddr,
2585 mfn_pte(mfn, PAGE_KERNEL), flags);
2586
2587 set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2588 }
2589
2590 xen_mc_issue(0);
2591 }
2592
2593 /*
2594 * Perform the hypercall to exchange a region of our pfns to point to
2595 * memory with the required contiguous alignment. Takes the pfns as
2596 * input, and populates mfns as output.
2597 *
2598 * Returns a success code indicating whether the hypervisor was able to
2599 * satisfy the request or not.
2600 */
2601 static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2602 unsigned long *pfns_in,
2603 unsigned long extents_out,
2604 unsigned int order_out,
2605 unsigned long *mfns_out,
2606 unsigned int address_bits)
2607 {
2608 long rc;
2609 int success;
2610
2611 struct xen_memory_exchange exchange = {
2612 .in = {
2613 .nr_extents = extents_in,
2614 .extent_order = order_in,
2615 .extent_start = pfns_in,
2616 .domid = DOMID_SELF
2617 },
2618 .out = {
2619 .nr_extents = extents_out,
2620 .extent_order = order_out,
2621 .extent_start = mfns_out,
2622 .address_bits = address_bits,
2623 .domid = DOMID_SELF
2624 }
2625 };
2626
2627 BUG_ON(extents_in << order_in != extents_out << order_out);
2628
2629 rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2630 success = (exchange.nr_exchanged == extents_in);
2631
2632 BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2633 BUG_ON(success && (rc != 0));
2634
2635 return success;
2636 }
2637
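/*
 * Exchange the 2^order frames backing the region starting at pstart
 * for a machine-contiguous extent below the given address_bits limit,
 * e.g. for DMA. Returns 0 on success and stores the machine address of
 * the start of the region in *dma_handle.
 */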
2638 int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
2639 unsigned int address_bits,
2640 dma_addr_t *dma_handle)
2641 {
2642 unsigned long *in_frames = discontig_frames, out_frame;
2643 unsigned long flags;
2644 int success;
2645 unsigned long vstart = (unsigned long)phys_to_virt(pstart);
2646
2647 /*
2648 * Currently an auto-translated guest will not perform I/O, nor will
2649 * it require PAE page directories below 4GB. Therefore any calls to
2650 * this function are redundant and can be ignored.
2651 */
2652
2653 if (xen_feature(XENFEAT_auto_translated_physmap))
2654 return 0;
2655
2656 if (unlikely(order > MAX_CONTIG_ORDER))
2657 return -ENOMEM;
2658
2659 memset((void *) vstart, 0, PAGE_SIZE << order);
2660
2661 spin_lock_irqsave(&xen_reservation_lock, flags);
2662
2663 /* 1. Zap current PTEs, remembering MFNs. */
2664 xen_zap_pfn_range(vstart, order, in_frames, NULL);
2665
2666 /* 2. Get a new contiguous memory extent. */
2667 out_frame = virt_to_pfn(vstart);
2668 success = xen_exchange_memory(1UL << order, 0, in_frames,
2669 1, order, &out_frame,
2670 address_bits);
2671
2672 /* 3. Map the new extent in place of old pages. */
2673 if (success)
2674 xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2675 else
2676 xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2677
2678 spin_unlock_irqrestore(&xen_reservation_lock, flags);
2679
2680 *dma_handle = virt_to_machine(vstart).maddr;
2681 return success ? 0 : -ENOMEM;
2682 }
2683 EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2684
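/*
 * Undo xen_create_contiguous_region(): give the machine-contiguous
 * extent back to the hypervisor in exchange for ordinary, possibly
 * discontiguous frames.
 */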
2685 void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
2686 {
2687 unsigned long *out_frames = discontig_frames, in_frame;
2688 unsigned long flags;
2689 int success;
2690 unsigned long vstart;
2691
2692 if (xen_feature(XENFEAT_auto_translated_physmap))
2693 return;
2694
2695 if (unlikely(order > MAX_CONTIG_ORDER))
2696 return;
2697
2698 vstart = (unsigned long)phys_to_virt(pstart);
2699 memset((void *) vstart, 0, PAGE_SIZE << order);
2700
2701 spin_lock_irqsave(&xen_reservation_lock, flags);
2702
2703 /* 1. Find start MFN of contiguous extent. */
2704 in_frame = virt_to_mfn(vstart);
2705
2706 /* 2. Zap current PTEs. */
2707 xen_zap_pfn_range(vstart, order, NULL, out_frames);
2708
2709 /* 3. Do the exchange for non-contiguous MFNs. */
2710 success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2711 0, out_frames, 0);
2712
2713 /* 4. Map new pages in place of old pages. */
2714 if (success)
2715 xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2716 else
2717 xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2718
2719 spin_unlock_irqrestore(&xen_reservation_lock, flags);
2720 }
2721 EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
2722
2723 #ifdef CONFIG_KEXEC_CORE
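/*
 * For kexec/kdump the vmcoreinfo note must be reported as a machine
 * address when running as a PV domain.
 */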
2724 phys_addr_t paddr_vmcoreinfo_note(void)
2725 {
2726 if (xen_pv_domain())
2727 return virt_to_machine(&vmcoreinfo_note).maddr;
2728 else
2729 return __pa_symbol(&vmcoreinfo_note);
2730 }
2731 #endif /* CONFIG_KEXEC_CORE */