arch/x86/xen/mmu.c
3b827c1b
JF
1/*
2 * Xen mmu operations
3 *
4 * This file contains the various mmu fetch and update operations.
5 * The most important job they must perform is the mapping between the
6 * domain's pfn and the overall machine mfns.
7 *
8 * Xen allows guests to directly update the pagetable, in a controlled
9 * fashion. In other words, the guest modifies the same pagetable
10 * that the CPU actually uses, which eliminates the overhead of having
11 * a separate shadow pagetable.
12 *
13 * In order to allow this, it falls on the guest domain to map its
14 * notion of a "physical" pfn - which is just a domain-local linear
15 * address - into a real "machine address" which the CPU's MMU can
16 * use.
17 *
18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19 * inserted directly into the pagetable. When creating a new
20 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
21 * when reading the content back with __(pgd|pmd|pte)_val, it converts
22 * the mfn back into a pfn.
23 *
24 * The other constraint is that all pages which make up a pagetable
25 * must be mapped read-only in the guest. This prevents uncontrolled
26 * guest updates to the pagetable. Xen strictly enforces this, and
27 * will disallow any pagetable update which will end up mapping a
28 * pagetable page RW, and will disallow using any writable page as a
29 * pagetable.
30 *
31 * Naively, when loading %cr3 with the base of a new pagetable, Xen
32 * would need to validate the whole pagetable before going on.
33 * Naturally, this is quite slow. The solution is to "pin" a
34 * pagetable, which enforces all the constraints on the pagetable even
35 * when it is not actively in use. This means that Xen can be assured
36 * that it is still valid when you do load it into %cr3, and doesn't
37 * need to revalidate it.
38 *
39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
40 */
f120f13e 41#include <linux/sched.h>
f4f97b3e 42#include <linux/highmem.h>
994025ca 43#include <linux/debugfs.h>
3b827c1b 44#include <linux/bug.h>
d2cb2145 45#include <linux/vmalloc.h>
44408ad7 46#include <linux/module.h>
5a0e3ad6 47#include <linux/gfp.h>
a9ce6bc1 48#include <linux/memblock.h>
2222e71b 49#include <linux/seq_file.h>
3b827c1b
JF
50
51#include <asm/pgtable.h>
52#include <asm/tlbflush.h>
5deb30d1 53#include <asm/fixmap.h>
3b827c1b 54#include <asm/mmu_context.h>
319f3ba5 55#include <asm/setup.h>
f4f97b3e 56#include <asm/paravirt.h>
7347b408 57#include <asm/e820.h>
cbcd79c2 58#include <asm/linkage.h>
08bbc9da 59#include <asm/page.h>
fef5ba79 60#include <asm/init.h>
41f2e477 61#include <asm/pat.h>
3b827c1b
JF
62
63#include <asm/xen/hypercall.h>
f4f97b3e 64#include <asm/xen/hypervisor.h>
3b827c1b 65
c0011dbf 66#include <xen/xen.h>
3b827c1b
JF
67#include <xen/page.h>
68#include <xen/interface/xen.h>
59151001 69#include <xen/interface/hvm/hvm_op.h>
319f3ba5 70#include <xen/interface/version.h>
c0011dbf 71#include <xen/interface/memory.h>
319f3ba5 72#include <xen/hvc-console.h>
3b827c1b 73
f4f97b3e 74#include "multicalls.h"
3b827c1b 75#include "mmu.h"
994025ca
JF
76#include "debugfs.h"
77
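To make the pfn-to-mfn direction described in the header comment concrete, here is a minimal sketch (an illustration added here, not the kernel's code; pte_mfn_to_pfn()/pte_pfn_to_mfn() later in this file are the real implementations, including the identity- and foreign-frame handling) of how a present pte written by the guest gets its frame number translated through the p2m table:

/* Illustrative sketch only; see pte_pfn_to_mfn() below for the real thing. */
static inline pteval_t example_pfn_pte_to_mfn_pte(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
		unsigned long mfn = get_phys_to_machine(pfn);	/* p2m lookup */

		val = ((pteval_t)mfn << PAGE_SHIFT) | (val & PTE_FLAGS_MASK);
	}
	return val;
}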
19001c8c
AN
78/*
79 * Protects atomic reservation decrease/increase against concurrent increases.
06f521d5 80 * Also protects non-atomic updates of current_pages and balloon lists.
19001c8c
AN
81 */
82DEFINE_SPINLOCK(xen_reservation_lock);
83
319f3ba5
JF
84/*
85 * Identity map, in addition to plain kernel map. This needs to be
86 * large enough to allocate page table pages to allocate the rest.
87 * Each page can map 2MB.
88 */
764f0138
JF
89#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4)
90static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
319f3ba5
JF
91
92#ifdef CONFIG_X86_64
93/* l3 pud for userspace vsyscall mapping */
94static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
95#endif /* CONFIG_X86_64 */
96
97/*
98 * Note about cr3 (pagetable base) values:
99 *
100 * xen_cr3 contains the current logical cr3 value; it contains the
101 * last set cr3. This may not be the current effective cr3, because
102 * its update may still be pending (lazily deferred). However, a vcpu looking
103 * at its own cr3 can use this value knowing that everything will
104 * be self-consistent.
105 *
106 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
107 * hypercall to set the vcpu cr3 is complete (so it may be a little
108 * out of date, but it will never be set early). If one vcpu is
109 * looking at another vcpu's cr3 value, it should use this variable.
110 */
111DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
112DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
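As a hedged illustration of the rule above (this helper is hypothetical, not part of the file): code that inspects another vcpu's pagetable base should read xen_current_cr3, which is only written once the set-cr3 hypercall has completed, rather than xen_cr3, whose update may still be sitting in a lazy-MMU batch.

/* Hypothetical helper, for illustration only. */
static inline unsigned long example_other_vcpu_cr3(int cpu)
{
	/* Safe for cross-vcpu reads: set only after the hypercall completes. */
	return per_cpu(xen_current_cr3, cpu);
}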
113
114
d6182fbf
JF
115/*
116 * Just beyond the highest usermode address. STACK_TOP_MAX has a
117 * redzone above it, so round it up to a PGD boundary.
118 */
119#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
120
9976b39b
JF
121unsigned long arbitrary_virt_to_mfn(void *vaddr)
122{
123 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
124
125 return PFN_DOWN(maddr.maddr);
126}
127
ce803e70 128xmaddr_t arbitrary_virt_to_machine(void *vaddr)
3b827c1b 129{
ce803e70 130 unsigned long address = (unsigned long)vaddr;
da7bfc50 131 unsigned int level;
9f32d21c
CL
132 pte_t *pte;
133 unsigned offset;
3b827c1b 134
9f32d21c
CL
135 /*
136 * if the PFN is in the linear mapped vaddr range, we can just use
137 * the (quick) virt_to_machine() p2m lookup
138 */
139 if (virt_addr_valid(vaddr))
140 return virt_to_machine(vaddr);
141
142 /* otherwise we have to do a (slower) full page-table walk */
3b827c1b 143
9f32d21c
CL
144 pte = lookup_address(address, &level);
145 BUG_ON(pte == NULL);
146 offset = address & ~PAGE_MASK;
ebd879e3 147 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
3b827c1b 148}
de23be5f 149EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
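A hypothetical usage sketch (not part of this file): when filling in an mmu_update for an entry that may live in an ioremapped or vmalloc'd page (for instance a pud touched during 64-bit early pagetable setup), the pointer has to be translated with arbitrary_virt_to_machine(), because virt_to_machine() is only valid for linearly mapped addresses. xen_set_pmd_hyper() and xen_set_pud_hyper() below rely on exactly this.

/* Hypothetical example; mirrors what xen_set_pud_hyper() does below. */
static void example_fill_update(struct mmu_update *u, pud_t *ptr, pud_t val)
{
	/* Full pagetable walk, so non-linear-mapped pointers work too. */
	u->ptr = arbitrary_virt_to_machine(ptr).maddr;
	u->val = pud_val_ma(val);
}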
3b827c1b
JF
150
151void make_lowmem_page_readonly(void *vaddr)
152{
153 pte_t *pte, ptev;
154 unsigned long address = (unsigned long)vaddr;
da7bfc50 155 unsigned int level;
3b827c1b 156
f0646e43 157 pte = lookup_address(address, &level);
fef5ba79
JF
158 if (pte == NULL)
159 return; /* vaddr missing */
3b827c1b
JF
160
161 ptev = pte_wrprotect(*pte);
162
163 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
164 BUG();
165}
166
167void make_lowmem_page_readwrite(void *vaddr)
168{
169 pte_t *pte, ptev;
170 unsigned long address = (unsigned long)vaddr;
da7bfc50 171 unsigned int level;
3b827c1b 172
f0646e43 173 pte = lookup_address(address, &level);
fef5ba79
JF
174 if (pte == NULL)
175 return; /* vaddr missing */
3b827c1b
JF
176
177 ptev = pte_mkwrite(*pte);
178
179 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
180 BUG();
181}
182
183
7708ad64 184static bool xen_page_pinned(void *ptr)
e2426cf8
JF
185{
186 struct page *page = virt_to_page(ptr);
187
188 return PagePinned(page);
189}
190
eba3ff8b 191void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
c0011dbf
JF
192{
193 struct multicall_space mcs;
194 struct mmu_update *u;
195
196 mcs = xen_mc_entry(sizeof(*u));
197 u = mcs.args;
198
199 /* ptep might be kmapped when using 32-bit HIGHPTE */
d5108316 200 u->ptr = virt_to_machine(ptep).maddr;
c0011dbf
JF
201 u->val = pte_val_ma(pteval);
202
eba3ff8b 203 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
c0011dbf
JF
204
205 xen_mc_issue(PARAVIRT_LAZY_MMU);
206}
eba3ff8b
JF
207EXPORT_SYMBOL_GPL(xen_set_domain_pte);
208
7708ad64 209static void xen_extend_mmu_update(const struct mmu_update *update)
3b827c1b 210{
d66bf8fc
JF
211 struct multicall_space mcs;
212 struct mmu_update *u;
3b827c1b 213
400d3494
JF
214 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
215
994025ca 216 if (mcs.mc != NULL) {
400d3494 217 mcs.mc->args[1]++;
994025ca 218 } else {
400d3494
JF
219 mcs = __xen_mc_entry(sizeof(*u));
220 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
221 }
d66bf8fc 222
d66bf8fc 223 u = mcs.args;
400d3494
JF
224 *u = *update;
225}
226
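The batching contract, sketched under stated assumptions (the caller below is hypothetical; xen_set_pmd_hyper() just after this is the real pattern): open a batch with xen_mc_batch(), queue one or more updates through xen_extend_mmu_update(), which folds consecutive updates into a single mmu_update hypercall by bumping its count argument, then flush with xen_mc_issue(PARAVIRT_LAZY_MMU), which defers the flush if the vcpu is in lazy MMU mode.

/* Hypothetical caller, for illustration only. */
static void example_set_two_pmds(pmd_t *a, pmd_t va, pmd_t *b, pmd_t vb)
{
	struct mmu_update u;

	preempt_disable();
	xen_mc_batch();

	u.ptr = arbitrary_virt_to_machine(a).maddr;
	u.val = pmd_val_ma(va);
	xen_extend_mmu_update(&u);	/* starts an mmu_update multicall */

	u.ptr = arbitrary_virt_to_machine(b).maddr;
	u.val = pmd_val_ma(vb);
	xen_extend_mmu_update(&u);	/* coalesced into the same hypercall */

	xen_mc_issue(PARAVIRT_LAZY_MMU);
	preempt_enable();
}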
4c13629f 227static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
400d3494
JF
228{
229 struct mmu_update u;
230
231 preempt_disable();
232
233 xen_mc_batch();
234
ce803e70 235 /* ptr may be ioremapped for 64-bit pagetable setup */
f05608d2 236 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
400d3494 237 u.val = pmd_val_ma(val);
7708ad64 238 xen_extend_mmu_update(&u);
d66bf8fc
JF
239
240 xen_mc_issue(PARAVIRT_LAZY_MMU);
241
242 preempt_enable();
3b827c1b
JF
243}
244
4c13629f 245static void xen_set_pmd(pmd_t *ptr, pmd_t val)
e2426cf8
JF
246{
247 /* If page is not pinned, we can just update the entry
248 directly */
7708ad64 249 if (!xen_page_pinned(ptr)) {
e2426cf8
JF
250 *ptr = val;
251 return;
252 }
253
254 xen_set_pmd_hyper(ptr, val);
255}
256
3b827c1b
JF
257/*
258 * Associate a virtual page frame with a given physical page frame
259 * and protection flags for that frame.
260 */
261void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
262{
836fe2f2 263 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
3b827c1b
JF
264}
265
4a35c13c 266static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
3b827c1b 267{
4a35c13c 268 struct mmu_update u;
994025ca 269
4a35c13c
JF
270 if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
271 return false;
a99ac5e8 272
4a35c13c
JF
273 xen_mc_batch();
274
275 u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
276 u.val = pte_val_ma(pteval);
277 xen_extend_mmu_update(&u);
a99ac5e8 278
4a35c13c
JF
279 xen_mc_issue(PARAVIRT_LAZY_MMU);
280
281 return true;
282}
283
4c13629f 284static void xen_set_pte(pte_t *ptep, pte_t pteval)
4a35c13c 285{
4a35c13c 286 if (!xen_batched_set_pte(ptep, pteval))
a99ac5e8 287 native_set_pte(ptep, pteval);
3b827c1b
JF
288}
289
4c13629f 290static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
4a35c13c
JF
291 pte_t *ptep, pte_t pteval)
292{
293 xen_set_pte(ptep, pteval);
294}
295
f63c2f24
T
296pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
297 unsigned long addr, pte_t *ptep)
947a69c9 298{
e57778a1
JF
299 /* Just return the pte as-is. We preserve the bits on commit */
300 return *ptep;
301}
302
303void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
304 pte_t *ptep, pte_t pte)
305{
400d3494 306 struct mmu_update u;
e57778a1 307
400d3494 308 xen_mc_batch();
947a69c9 309
d5108316 310 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
400d3494 311 u.val = pte_val_ma(pte);
7708ad64 312 xen_extend_mmu_update(&u);
947a69c9 313
e57778a1 314 xen_mc_issue(PARAVIRT_LAZY_MMU);
947a69c9
JF
315}
316
ebb9cfe2
JF
317/* Assume pteval_t is equivalent to all the other *val_t types. */
318static pteval_t pte_mfn_to_pfn(pteval_t val)
947a69c9 319{
ebb9cfe2 320 if (val & _PAGE_PRESENT) {
59438c9f 321 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
77be1fab 322 pteval_t flags = val & PTE_FLAGS_MASK;
d8355aca 323 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
ebb9cfe2 324 }
947a69c9 325
ebb9cfe2 326 return val;
947a69c9
JF
327}
328
ebb9cfe2 329static pteval_t pte_pfn_to_mfn(pteval_t val)
947a69c9 330{
ebb9cfe2 331 if (val & _PAGE_PRESENT) {
59438c9f 332 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
77be1fab 333 pteval_t flags = val & PTE_FLAGS_MASK;
fb38923e 334 unsigned long mfn;
cfd8951e 335
fb38923e
KRW
336 if (!xen_feature(XENFEAT_auto_translated_physmap))
337 mfn = get_phys_to_machine(pfn);
338 else
339 mfn = pfn;
cfd8951e
JF
340 /*
341 * If there's no mfn for the pfn, then just create an
342 * empty non-present pte. Unfortunately this loses
343 * information about the original pfn, so
344 * pte_mfn_to_pfn is asymmetric.
345 */
346 if (unlikely(mfn == INVALID_P2M_ENTRY)) {
347 mfn = 0;
348 flags = 0;
fb38923e
KRW
349 } else {
350 /*
351 * Paramount to do this test _after_ the
352 * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
353 * IDENTITY_FRAME_BIT resolves to true.
354 */
355 mfn &= ~FOREIGN_FRAME_BIT;
356 if (mfn & IDENTITY_FRAME_BIT) {
357 mfn &= ~IDENTITY_FRAME_BIT;
358 flags |= _PAGE_IOMAP;
359 }
cfd8951e 360 }
cfd8951e 361 val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
947a69c9
JF
362 }
363
ebb9cfe2 364 return val;
947a69c9
JF
365}
366
c0011dbf
JF
367static pteval_t iomap_pte(pteval_t val)
368{
369 if (val & _PAGE_PRESENT) {
370 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
371 pteval_t flags = val & PTE_FLAGS_MASK;
372
373 /* We assume the pte frame number is an MFN, so
374 just use it as-is. */
375 val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
376 }
377
378 return val;
379}
380
4c13629f 381static pteval_t xen_pte_val(pte_t pte)
947a69c9 382{
41f2e477 383 pteval_t pteval = pte.pte;
c0011dbf 384
41f2e477
JF
385 /* If this is a WC pte, convert back from Xen WC to Linux WC */
386 if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
387 WARN_ON(!pat_enabled);
388 pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
389 }
c0011dbf 390
41f2e477
JF
391 if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
392 return pteval;
393
394 return pte_mfn_to_pfn(pteval);
947a69c9 395}
da5de7c2 396PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
947a69c9 397
4c13629f 398static pgdval_t xen_pgd_val(pgd_t pgd)
947a69c9 399{
ebb9cfe2 400 return pte_mfn_to_pfn(pgd.pgd);
947a69c9 401}
da5de7c2 402PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
947a69c9 403
41f2e477
JF
404/*
405 * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
406 * are reserved for now, to correspond to the Intel-reserved PAT
407 * types.
408 *
409 * We expect Linux's PAT set as follows:
410 *
411 * Idx PTE flags Linux Xen Default
412 * 0 WB WB WB
413 * 1 PWT WC WT WT
414 * 2 PCD UC- UC- UC-
415 * 3 PCD PWT UC UC UC
416 * 4 PAT WB WC WB
417 * 5 PAT PWT WC WP WT
418 * 6 PAT PCD UC- UC UC-
419 * 7 PAT PCD PWT UC UC UC
420 */
421
422void xen_set_pat(u64 pat)
423{
424 /* We expect Linux to use a PAT setting of
425 * UC UC- WC WB (ignoring the PAT flag) */
426 WARN_ON(pat != 0x0007010600070106ull);
427}
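The magic constant checked above follows directly from the table: entries 0 through 3 (and their PAT-flag mirrors 4 through 7) are expected to be WB, WC, UC- and UC, and with the architectural memory-type encodings that packs to 0x0007010600070106. A small worked sketch (the X86_MT_* names are illustrative, not the kernel's own defines):

/* Illustrative encodings of the x86 memory types used in the PAT MSR. */
#define X86_MT_UC	0x00ULL		/* uncacheable */
#define X86_MT_WC	0x01ULL		/* write-combining */
#define X86_MT_WB	0x06ULL		/* write-back */
#define X86_MT_UC_MINUS	0x07ULL		/* UC- */

/* idx 0..3 = WB, WC, UC-, UC; idx 4..7 repeat the same types. */
#define EXAMPLE_EXPECTED_PAT					\
	((X86_MT_WB <<  0) | (X86_MT_WC <<  8) |		\
	 (X86_MT_UC_MINUS << 16) | (X86_MT_UC << 24) |		\
	 (X86_MT_WB << 32) | (X86_MT_WC << 40) |		\
	 (X86_MT_UC_MINUS << 48) | (X86_MT_UC << 56))
/* EXAMPLE_EXPECTED_PAT == 0x0007010600070106ull, the value WARN_ON()ed above. */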
428
4c13629f 429static pte_t xen_make_pte(pteval_t pte)
947a69c9 430{
7347b408
AN
431 phys_addr_t addr = (pte & PTE_PFN_MASK);
432
41f2e477
JF
433 /* If Linux is trying to set a WC pte, then map to the Xen WC.
434 * If _PAGE_PAT is set, then it probably means it is really
435 * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
436 * things work out OK...
437 *
438 * (We should never see kernel mappings with _PAGE_PSE set,
439 * but we could see hugetlbfs mappings, I think.).
440 */
441 if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
442 if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
443 pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
444 }
445
7347b408
AN
446 /*
447 * Unprivileged domains are allowed to do IOMAPpings for
448 * PCI passthrough, but not map ISA space. The ISA
449 * mappings are just dummy local mappings to keep other
450 * parts of the kernel happy.
451 */
452 if (unlikely(pte & _PAGE_IOMAP) &&
453 (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
c0011dbf 454 pte = iomap_pte(pte);
7347b408
AN
455 } else {
456 pte &= ~_PAGE_IOMAP;
c0011dbf 457 pte = pte_pfn_to_mfn(pte);
7347b408 458 }
c0011dbf 459
ebb9cfe2 460 return native_make_pte(pte);
947a69c9 461}
da5de7c2 462PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
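A hedged round-trip illustration of the WC handling in xen_pte_val() and xen_make_pte() above (this function is an added example, not code from the file): Linux encodes WC as PAT index 1 (_PAGE_PWT set), while Xen's table has WC at index 4 (_PAGE_PAT set), so the flag bits are swapped on the way into Xen and swapped back when the pte is read.

/* Hypothetical illustration of the Linux<->Xen WC flag translation. */
static inline void example_wc_round_trip(void)
{
	pteval_t linux_wc = _PAGE_PRESENT | _PAGE_PWT;	/* Linux WC: PAT idx 1 */

	/* What xen_make_pte() hands to Xen: PAT idx 4, Xen's WC slot. */
	pteval_t xen_wc = (linux_wc & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;

	/* xen_pte_val() undoes it: clear _PAGE_PAT, set _PAGE_PWT again. */
	pteval_t linux_again = (xen_wc & ~_PAGE_PAT) | _PAGE_PWT;

	WARN_ON(linux_again != linux_wc);
}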
947a69c9 463
fc25151d
KRW
464#ifdef CONFIG_XEN_DEBUG
465pte_t xen_make_pte_debug(pteval_t pte)
466{
467 phys_addr_t addr = (pte & PTE_PFN_MASK);
468 phys_addr_t other_addr;
469 bool io_page = false;
470 pte_t _pte;
471
472 if (pte & _PAGE_IOMAP)
473 io_page = true;
474
475 _pte = xen_make_pte(pte);
476
477 if (!addr)
478 return _pte;
479
480 if (io_page &&
481 (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
482 other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT;
d88885d0 483 WARN_ONCE(addr != other_addr,
fc25151d
KRW
484 "0x%lx is using VM_IO, but it is 0x%lx!\n",
485 (unsigned long)addr, (unsigned long)other_addr);
486 } else {
487 pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP;
488 other_addr = (_pte.pte & PTE_PFN_MASK);
d88885d0 489 WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set),
fc25151d
KRW
490 "0x%lx is missing VM_IO (and wasn't fixed)!\n",
491 (unsigned long)addr);
492 }
493
494 return _pte;
495}
496PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug);
497#endif
498
4c13629f 499static pgd_t xen_make_pgd(pgdval_t pgd)
947a69c9 500{
ebb9cfe2
JF
501 pgd = pte_pfn_to_mfn(pgd);
502 return native_make_pgd(pgd);
947a69c9 503}
da5de7c2 504PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
947a69c9 505
4c13629f 506static pmdval_t xen_pmd_val(pmd_t pmd)
947a69c9 507{
ebb9cfe2 508 return pte_mfn_to_pfn(pmd.pmd);
947a69c9 509}
da5de7c2 510PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
28499143 511
4c13629f 512static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
f4f97b3e 513{
400d3494 514 struct mmu_update u;
f4f97b3e 515
d66bf8fc
JF
516 preempt_disable();
517
400d3494
JF
518 xen_mc_batch();
519
ce803e70 520 /* ptr may be ioremapped for 64-bit pagetable setup */
2a001f64 521 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
400d3494 522 u.val = pud_val_ma(val);
7708ad64 523 xen_extend_mmu_update(&u);
d66bf8fc
JF
524
525 xen_mc_issue(PARAVIRT_LAZY_MMU);
526
527 preempt_enable();
f4f97b3e
JF
528}
529
4c13629f 530static void xen_set_pud(pud_t *ptr, pud_t val)
e2426cf8
JF
531{
532 /* If page is not pinned, we can just update the entry
533 directly */
7708ad64 534 if (!xen_page_pinned(ptr)) {
e2426cf8
JF
535 *ptr = val;
536 return;
537 }
538
539 xen_set_pud_hyper(ptr, val);
540}
541
f6e58732 542#ifdef CONFIG_X86_PAE
4c13629f 543static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
3b827c1b 544{
f6e58732 545 set_64bit((u64 *)ptep, native_pte_val(pte));
3b827c1b
JF
546}
547
4c13629f 548static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
3b827c1b 549{
4a35c13c
JF
550 if (!xen_batched_set_pte(ptep, native_make_pte(0)))
551 native_pte_clear(mm, addr, ptep);
3b827c1b
JF
552}
553
4c13629f 554static void xen_pmd_clear(pmd_t *pmdp)
3b827c1b 555{
e2426cf8 556 set_pmd(pmdp, __pmd(0));
3b827c1b 557}
f6e58732 558#endif /* CONFIG_X86_PAE */
3b827c1b 559
4c13629f 560static pmd_t xen_make_pmd(pmdval_t pmd)
3b827c1b 561{
ebb9cfe2 562 pmd = pte_pfn_to_mfn(pmd);
947a69c9 563 return native_make_pmd(pmd);
3b827c1b 564}
da5de7c2 565PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
3b827c1b 566
f6e58732 567#if PAGETABLE_LEVELS == 4
4c13629f 568static pudval_t xen_pud_val(pud_t pud)
f6e58732
JF
569{
570 return pte_mfn_to_pfn(pud.pud);
571}
da5de7c2 572PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
f6e58732 573
4c13629f 574static pud_t xen_make_pud(pudval_t pud)
f6e58732
JF
575{
576 pud = pte_pfn_to_mfn(pud);
577
578 return native_make_pud(pud);
579}
da5de7c2 580PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
f6e58732 581
4c13629f 582static pgd_t *xen_get_user_pgd(pgd_t *pgd)
f6e58732 583{
d6182fbf
JF
584 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
585 unsigned offset = pgd - pgd_page;
586 pgd_t *user_ptr = NULL;
f6e58732 587
d6182fbf
JF
588 if (offset < pgd_index(USER_LIMIT)) {
589 struct page *page = virt_to_page(pgd_page);
590 user_ptr = (pgd_t *)page->private;
591 if (user_ptr)
592 user_ptr += offset;
593 }
f6e58732 594
d6182fbf
JF
595 return user_ptr;
596}
597
598static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
599{
600 struct mmu_update u;
f6e58732
JF
601
602 u.ptr = virt_to_machine(ptr).maddr;
603 u.val = pgd_val_ma(val);
7708ad64 604 xen_extend_mmu_update(&u);
d6182fbf
JF
605}
606
607/*
608 * Raw hypercall-based set_pgd, intended for use in early boot before
609 * there's a page structure. This implies:
610 * 1. The only existing pagetable is the kernel's
611 * 2. It is always pinned
612 * 3. It has no user pagetable attached to it
613 */
4c13629f 614static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
d6182fbf
JF
615{
616 preempt_disable();
617
618 xen_mc_batch();
619
620 __xen_set_pgd_hyper(ptr, val);
f6e58732
JF
621
622 xen_mc_issue(PARAVIRT_LAZY_MMU);
623
624 preempt_enable();
625}
626
4c13629f 627static void xen_set_pgd(pgd_t *ptr, pgd_t val)
f6e58732 628{
d6182fbf
JF
629 pgd_t *user_ptr = xen_get_user_pgd(ptr);
630
f6e58732
JF
631 /* If page is not pinned, we can just update the entry
632 directly */
7708ad64 633 if (!xen_page_pinned(ptr)) {
f6e58732 634 *ptr = val;
d6182fbf 635 if (user_ptr) {
7708ad64 636 WARN_ON(xen_page_pinned(user_ptr));
d6182fbf
JF
637 *user_ptr = val;
638 }
f6e58732
JF
639 return;
640 }
641
d6182fbf
JF
642 /* If it's pinned, then we can at least batch the kernel and
643 user updates together. */
644 xen_mc_batch();
645
646 __xen_set_pgd_hyper(ptr, val);
647 if (user_ptr)
648 __xen_set_pgd_hyper(user_ptr, val);
649
650 xen_mc_issue(PARAVIRT_LAZY_MMU);
f6e58732
JF
651}
652#endif /* PAGETABLE_LEVELS == 4 */
653
f4f97b3e 654/*
5deb30d1
JF
655 * (Yet another) pagetable walker. This one is intended for pinning a
656 * pagetable. This means that it walks a pagetable and calls the
657 * callback function on each page it finds making up the page table,
658 * at every level. It walks the entire pagetable, but it only bothers
659 * pinning pte pages which are below limit. In the normal case this
660 * will be STACK_TOP_MAX, but at boot we need to pin up to
661 * FIXADDR_TOP.
662 *
663 * For 32-bit the important bit is that we don't pin beyond there,
664 * because then we start getting into Xen's ptes.
665 *
666 * For 64-bit, we must skip the Xen hole in the middle of the address
667 * space, just after the big x86-64 virtual hole.
668 */
86bbc2c2
IC
669static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
670 int (*func)(struct mm_struct *mm, struct page *,
671 enum pt_level),
672 unsigned long limit)
3b827c1b 673{
f4f97b3e 674 int flush = 0;
5deb30d1
JF
675 unsigned hole_low, hole_high;
676 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
677 unsigned pgdidx, pudidx, pmdidx;
f4f97b3e 678
5deb30d1
JF
679 /* The limit is the last byte to be touched */
680 limit--;
681 BUG_ON(limit >= FIXADDR_TOP);
3b827c1b
JF
682
683 if (xen_feature(XENFEAT_auto_translated_physmap))
f4f97b3e
JF
684 return 0;
685
5deb30d1
JF
686 /*
687 * 64-bit has a great big hole in the middle of the address
688 * space, which contains the Xen mappings. On 32-bit these
689 * will end up making a zero-sized hole, so this is a no-op.
690 */
d6182fbf 691 hole_low = pgd_index(USER_LIMIT);
5deb30d1
JF
692 hole_high = pgd_index(PAGE_OFFSET);
693
694 pgdidx_limit = pgd_index(limit);
695#if PTRS_PER_PUD > 1
696 pudidx_limit = pud_index(limit);
697#else
698 pudidx_limit = 0;
699#endif
700#if PTRS_PER_PMD > 1
701 pmdidx_limit = pmd_index(limit);
702#else
703 pmdidx_limit = 0;
704#endif
705
5deb30d1 706 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
f4f97b3e 707 pud_t *pud;
3b827c1b 708
5deb30d1
JF
709 if (pgdidx >= hole_low && pgdidx < hole_high)
710 continue;
f4f97b3e 711
5deb30d1 712 if (!pgd_val(pgd[pgdidx]))
3b827c1b 713 continue;
f4f97b3e 714
5deb30d1 715 pud = pud_offset(&pgd[pgdidx], 0);
3b827c1b
JF
716
717 if (PTRS_PER_PUD > 1) /* not folded */
eefb47f6 718 flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
f4f97b3e 719
5deb30d1 720 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
f4f97b3e 721 pmd_t *pmd;
f4f97b3e 722
5deb30d1
JF
723 if (pgdidx == pgdidx_limit &&
724 pudidx > pudidx_limit)
725 goto out;
3b827c1b 726
5deb30d1 727 if (pud_none(pud[pudidx]))
3b827c1b 728 continue;
f4f97b3e 729
5deb30d1 730 pmd = pmd_offset(&pud[pudidx], 0);
3b827c1b
JF
731
732 if (PTRS_PER_PMD > 1) /* not folded */
eefb47f6 733 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
f4f97b3e 734
5deb30d1
JF
735 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
736 struct page *pte;
737
738 if (pgdidx == pgdidx_limit &&
739 pudidx == pudidx_limit &&
740 pmdidx > pmdidx_limit)
741 goto out;
3b827c1b 742
5deb30d1 743 if (pmd_none(pmd[pmdidx]))
3b827c1b
JF
744 continue;
745
5deb30d1 746 pte = pmd_page(pmd[pmdidx]);
eefb47f6 747 flush |= (*func)(mm, pte, PT_PTE);
3b827c1b
JF
748 }
749 }
750 }
11ad93e5 751
5deb30d1 752out:
11ad93e5
JF
753 /* Do the top level last, so that the callbacks can use it as
754 a cue to do final things like tlb flushes. */
eefb47f6 755 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
f4f97b3e
JF
756
757 return flush;
3b827c1b
JF
758}
759
86bbc2c2
IC
760static int xen_pgd_walk(struct mm_struct *mm,
761 int (*func)(struct mm_struct *mm, struct page *,
762 enum pt_level),
763 unsigned long limit)
764{
765 return __xen_pgd_walk(mm, mm->pgd, func, limit);
766}
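A hypothetical caller, to show the callback shape the walker expects (the real users are __xen_pgd_pin() and __xen_pgd_unpin() below): the callback runs once per pagetable page with its level, and a nonzero return value is OR-ed into the walker's result to request a flush.

/* Illustration only (not wired up anywhere): count the pagetable pages
   backing an mm. */
static unsigned long example_pt_pages;

static int example_count_cb(struct mm_struct *mm, struct page *page,
			    enum pt_level level)
{
	example_pt_pages++;		/* one pagetable page at 'level' */
	return 0;			/* 0 == no flush needed */
}

static void example_count_pagetable_pages(struct mm_struct *mm)
{
	example_pt_pages = 0;
	xen_pgd_walk(mm, example_count_cb, USER_LIMIT);
}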
767
7708ad64
JF
768/* If we're using split pte locks, then take the page's lock and
769 return a pointer to it. Otherwise return NULL. */
eefb47f6 770static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
74260714
JF
771{
772 spinlock_t *ptl = NULL;
773
f7d0b926 774#if USE_SPLIT_PTLOCKS
74260714 775 ptl = __pte_lockptr(page);
eefb47f6 776 spin_lock_nest_lock(ptl, &mm->page_table_lock);
74260714
JF
777#endif
778
779 return ptl;
780}
781
7708ad64 782static void xen_pte_unlock(void *v)
74260714
JF
783{
784 spinlock_t *ptl = v;
785 spin_unlock(ptl);
786}
787
788static void xen_do_pin(unsigned level, unsigned long pfn)
789{
790 struct mmuext_op *op;
791 struct multicall_space mcs;
792
793 mcs = __xen_mc_entry(sizeof(*op));
794 op = mcs.args;
795 op->cmd = level;
796 op->arg1.mfn = pfn_to_mfn(pfn);
797 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
798}
799
eefb47f6
JF
800static int xen_pin_page(struct mm_struct *mm, struct page *page,
801 enum pt_level level)
f4f97b3e 802{
d60cd46b 803 unsigned pgfl = TestSetPagePinned(page);
f4f97b3e
JF
804 int flush;
805
806 if (pgfl)
807 flush = 0; /* already pinned */
808 else if (PageHighMem(page))
809 /* kmaps need flushing if we found an unpinned
810 highpage */
811 flush = 1;
812 else {
813 void *pt = lowmem_page_address(page);
814 unsigned long pfn = page_to_pfn(page);
815 struct multicall_space mcs = __xen_mc_entry(0);
74260714 816 spinlock_t *ptl;
f4f97b3e
JF
817
818 flush = 0;
819
11ad93e5
JF
820 /*
821 * We need to hold the pagetable lock between the time
822 * we make the pagetable RO and when we actually pin
823 * it. If we don't, then other users may come in and
824 * attempt to update the pagetable by writing it,
825 * which will fail because the memory is RO but not
826 * pinned, so Xen won't do the trap'n'emulate.
827 *
828 * If we're using split pte locks, we can't hold the
829 * entire pagetable's worth of locks during the
830 * traverse, because we may wrap the preempt count (8
831 * bits). The solution is to mark RO and pin each PTE
832 * page while holding the lock. This means the number
833 * of locks we end up holding is never more than a
834 * batch size (~32 entries, at present).
835 *
836 * If we're not using split pte locks, we needn't pin
837 * the PTE pages independently, because we're
838 * protected by the overall pagetable lock.
839 */
74260714
JF
840 ptl = NULL;
841 if (level == PT_PTE)
eefb47f6 842 ptl = xen_pte_lock(page, mm);
74260714 843
f4f97b3e
JF
844 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
845 pfn_pte(pfn, PAGE_KERNEL_RO),
74260714
JF
846 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
847
11ad93e5 848 if (ptl) {
74260714
JF
849 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
850
74260714
JF
851 /* Queue a deferred unlock for when this batch
852 is completed. */
7708ad64 853 xen_mc_callback(xen_pte_unlock, ptl);
74260714 854 }
f4f97b3e
JF
855 }
856
857 return flush;
858}
3b827c1b 859
f4f97b3e
JF
860/* This is called just after a mm has been created, but it has not
861 been used yet. We need to make sure that its pagetable is all
862 read-only, and can be pinned. */
eefb47f6 863static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
3b827c1b 864{
f4f97b3e 865 xen_mc_batch();
3b827c1b 866
86bbc2c2 867 if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
d05fdf31 868 /* re-enable interrupts for flushing */
f87e4cac 869 xen_mc_issue(0);
d05fdf31 870
f4f97b3e 871 kmap_flush_unused();
d05fdf31 872
f87e4cac
JF
873 xen_mc_batch();
874 }
f4f97b3e 875
d6182fbf
JF
876#ifdef CONFIG_X86_64
877 {
878 pgd_t *user_pgd = xen_get_user_pgd(pgd);
879
880 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
881
882 if (user_pgd) {
eefb47f6 883 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
f63c2f24
T
884 xen_do_pin(MMUEXT_PIN_L4_TABLE,
885 PFN_DOWN(__pa(user_pgd)));
d6182fbf
JF
886 }
887 }
888#else /* CONFIG_X86_32 */
5deb30d1
JF
889#ifdef CONFIG_X86_PAE
890 /* Need to make sure unshared kernel PMD is pinnable */
47cb2ed9 891 xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
eefb47f6 892 PT_PMD);
5deb30d1 893#endif
28499143 894 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
d6182fbf 895#endif /* CONFIG_X86_64 */
f4f97b3e 896 xen_mc_issue(0);
3b827c1b
JF
897}
898
eefb47f6
JF
899static void xen_pgd_pin(struct mm_struct *mm)
900{
901 __xen_pgd_pin(mm, mm->pgd);
902}
903
0e91398f
JF
904/*
905 * On save, we need to pin all pagetables to make sure they get their
906 * mfns turned into pfns. Search the list for any unpinned pgds and pin
907 * them (unpinned pgds are not currently in use, probably because the
908 * process is under construction or destruction).
eefb47f6
JF
909 *
910 * Expected to be called in stop_machine() ("equivalent to taking
911 * every spinlock in the system"), so the locking doesn't really
912 * matter all that much.
0e91398f
JF
913 */
914void xen_mm_pin_all(void)
915{
0e91398f 916 struct page *page;
74260714 917
a79e53d8 918 spin_lock(&pgd_lock);
f4f97b3e 919
0e91398f
JF
920 list_for_each_entry(page, &pgd_list, lru) {
921 if (!PagePinned(page)) {
eefb47f6 922 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
0e91398f
JF
923 SetPageSavePinned(page);
924 }
925 }
926
a79e53d8 927 spin_unlock(&pgd_lock);
3b827c1b
JF
928}
929
c1f2f09e
EH
930/*
931 * The init_mm pagetable is really pinned as soon as it's created, but
932 * that's before we have page structures to store the bits. So do all
933 * the book-keeping now.
934 */
eefb47f6
JF
935static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
936 enum pt_level level)
3b827c1b 937{
f4f97b3e
JF
938 SetPagePinned(page);
939 return 0;
940}
3b827c1b 941
b96229b5 942static void __init xen_mark_init_mm_pinned(void)
f4f97b3e 943{
eefb47f6 944 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
f4f97b3e 945}
3b827c1b 946
eefb47f6
JF
947static int xen_unpin_page(struct mm_struct *mm, struct page *page,
948 enum pt_level level)
f4f97b3e 949{
d60cd46b 950 unsigned pgfl = TestClearPagePinned(page);
3b827c1b 951
f4f97b3e
JF
952 if (pgfl && !PageHighMem(page)) {
953 void *pt = lowmem_page_address(page);
954 unsigned long pfn = page_to_pfn(page);
74260714
JF
955 spinlock_t *ptl = NULL;
956 struct multicall_space mcs;
957
11ad93e5
JF
958 /*
959 * Do the converse to pin_page. If we're using split
960 * pte locks, we must be holding the lock while
961 * the pte page is unpinned but still RO to prevent
962 * concurrent updates from seeing it in this
963 * partially-pinned state.
964 */
74260714 965 if (level == PT_PTE) {
eefb47f6 966 ptl = xen_pte_lock(page, mm);
74260714 967
11ad93e5
JF
968 if (ptl)
969 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
74260714
JF
970 }
971
972 mcs = __xen_mc_entry(0);
f4f97b3e
JF
973
974 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
975 pfn_pte(pfn, PAGE_KERNEL),
74260714
JF
976 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
977
978 if (ptl) {
979 /* unlock when batch completed */
7708ad64 980 xen_mc_callback(xen_pte_unlock, ptl);
74260714 981 }
f4f97b3e
JF
982 }
983
984 return 0; /* never need to flush on unpin */
3b827c1b
JF
985}
986
f4f97b3e 987/* Release a pagetable's pages back as normal RW */
eefb47f6 988static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
f4f97b3e 989{
f4f97b3e
JF
990 xen_mc_batch();
991
74260714 992 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
f4f97b3e 993
d6182fbf
JF
994#ifdef CONFIG_X86_64
995 {
996 pgd_t *user_pgd = xen_get_user_pgd(pgd);
997
998 if (user_pgd) {
f63c2f24
T
999 xen_do_pin(MMUEXT_UNPIN_TABLE,
1000 PFN_DOWN(__pa(user_pgd)));
eefb47f6 1001 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
d6182fbf
JF
1002 }
1003 }
1004#endif
1005
5deb30d1
JF
1006#ifdef CONFIG_X86_PAE
1007 /* Need to make sure unshared kernel PMD is unpinned */
47cb2ed9 1008 xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
eefb47f6 1009 PT_PMD);
5deb30d1 1010#endif
d6182fbf 1011
86bbc2c2 1012 __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
f4f97b3e
JF
1013
1014 xen_mc_issue(0);
1015}
3b827c1b 1016
eefb47f6
JF
1017static void xen_pgd_unpin(struct mm_struct *mm)
1018{
1019 __xen_pgd_unpin(mm, mm->pgd);
1020}
1021
0e91398f
JF
1022/*
1023 * On resume, undo any pinning done at save, so that the rest of the
1024 * kernel doesn't see any unexpected pinned pagetables.
1025 */
1026void xen_mm_unpin_all(void)
1027{
0e91398f
JF
1028 struct page *page;
1029
a79e53d8 1030 spin_lock(&pgd_lock);
0e91398f
JF
1031
1032 list_for_each_entry(page, &pgd_list, lru) {
1033 if (PageSavePinned(page)) {
1034 BUG_ON(!PagePinned(page));
eefb47f6 1035 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
0e91398f
JF
1036 ClearPageSavePinned(page);
1037 }
1038 }
1039
a79e53d8 1040 spin_unlock(&pgd_lock);
0e91398f
JF
1041}
1042
4c13629f 1043static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
3b827c1b 1044{
f4f97b3e 1045 spin_lock(&next->page_table_lock);
eefb47f6 1046 xen_pgd_pin(next);
f4f97b3e 1047 spin_unlock(&next->page_table_lock);
3b827c1b
JF
1048}
1049
4c13629f 1050static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
3b827c1b 1051{
f4f97b3e 1052 spin_lock(&mm->page_table_lock);
eefb47f6 1053 xen_pgd_pin(mm);
f4f97b3e 1054 spin_unlock(&mm->page_table_lock);
3b827c1b
JF
1055}
1056
3b827c1b 1057
f87e4cac
JF
1058#ifdef CONFIG_SMP
1059/* Another cpu may still have its %cr3 pointing at the pagetable, so
1060 we need to repoint it somewhere else before we can unpin it. */
1061static void drop_other_mm_ref(void *info)
1062{
1063 struct mm_struct *mm = info;
ce87b3d3 1064 struct mm_struct *active_mm;
3b827c1b 1065
9eb912d1 1066 active_mm = percpu_read(cpu_tlbstate.active_mm);
ce87b3d3
JF
1067
1068 if (active_mm == mm)
f87e4cac 1069 leave_mm(smp_processor_id());
9f79991d
JF
1070
1071 /* If this cpu still has a stale cr3 reference, then make sure
1072 it has been flushed. */
7fd7d83d 1073 if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
9f79991d 1074 load_cr3(swapper_pg_dir);
f87e4cac 1075}
3b827c1b 1076
7708ad64 1077static void xen_drop_mm_ref(struct mm_struct *mm)
f87e4cac 1078{
e4d98207 1079 cpumask_var_t mask;
9f79991d
JF
1080 unsigned cpu;
1081
f87e4cac
JF
1082 if (current->active_mm == mm) {
1083 if (current->mm == mm)
1084 load_cr3(swapper_pg_dir);
1085 else
1086 leave_mm(smp_processor_id());
9f79991d
JF
1087 }
1088
1089 /* Get the "official" set of cpus referring to our pagetable. */
e4d98207
MT
1090 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1091 for_each_online_cpu(cpu) {
78f1c4d6 1092 if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
e4d98207
MT
1093 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1094 continue;
1095 smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1096 }
1097 return;
1098 }
78f1c4d6 1099 cpumask_copy(mask, mm_cpumask(mm));
9f79991d
JF
1100
1101 /* It's possible that a vcpu may have a stale reference to our
1102 cr3, because it's in lazy mode, and it hasn't yet flushed
1103 its set of pending hypercalls. In this case, we can
1104 look at its actual current cr3 value, and force it to flush
1105 if needed. */
1106 for_each_online_cpu(cpu) {
1107 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
e4d98207 1108 cpumask_set_cpu(cpu, mask);
3b827c1b
JF
1109 }
1110
e4d98207
MT
1111 if (!cpumask_empty(mask))
1112 smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1113 free_cpumask_var(mask);
f87e4cac
JF
1114}
1115#else
7708ad64 1116static void xen_drop_mm_ref(struct mm_struct *mm)
f87e4cac
JF
1117{
1118 if (current->active_mm == mm)
1119 load_cr3(swapper_pg_dir);
1120}
1121#endif
1122
1123/*
1124 * While a process runs, Xen pins its pagetables, which means that the
1125 * hypervisor forces it to be read-only, and it controls all updates
1126 * to it. This means that all pagetable updates have to go via the
1127 * hypervisor, which is moderately expensive.
1128 *
1129 * Since we're pulling the pagetable down, we switch to use init_mm,
1130 * unpin old process pagetable and mark it all read-write, which
1131 * allows further operations on it to be simple memory accesses.
1132 *
1133 * The only subtle point is that another CPU may still be using the
1134 * pagetable because of lazy tlb flushing. This means we need to
1135 * switch all CPUs off this pagetable before we can unpin it.
1136 */
4c13629f 1137static void xen_exit_mmap(struct mm_struct *mm)
f87e4cac
JF
1138{
1139 get_cpu(); /* make sure we don't move around */
7708ad64 1140 xen_drop_mm_ref(mm);
f87e4cac 1141 put_cpu();
3b827c1b 1142
f120f13e 1143 spin_lock(&mm->page_table_lock);
df912ea4
JF
1144
1145 /* pgd may not be pinned in the error exit path of execve */
7708ad64 1146 if (xen_page_pinned(mm->pgd))
eefb47f6 1147 xen_pgd_unpin(mm);
74260714 1148
f120f13e 1149 spin_unlock(&mm->page_table_lock);
3b827c1b 1150}
994025ca 1151
319f3ba5
JF
1152static __init void xen_pagetable_setup_start(pgd_t *base)
1153{
1154}
1155
279b706b
SS
1156static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
1157{
1158 /* reserve the range used */
1159 native_pagetable_reserve(start, end);
1160
1161 /* set as RW the rest */
1162 printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
1163 PFN_PHYS(pgt_buf_top));
1164 while (end < PFN_PHYS(pgt_buf_top)) {
1165 make_lowmem_page_readwrite(__va(end));
1166 end += PAGE_SIZE;
1167 }
1168}
1169
f1d7062a
TG
1170static void xen_post_allocator_init(void);
1171
319f3ba5
JF
1172static __init void xen_pagetable_setup_done(pgd_t *base)
1173{
1174 xen_setup_shared_info();
f1d7062a 1175 xen_post_allocator_init();
319f3ba5
JF
1176}
1177
1178static void xen_write_cr2(unsigned long cr2)
1179{
1180 percpu_read(xen_vcpu)->arch.cr2 = cr2;
1181}
1182
1183static unsigned long xen_read_cr2(void)
1184{
1185 return percpu_read(xen_vcpu)->arch.cr2;
1186}
1187
1188unsigned long xen_read_cr2_direct(void)
1189{
1190 return percpu_read(xen_vcpu_info.arch.cr2);
1191}
1192
1193static void xen_flush_tlb(void)
1194{
1195 struct mmuext_op *op;
1196 struct multicall_space mcs;
1197
1198 preempt_disable();
1199
1200 mcs = xen_mc_entry(sizeof(*op));
1201
1202 op = mcs.args;
1203 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1204 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1205
1206 xen_mc_issue(PARAVIRT_LAZY_MMU);
1207
1208 preempt_enable();
1209}
1210
1211static void xen_flush_tlb_single(unsigned long addr)
1212{
1213 struct mmuext_op *op;
1214 struct multicall_space mcs;
1215
1216 preempt_disable();
1217
1218 mcs = xen_mc_entry(sizeof(*op));
1219 op = mcs.args;
1220 op->cmd = MMUEXT_INVLPG_LOCAL;
1221 op->arg1.linear_addr = addr & PAGE_MASK;
1222 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1223
1224 xen_mc_issue(PARAVIRT_LAZY_MMU);
1225
1226 preempt_enable();
1227}
1228
1229static void xen_flush_tlb_others(const struct cpumask *cpus,
1230 struct mm_struct *mm, unsigned long va)
1231{
1232 struct {
1233 struct mmuext_op op;
1234 DECLARE_BITMAP(mask, NR_CPUS);
1235 } *args;
1236 struct multicall_space mcs;
1237
e3f8a74e
JF
1238 if (cpumask_empty(cpus))
1239 return; /* nothing to do */
319f3ba5
JF
1240
1241 mcs = xen_mc_entry(sizeof(*args));
1242 args = mcs.args;
1243 args->op.arg2.vcpumask = to_cpumask(args->mask);
1244
1245 /* Remove us, and any offline CPUS. */
1246 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1247 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
319f3ba5
JF
1248
1249 if (va == TLB_FLUSH_ALL) {
1250 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1251 } else {
1252 args->op.cmd = MMUEXT_INVLPG_MULTI;
1253 args->op.arg1.linear_addr = va;
1254 }
1255
1256 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1257
319f3ba5
JF
1258 xen_mc_issue(PARAVIRT_LAZY_MMU);
1259}
1260
1261static unsigned long xen_read_cr3(void)
1262{
1263 return percpu_read(xen_cr3);
1264}
1265
1266static void set_current_cr3(void *v)
1267{
1268 percpu_write(xen_current_cr3, (unsigned long)v);
1269}
1270
1271static void __xen_write_cr3(bool kernel, unsigned long cr3)
1272{
1273 struct mmuext_op *op;
1274 struct multicall_space mcs;
1275 unsigned long mfn;
1276
1277 if (cr3)
1278 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1279 else
1280 mfn = 0;
1281
1282 WARN_ON(mfn == 0 && kernel);
1283
1284 mcs = __xen_mc_entry(sizeof(*op));
1285
1286 op = mcs.args;
1287 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1288 op->arg1.mfn = mfn;
1289
1290 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1291
1292 if (kernel) {
1293 percpu_write(xen_cr3, cr3);
1294
1295 /* Update xen_current_cr3 once the batch has actually
1296 been submitted. */
1297 xen_mc_callback(set_current_cr3, (void *)cr3);
1298 }
1299}
1300
1301static void xen_write_cr3(unsigned long cr3)
1302{
1303 BUG_ON(preemptible());
1304
1305 xen_mc_batch(); /* disables interrupts */
1306
1307 /* Update while interrupts are disabled, so it's atomic with
1308 respect to ipis */
1309 percpu_write(xen_cr3, cr3);
1310
1311 __xen_write_cr3(true, cr3);
1312
1313#ifdef CONFIG_X86_64
1314 {
1315 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1316 if (user_pgd)
1317 __xen_write_cr3(false, __pa(user_pgd));
1318 else
1319 __xen_write_cr3(false, 0);
1320 }
1321#endif
1322
1323 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
1324}
1325
1326static int xen_pgd_alloc(struct mm_struct *mm)
1327{
1328 pgd_t *pgd = mm->pgd;
1329 int ret = 0;
1330
1331 BUG_ON(PagePinned(virt_to_page(pgd)));
1332
1333#ifdef CONFIG_X86_64
1334 {
1335 struct page *page = virt_to_page(pgd);
1336 pgd_t *user_pgd;
1337
1338 BUG_ON(page->private != 0);
1339
1340 ret = -ENOMEM;
1341
1342 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1343 page->private = (unsigned long)user_pgd;
1344
1345 if (user_pgd != NULL) {
1346 user_pgd[pgd_index(VSYSCALL_START)] =
1347 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1348 ret = 0;
1349 }
1350
1351 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1352 }
1353#endif
1354
1355 return ret;
1356}
1357
1358static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1359{
1360#ifdef CONFIG_X86_64
1361 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1362
1363 if (user_pgd)
1364 free_page((unsigned long)user_pgd);
1365#endif
1366}
1367
ee176455 1368#ifdef CONFIG_X86_32
1f4f9315
JF
1369static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1370{
1371 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1372 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1373 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1374 pte_val_ma(pte));
ee176455
SS
1375
1376 return pte;
1377}
1378#else /* CONFIG_X86_64 */
1379static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1380{
1381 unsigned long pfn = pte_pfn(pte);
fef5ba79
JF
1382
1383 /*
1384 * If the new pfn is within the range of the newly allocated
1385 * kernel pagetable, and it isn't being mapped into an
d8aa5ec3
SS
1386 * early_ioremap fixmap slot as a freshly allocated page, make sure
1387 * it is RO.
fef5ba79 1388 */
d8aa5ec3 1389 if (((!is_early_ioremap_ptep(ptep) &&
b9269dc7 1390 pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
d8aa5ec3 1391 (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
fef5ba79 1392 pte = pte_wrprotect(pte);
1f4f9315
JF
1393
1394 return pte;
1395}
ee176455 1396#endif /* CONFIG_X86_64 */
1f4f9315
JF
1397
1398/* Init-time set_pte while constructing initial pagetables, which
1399 doesn't allow RO pagetable pages to be remapped RW */
1400static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1401{
1402 pte = mask_rw_pte(ptep, pte);
1403
1404 xen_set_pte(ptep, pte);
1405}
319f3ba5 1406
b96229b5
JF
1407static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1408{
1409 struct mmuext_op op;
1410 op.cmd = cmd;
1411 op.arg1.mfn = pfn_to_mfn(pfn);
1412 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1413 BUG();
1414}
1415
319f3ba5
JF
1416/* Early in boot, while setting up the initial pagetable, assume
1417 everything is pinned. */
1418static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1419{
b96229b5
JF
1420#ifdef CONFIG_FLATMEM
1421 BUG_ON(mem_map); /* should only be used early */
1422#endif
1423 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1424 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1425}
1426
1427/* Used for pmd and pud */
1428static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1429{
319f3ba5
JF
1430#ifdef CONFIG_FLATMEM
1431 BUG_ON(mem_map); /* should only be used early */
1432#endif
1433 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1434}
1435
1436/* Early release_pte assumes that all pts are pinned, since there's
1437 only init_mm and anything attached to that is pinned. */
b96229b5 1438static __init void xen_release_pte_init(unsigned long pfn)
319f3ba5 1439{
b96229b5 1440 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
319f3ba5
JF
1441 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1442}
1443
b96229b5 1444static __init void xen_release_pmd_init(unsigned long pfn)
319f3ba5 1445{
b96229b5 1446 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
319f3ba5
JF
1447}
1448
1449/* This needs to make sure the new pte page is pinned iff it's being
1450 attached to a pinned pagetable. */
1451static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1452{
1453 struct page *page = pfn_to_page(pfn);
1454
1455 if (PagePinned(virt_to_page(mm->pgd))) {
1456 SetPagePinned(page);
1457
319f3ba5
JF
1458 if (!PageHighMem(page)) {
1459 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1460 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1461 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1462 } else {
1463 /* make sure there are no stray mappings of
1464 this page */
1465 kmap_flush_unused();
1466 }
1467 }
1468}
1469
1470static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1471{
1472 xen_alloc_ptpage(mm, pfn, PT_PTE);
1473}
1474
1475static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1476{
1477 xen_alloc_ptpage(mm, pfn, PT_PMD);
1478}
1479
1480/* This should never happen until we're OK to use struct page */
1481static void xen_release_ptpage(unsigned long pfn, unsigned level)
1482{
1483 struct page *page = pfn_to_page(pfn);
1484
1485 if (PagePinned(page)) {
1486 if (!PageHighMem(page)) {
1487 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1488 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1489 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1490 }
1491 ClearPagePinned(page);
1492 }
1493}
1494
1495static void xen_release_pte(unsigned long pfn)
1496{
1497 xen_release_ptpage(pfn, PT_PTE);
1498}
1499
1500static void xen_release_pmd(unsigned long pfn)
1501{
1502 xen_release_ptpage(pfn, PT_PMD);
1503}
1504
1505#if PAGETABLE_LEVELS == 4
1506static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1507{
1508 xen_alloc_ptpage(mm, pfn, PT_PUD);
1509}
1510
1511static void xen_release_pud(unsigned long pfn)
1512{
1513 xen_release_ptpage(pfn, PT_PUD);
1514}
1515#endif
1516
1517void __init xen_reserve_top(void)
1518{
1519#ifdef CONFIG_X86_32
1520 unsigned long top = HYPERVISOR_VIRT_START;
1521 struct xen_platform_parameters pp;
1522
1523 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1524 top = pp.virt_start;
1525
1526 reserve_top_address(-top);
1527#endif /* CONFIG_X86_32 */
1528}
1529
1530/*
1531 * Like __va(), but returns address in the kernel mapping (which is
1532 * all we have until the physical memory mapping has been set up).
1533 */
1534static void *__ka(phys_addr_t paddr)
1535{
1536#ifdef CONFIG_X86_64
1537 return (void *)(paddr + __START_KERNEL_map);
1538#else
1539 return __va(paddr);
1540#endif
1541}
1542
1543/* Convert a machine address to physical address */
1544static unsigned long m2p(phys_addr_t maddr)
1545{
1546 phys_addr_t paddr;
1547
1548 maddr &= PTE_PFN_MASK;
1549 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1550
1551 return paddr;
1552}
1553
1554/* Convert a machine address to kernel virtual */
1555static void *m2v(phys_addr_t maddr)
1556{
1557 return __ka(m2p(maddr));
1558}
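Hypothetical usage of the helpers above (the real call sites are in xen_setup_kernel_pagetable() below): entries in the Xen-provided boot pagetable hold machine addresses, so following the tree one level down means converting each entry back to a kernel-virtual pointer with m2v().

/* Illustration only: the pgd entries in the Xen-provided pagetable are
   machine addresses, so convert before dereferencing. */
static void *example_boot_next_level(pgd_t *pgd, unsigned long va)
{
	return m2v(pgd[pgd_index(va)].pgd);
}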
1559
4ec5387c 1560/* Set the page permissions on identity-mapped pages */
319f3ba5
JF
1561static void set_page_prot(void *addr, pgprot_t prot)
1562{
1563 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1564 pte_t pte = pfn_pte(pfn, prot);
1565
1566 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1567 BUG();
1568}
1569
1570static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1571{
1572 unsigned pmdidx, pteidx;
1573 unsigned ident_pte;
1574 unsigned long pfn;
1575
764f0138
JF
1576 level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1577 PAGE_SIZE);
1578
319f3ba5
JF
1579 ident_pte = 0;
1580 pfn = 0;
1581 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1582 pte_t *pte_page;
1583
1584 /* Reuse or allocate a page of ptes */
1585 if (pmd_present(pmd[pmdidx]))
1586 pte_page = m2v(pmd[pmdidx].pmd);
1587 else {
1588 /* Check for free pte pages */
764f0138 1589 if (ident_pte == LEVEL1_IDENT_ENTRIES)
319f3ba5
JF
1590 break;
1591
1592 pte_page = &level1_ident_pgt[ident_pte];
1593 ident_pte += PTRS_PER_PTE;
1594
1595 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1596 }
1597
1598 /* Install mappings */
1599 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1600 pte_t pte;
1601
319f3ba5
JF
1602 if (!pte_none(pte_page[pteidx]))
1603 continue;
1604
1605 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1606 pte_page[pteidx] = pte;
1607 }
1608 }
1609
1610 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1611 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1612
1613 set_page_prot(pmd, PAGE_KERNEL_RO);
1614}
1615
7e77506a
IC
1616void __init xen_setup_machphys_mapping(void)
1617{
1618 struct xen_machphys_mapping mapping;
1619 unsigned long machine_to_phys_nr_ents;
1620
1621 if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
1622 machine_to_phys_mapping = (unsigned long *)mapping.v_start;
1623 machine_to_phys_nr_ents = mapping.max_mfn + 1;
1624 } else {
1625 machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
1626 }
1627 machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
1628}
1629
319f3ba5
JF
1630#ifdef CONFIG_X86_64
1631static void convert_pfn_mfn(void *v)
1632{
1633 pte_t *pte = v;
1634 int i;
1635
1636 /* All levels are converted the same way, so just treat them
1637 as ptes. */
1638 for (i = 0; i < PTRS_PER_PTE; i++)
1639 pte[i] = xen_make_pte(pte[i].pte);
1640}
1641
1642/*
0d2eb44f 1643 * Set up the initial kernel pagetable.
319f3ba5
JF
1644 *
1645 * We can construct this by grafting the Xen provided pagetable into
1646 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1647 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1648 * means that only the kernel has a physical mapping to start with -
1649 * but that's enough to get __va working. We need to fill in the rest
1650 * of the physical mapping once some sort of allocator has been set
1651 * up.
1652 */
1653__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1654 unsigned long max_pfn)
1655{
1656 pud_t *l3;
1657 pmd_t *l2;
1658
14988a4d
SS
1659 /* max_pfn_mapped is the last pfn mapped in the initial memory
1660 * mappings. Considering that on Xen after the kernel mappings we
1661 * have the mappings of some pages that don't exist in pfn space, we
1662 * set max_pfn_mapped to the last real pfn mapped. */
1663 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1664
319f3ba5
JF
1665 /* Zap identity mapping */
1666 init_level4_pgt[0] = __pgd(0);
1667
1668 /* Pre-constructed entries are in pfn, so convert to mfn */
1669 convert_pfn_mfn(init_level4_pgt);
1670 convert_pfn_mfn(level3_ident_pgt);
1671 convert_pfn_mfn(level3_kernel_pgt);
1672
1673 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1674 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1675
1676 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1677 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1678
1679 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1680 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1681 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1682
1683 /* Set up identity map */
1684 xen_map_identity_early(level2_ident_pgt, max_pfn);
1685
1686 /* Make pagetable pieces RO */
1687 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1688 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1689 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1690 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1691 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1692 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1693
1694 /* Pin down new L4 */
1695 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1696 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1697
1698 /* Unpin Xen-provided one */
1699 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1700
1701 /* Switch over */
1702 pgd = init_level4_pgt;
1703
1704 /*
1705 * At this stage there can be no user pgd, and no page
1706 * structure to attach it to, so make sure we just set kernel
1707 * pgd.
1708 */
1709 xen_mc_batch();
1710 __xen_write_cr3(true, __pa(pgd));
1711 xen_mc_issue(PARAVIRT_LAZY_CPU);
1712
a9ce6bc1 1713 memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
319f3ba5
JF
1714 __pa(xen_start_info->pt_base +
1715 xen_start_info->nr_pt_frames * PAGE_SIZE),
1716 "XEN PAGETABLES");
1717
1718 return pgd;
1719}
1720#else /* !CONFIG_X86_64 */
5b5c1af1
IC
1721static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
1722static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
1723
1724static __init void xen_write_cr3_init(unsigned long cr3)
1725{
1726 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
1727
1728 BUG_ON(read_cr3() != __pa(initial_page_table));
1729 BUG_ON(cr3 != __pa(swapper_pg_dir));
1730
1731 /*
1732 * We are switching to swapper_pg_dir for the first time (from
1733 * initial_page_table) and therefore need to mark that page
1734 * read-only and then pin it.
1735 *
1736 * Xen disallows sharing of kernel PMDs for PAE
1737 * guests. Therefore we must copy the kernel PMD from
1738 * initial_page_table into a new kernel PMD to be used in
1739 * swapper_pg_dir.
1740 */
1741 swapper_kernel_pmd =
1742 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1743 memcpy(swapper_kernel_pmd, initial_kernel_pmd,
1744 sizeof(pmd_t) * PTRS_PER_PMD);
1745 swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
1746 __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
1747 set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
1748
1749 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1750 xen_write_cr3(cr3);
1751 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
1752
1753 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
1754 PFN_DOWN(__pa(initial_page_table)));
1755 set_page_prot(initial_page_table, PAGE_KERNEL);
1756 set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
1757
1758 pv_mmu_ops.write_cr3 = &xen_write_cr3;
1759}
319f3ba5
JF
1760
1761__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1762 unsigned long max_pfn)
1763{
1764 pmd_t *kernel_pmd;
1765
5b5c1af1
IC
1766 initial_kernel_pmd =
1767 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
f0991802 1768
14988a4d 1769 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
319f3ba5
JF
1770
1771 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
5b5c1af1 1772 memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
319f3ba5 1773
5b5c1af1 1774 xen_map_identity_early(initial_kernel_pmd, max_pfn);
319f3ba5 1775
5b5c1af1
IC
1776 memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1777 initial_page_table[KERNEL_PGD_BOUNDARY] =
1778 __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
319f3ba5 1779
5b5c1af1
IC
1780 set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
1781 set_page_prot(initial_page_table, PAGE_KERNEL_RO);
319f3ba5
JF
1782 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1783
1784 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1785
5b5c1af1
IC
1786 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
1787 PFN_DOWN(__pa(initial_page_table)));
1788 xen_write_cr3(__pa(initial_page_table));
319f3ba5 1789
a9ce6bc1 1790 memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
33df4db0
JF
1791 __pa(xen_start_info->pt_base +
1792 xen_start_info->nr_pt_frames * PAGE_SIZE),
1793 "XEN PAGETABLES");
1794
5b5c1af1 1795 return initial_page_table;
319f3ba5
JF
1796}
1797#endif /* CONFIG_X86_64 */
1798
98511f35
JF
1799static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
1800
3b3809ac 1801static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
319f3ba5
JF
1802{
1803 pte_t pte;
1804
1805 phys >>= PAGE_SHIFT;
1806
1807 switch (idx) {
1808 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1809#ifdef CONFIG_X86_F00F_BUG
1810 case FIX_F00F_IDT:
1811#endif
1812#ifdef CONFIG_X86_32
1813 case FIX_WP_TEST:
1814 case FIX_VDSO:
1815# ifdef CONFIG_HIGHMEM
1816 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1817# endif
1818#else
1819 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
319f3ba5 1820#endif
3ecb1b7d
JF
1821 case FIX_TEXT_POKE0:
1822 case FIX_TEXT_POKE1:
1823 /* All local page mappings */
319f3ba5
JF
1824 pte = pfn_pte(phys, prot);
1825 break;
1826
98511f35
JF
1827#ifdef CONFIG_X86_LOCAL_APIC
1828 case FIX_APIC_BASE: /* maps dummy local APIC */
1829 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
1830 break;
1831#endif
1832
1833#ifdef CONFIG_X86_IO_APIC
1834 case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
 1835 /*
 1836 * We just don't map the IO APIC - all access is via hypercalls.
 1837 * Point the pte at the dummy page instead of the real IO APIC.
 1838 */
1839 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
1840 break;
1841#endif
1842
c0011dbf
JF
1843 case FIX_PARAVIRT_BOOTMAP:
1844 /* This is an MFN, but it isn't an IO mapping from the
1845 IO domain */
319f3ba5
JF
1846 pte = mfn_pte(phys, prot);
1847 break;
c0011dbf
JF
1848
1849 default:
1850 /* By default, set_fixmap is used for hardware mappings */
1851 pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
1852 break;
319f3ba5
JF
1853 }
1854
1855 __native_set_fixmap(idx, pte);
1856
1857#ifdef CONFIG_X86_64
 1858 /* Replicate the change into the user pagetable's vsyscall
 1859 mapping as well. */
1860 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1861 unsigned long vaddr = __fix_to_virt(idx);
1862 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1863 }
1864#endif
1865}
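/*
 * Illustrative note (an assumption, not part of this file): a generic
 * call such as set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE) is
 * routed through pv_mmu_ops.set_fixmap to xen_set_fixmap() above,
 * which substitutes a pte pointing at dummy_mapping rather than the
 * real local APIC, since APIC accesses are handled by Xen.
 */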
1866
4ec5387c
JQ
1867__init void xen_ident_map_ISA(void)
1868{
1869 unsigned long pa;
1870
1871 /*
1872 * If we're dom0, then linear map the ISA machine addresses into
1873 * the kernel's address space.
1874 */
1875 if (!xen_initial_domain())
1876 return;
1877
1878 xen_raw_printk("Xen: setup ISA identity maps\n");
1879
1880 for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) {
1881 pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO);
1882
1883 if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0))
1884 BUG();
1885 }
1886
1887 xen_flush_tlb();
1888}
1889
f1d7062a 1890static __init void xen_post_allocator_init(void)
319f3ba5 1891{
fc25151d
KRW
1892#ifdef CONFIG_XEN_DEBUG
1893 pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
1894#endif
319f3ba5
JF
1895 pv_mmu_ops.set_pte = xen_set_pte;
1896 pv_mmu_ops.set_pmd = xen_set_pmd;
1897 pv_mmu_ops.set_pud = xen_set_pud;
1898#if PAGETABLE_LEVELS == 4
1899 pv_mmu_ops.set_pgd = xen_set_pgd;
1900#endif
1901
1902 /* This will work as long as patching hasn't happened yet
1903 (which it hasn't) */
1904 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1905 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1906 pv_mmu_ops.release_pte = xen_release_pte;
1907 pv_mmu_ops.release_pmd = xen_release_pmd;
1908#if PAGETABLE_LEVELS == 4
1909 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1910 pv_mmu_ops.release_pud = xen_release_pud;
1911#endif
1912
1913#ifdef CONFIG_X86_64
1914 SetPagePinned(virt_to_page(level3_user_vsyscall));
1915#endif
1916 xen_mark_init_mm_pinned();
1917}
1918
b407fc57
JF
1919static void xen_leave_lazy_mmu(void)
1920{
5caecb94 1921 preempt_disable();
b407fc57
JF
1922 xen_mc_flush();
1923 paravirt_leave_lazy_mmu();
5caecb94 1924 preempt_enable();
b407fc57 1925}
319f3ba5 1926
030cb6c0 1927static const struct pv_mmu_ops xen_mmu_ops __initdata = {
319f3ba5
JF
1928 .read_cr2 = xen_read_cr2,
1929 .write_cr2 = xen_write_cr2,
1930
1931 .read_cr3 = xen_read_cr3,
5b5c1af1
IC
1932#ifdef CONFIG_X86_32
1933 .write_cr3 = xen_write_cr3_init,
1934#else
319f3ba5 1935 .write_cr3 = xen_write_cr3,
5b5c1af1 1936#endif
319f3ba5
JF
1937
1938 .flush_tlb_user = xen_flush_tlb,
1939 .flush_tlb_kernel = xen_flush_tlb,
1940 .flush_tlb_single = xen_flush_tlb_single,
1941 .flush_tlb_others = xen_flush_tlb_others,
1942
1943 .pte_update = paravirt_nop,
1944 .pte_update_defer = paravirt_nop,
1945
1946 .pgd_alloc = xen_pgd_alloc,
1947 .pgd_free = xen_pgd_free,
1948
1949 .alloc_pte = xen_alloc_pte_init,
1950 .release_pte = xen_release_pte_init,
b96229b5 1951 .alloc_pmd = xen_alloc_pmd_init,
b96229b5 1952 .release_pmd = xen_release_pmd_init,
319f3ba5 1953
319f3ba5 1954 .set_pte = xen_set_pte_init,
319f3ba5
JF
1955 .set_pte_at = xen_set_pte_at,
1956 .set_pmd = xen_set_pmd_hyper,
1957
1958 .ptep_modify_prot_start = __ptep_modify_prot_start,
1959 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1960
da5de7c2
JF
1961 .pte_val = PV_CALLEE_SAVE(xen_pte_val),
1962 .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
319f3ba5 1963
da5de7c2
JF
1964 .make_pte = PV_CALLEE_SAVE(xen_make_pte),
1965 .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
319f3ba5
JF
1966
1967#ifdef CONFIG_X86_PAE
1968 .set_pte_atomic = xen_set_pte_atomic,
319f3ba5
JF
1969 .pte_clear = xen_pte_clear,
1970 .pmd_clear = xen_pmd_clear,
1971#endif /* CONFIG_X86_PAE */
1972 .set_pud = xen_set_pud_hyper,
1973
da5de7c2
JF
1974 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
1975 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
319f3ba5
JF
1976
1977#if PAGETABLE_LEVELS == 4
da5de7c2
JF
1978 .pud_val = PV_CALLEE_SAVE(xen_pud_val),
1979 .make_pud = PV_CALLEE_SAVE(xen_make_pud),
319f3ba5
JF
1980 .set_pgd = xen_set_pgd_hyper,
1981
b96229b5
JF
1982 .alloc_pud = xen_alloc_pmd_init,
1983 .release_pud = xen_release_pmd_init,
319f3ba5
JF
1984#endif /* PAGETABLE_LEVELS == 4 */
1985
1986 .activate_mm = xen_activate_mm,
1987 .dup_mmap = xen_dup_mmap,
1988 .exit_mmap = xen_exit_mmap,
1989
1990 .lazy_mode = {
1991 .enter = paravirt_enter_lazy_mmu,
b407fc57 1992 .leave = xen_leave_lazy_mmu,
319f3ba5
JF
1993 },
1994
1995 .set_fixmap = xen_set_fixmap,
1996};
1997
030cb6c0
TG
1998void __init xen_init_mmu_ops(void)
1999{
279b706b 2000 x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
030cb6c0
TG
2001 x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
2002 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
2003 pv_mmu_ops = xen_mmu_ops;
d2cb2145 2004
98511f35 2005 memset(dummy_mapping, 0xff, PAGE_SIZE);
030cb6c0 2006}
319f3ba5 2007
08bbc9da
AN
2008/* Protected by xen_reservation_lock. */
2009#define MAX_CONTIG_ORDER 9 /* 2MB */
2010static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2011
2012#define VOID_PTE (mfn_pte(0, __pgprot(0)))
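/*
 * Clear the ptes covering (1 << order) pages starting at vaddr and
 * mark the corresponding pfns invalid in the p2m.  The previous mfns
 * can be recorded in in_frames and the pfns in out_frames, ready to be
 * handed to a XENMEM_exchange call.
 */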
2013static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2014 unsigned long *in_frames,
2015 unsigned long *out_frames)
2016{
2017 int i;
2018 struct multicall_space mcs;
2019
2020 xen_mc_batch();
2021 for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2022 mcs = __xen_mc_entry(0);
2023
2024 if (in_frames)
2025 in_frames[i] = virt_to_mfn(vaddr);
2026
2027 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
6eaa412f 2028 __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
08bbc9da
AN
2029
2030 if (out_frames)
2031 out_frames[i] = virt_to_pfn(vaddr);
2032 }
2033 xen_mc_issue(0);
2034}
2035
2036/*
2037 * Update the pfn-to-mfn mappings for a virtual address range, either to
2038 * point to an array of mfns, or contiguously from a single starting
2039 * mfn.
2040 */
2041static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2042 unsigned long *mfns,
2043 unsigned long first_mfn)
2044{
2045 unsigned i, limit;
2046 unsigned long mfn;
2047
2048 xen_mc_batch();
2049
2050 limit = 1u << order;
2051 for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2052 struct multicall_space mcs;
2053 unsigned flags;
2054
2055 mcs = __xen_mc_entry(0);
2056 if (mfns)
2057 mfn = mfns[i];
2058 else
2059 mfn = first_mfn + i;
2060
2061 if (i < (limit - 1))
2062 flags = 0;
2063 else {
2064 if (order == 0)
2065 flags = UVMF_INVLPG | UVMF_ALL;
2066 else
2067 flags = UVMF_TLB_FLUSH | UVMF_ALL;
2068 }
2069
2070 MULTI_update_va_mapping(mcs.mc, vaddr,
2071 mfn_pte(mfn, PAGE_KERNEL), flags);
2072
2073 set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2074 }
2075
2076 xen_mc_issue(0);
2077}
2078
2079/*
2080 * Perform the hypercall to exchange a region of our pfns to point to
2081 * memory with the required contiguous alignment. Takes the pfns as
2082 * input, and populates mfns as output.
2083 *
 2084 * Returns 1 if the hypervisor was able to satisfy the entire
 2085 * request, 0 otherwise.
2086 */
2087static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2088 unsigned long *pfns_in,
2089 unsigned long extents_out,
2090 unsigned int order_out,
2091 unsigned long *mfns_out,
2092 unsigned int address_bits)
2093{
2094 long rc;
2095 int success;
2096
2097 struct xen_memory_exchange exchange = {
2098 .in = {
2099 .nr_extents = extents_in,
2100 .extent_order = order_in,
2101 .extent_start = pfns_in,
2102 .domid = DOMID_SELF
2103 },
2104 .out = {
2105 .nr_extents = extents_out,
2106 .extent_order = order_out,
2107 .extent_start = mfns_out,
2108 .address_bits = address_bits,
2109 .domid = DOMID_SELF
2110 }
2111 };
2112
2113 BUG_ON(extents_in << order_in != extents_out << order_out);
2114
2115 rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2116 success = (exchange.nr_exchanged == extents_in);
2117
2118 BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2119 BUG_ON(success && (rc != 0));
2120
2121 return success;
2122}
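/*
 * Worked example (illustrative): xen_create_contiguous_region() below
 * invokes this as xen_exchange_memory(1UL << order, 0, in_frames,
 * 1, order, &out_frame, address_bits); for order 9 that trades 512
 * single-page extents for one 512-page (2MB) extent, so
 * extents_in << order_in == extents_out << order_out == 512 pages.
 */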
2123
2124int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
2125 unsigned int address_bits)
2126{
2127 unsigned long *in_frames = discontig_frames, out_frame;
2128 unsigned long flags;
2129 int success;
2130
2131 /*
2132 * Currently an auto-translated guest will not perform I/O, nor will
2133 * it require PAE page directories below 4GB. Therefore any calls to
2134 * this function are redundant and can be ignored.
2135 */
2136
2137 if (xen_feature(XENFEAT_auto_translated_physmap))
2138 return 0;
2139
2140 if (unlikely(order > MAX_CONTIG_ORDER))
2141 return -ENOMEM;
2142
2143 memset((void *) vstart, 0, PAGE_SIZE << order);
2144
08bbc9da
AN
2145 spin_lock_irqsave(&xen_reservation_lock, flags);
2146
2147 /* 1. Zap current PTEs, remembering MFNs. */
2148 xen_zap_pfn_range(vstart, order, in_frames, NULL);
2149
2150 /* 2. Get a new contiguous memory extent. */
2151 out_frame = virt_to_pfn(vstart);
2152 success = xen_exchange_memory(1UL << order, 0, in_frames,
2153 1, order, &out_frame,
2154 address_bits);
2155
2156 /* 3. Map the new extent in place of old pages. */
2157 if (success)
2158 xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2159 else
2160 xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2161
2162 spin_unlock_irqrestore(&xen_reservation_lock, flags);
2163
2164 return success ? 0 : -ENOMEM;
2165}
2166EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2167
2168void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
2169{
2170 unsigned long *out_frames = discontig_frames, in_frame;
2171 unsigned long flags;
2172 int success;
2173
2174 if (xen_feature(XENFEAT_auto_translated_physmap))
2175 return;
2176
2177 if (unlikely(order > MAX_CONTIG_ORDER))
2178 return;
2179
2180 memset((void *) vstart, 0, PAGE_SIZE << order);
2181
08bbc9da
AN
2182 spin_lock_irqsave(&xen_reservation_lock, flags);
2183
2184 /* 1. Find start MFN of contiguous extent. */
2185 in_frame = virt_to_mfn(vstart);
2186
2187 /* 2. Zap current PTEs. */
2188 xen_zap_pfn_range(vstart, order, NULL, out_frames);
2189
2190 /* 3. Do the exchange for non-contiguous MFNs. */
2191 success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2192 0, out_frames, 0);
2193
2194 /* 4. Map new pages in place of old pages. */
2195 if (success)
2196 xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2197 else
2198 xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2199
2200 spin_unlock_irqrestore(&xen_reservation_lock, flags);
030cb6c0 2201}
08bbc9da 2202EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
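/*
 * Hedged usage sketch (not part of this file; names are illustrative
 * assumptions): a Xen-aware driver needing a machine-contiguous,
 * 32-bit-addressable buffer could exchange an ordinary order-2
 * allocation with the hypervisor and hand it back when done.
 */
static unsigned long example_contig_buf;

static int example_alloc_contig_buffer(void)
{
	int rc;

	example_contig_buf = __get_free_pages(GFP_KERNEL, 2);	/* 4 pages */
	if (!example_contig_buf)
		return -ENOMEM;

	/* Make the backing MFNs contiguous and below 4GB (32 bits). */
	rc = xen_create_contiguous_region(example_contig_buf, 2, 32);
	if (rc) {
		free_pages(example_contig_buf, 2);
		return rc;
	}
	return 0;
}

static void example_free_contig_buffer(void)
{
	/* Return the contiguous extent before freeing the pages. */
	xen_destroy_contiguous_region(example_contig_buf, 2);
	free_pages(example_contig_buf, 2);
}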
319f3ba5 2203
ca65f9fc 2204#ifdef CONFIG_XEN_PVHVM
59151001
SS
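/*
 * Tell Xen that an HVM guest pagetable is being torn down, so the
 * hypervisor can drop the shadow pagetables it keeps for that guest
 * physical address right away rather than letting them age out.
 */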
2205static void xen_hvm_exit_mmap(struct mm_struct *mm)
2206{
2207 struct xen_hvm_pagetable_dying a;
2208 int rc;
2209
2210 a.domid = DOMID_SELF;
2211 a.gpa = __pa(mm->pgd);
2212 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2213 WARN_ON_ONCE(rc < 0);
2214}
2215
2216static int is_pagetable_dying_supported(void)
2217{
2218 struct xen_hvm_pagetable_dying a;
2219 int rc = 0;
2220
2221 a.domid = DOMID_SELF;
2222 a.gpa = 0x00;
2223 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2224 if (rc < 0) {
2225 printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2226 return 0;
2227 }
2228 return 1;
2229}
2230
2231void __init xen_hvm_init_mmu_ops(void)
2232{
2233 if (is_pagetable_dying_supported())
2234 pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
2235}
ca65f9fc 2236#endif
59151001 2237
de1ef206
IC
2238#define REMAP_BATCH_SIZE 16
2239
2240struct remap_data {
2241 unsigned long mfn;
2242 pgprot_t prot;
2243 struct mmu_update *mmu_update;
2244};
2245
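/*
 * apply_to_page_range() callback: queue one mmu_update entry per pte,
 * mapping successive machine frames starting at rmd->mfn with the
 * protection in rmd->prot.
 */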
2246static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2247 unsigned long addr, void *data)
2248{
2249 struct remap_data *rmd = data;
2250 pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
2251
d5108316 2252 rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
de1ef206
IC
2253 rmd->mmu_update->val = pte_val_ma(pte);
2254 rmd->mmu_update++;
2255
2256 return 0;
2257}
2258
2259int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2260 unsigned long addr,
2261 unsigned long mfn, int nr,
2262 pgprot_t prot, unsigned domid)
2263{
2264 struct remap_data rmd;
2265 struct mmu_update mmu_update[REMAP_BATCH_SIZE];
2266 int batch;
2267 unsigned long range;
2268 int err = 0;
2269
2270 prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
2271
e060e7af
SS
2272 BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) ==
2273 (VM_PFNMAP | VM_RESERVED | VM_IO)));
de1ef206
IC
2274
2275 rmd.mfn = mfn;
2276 rmd.prot = prot;
2277
2278 while (nr) {
2279 batch = min(REMAP_BATCH_SIZE, nr);
2280 range = (unsigned long)batch << PAGE_SHIFT;
2281
2282 rmd.mmu_update = mmu_update;
2283 err = apply_to_page_range(vma->vm_mm, addr, range,
2284 remap_area_mfn_pte_fn, &rmd);
2285 if (err)
2286 goto out;
2287
2288 err = -EFAULT;
2289 if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0)
2290 goto out;
2291
2292 nr -= batch;
2293 addr += range;
2294 }
2295
2296 err = 0;
2297out:
2298
2299 flush_tlb_all();
2300
2301 return err;
2302}
2303EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
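/*
 * Hedged usage sketch (an assumption, not part of this file): a
 * privcmd-style mmap handler could map nr_frames foreign machine
 * frames, starting at first_mfn, into an already prepared VMA.  The
 * helper name and parameters are illustrative.
 */
static inline int example_map_foreign_frames(struct vm_area_struct *vma,
					     unsigned long va,
					     unsigned long first_mfn,
					     int nr_frames, domid_t domid)
{
	/* The VMA must already be marked VM_IO | VM_PFNMAP | VM_RESERVED. */
	return xen_remap_domain_mfn_range(vma, va, first_mfn, nr_frames,
					  vma->vm_page_prot, domid);
}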
2304
2222e71b
KRW
2305static int p2m_dump_open(struct inode *inode, struct file *filp)
2306{
2307 return single_open(filp, p2m_dump_show, NULL);
2308}
2309
2310static const struct file_operations p2m_dump_fops = {
2311 .open = p2m_dump_open,
2312 .read = seq_read,
2313 .llseek = seq_lseek,
2314 .release = single_release,
2315};