arch/x86/xen/mmu.c

   1 /*
   2  * Xen mmu operations
   3  *
   4  * This file contains the various mmu fetch and update operations.
   5  * The most important job they must perform is the mapping between the
   6  * domain's pfn and the overall machine mfns.
   7  *
   8  * Xen allows guests to directly update the pagetable, in a controlled
   9  * fashion.  In other words, the guest modifies the same pagetable
  10  * that the CPU actually uses, which eliminates the overhead of having
  11  * a separate shadow pagetable.
  12  *
  13  * In order to allow this, it falls on the guest domain to map its
  14  * notion of a "physical" pfn - which is just a domain-local linear
  15  * address - into a real "machine address" which the CPU's MMU can
  16  * use.
  17  *
  18  * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
  19  * inserted directly into the pagetable.  When creating a new
  20  * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
  21  * when reading the content back with __(pgd|pmd|pte)_val, it converts
  22  * the mfn back into a pfn.
  23  *
  24  * The other constraint is that all pages which make up a pagetable
  25  * must be mapped read-only in the guest.  This prevents uncontrolled
  26  * guest updates to the pagetable.  Xen strictly enforces this, and
  27  * will disallow any pagetable update which will end up mapping a
  28  * pagetable page RW, and will disallow using any writable page as a
  29  * pagetable.
  30  *
  31  * Naively, when loading %cr3 with the base of a new pagetable, Xen
  32  * would need to validate the whole pagetable before going on.
  33  * Naturally, this is quite slow.  The solution is to "pin" a
  34  * pagetable, which enforces all the constraints on the pagetable even
   35  * when it is not actively in use.  This means that Xen can be assured
  36  * that it is still valid when you do load it into %cr3, and doesn't
  37  * need to revalidate it.
  38  *
  39  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  40  */
  41 #include <linux/sched.h>
  42 #include <linux/highmem.h>
  43 #include <linux/debugfs.h>
  44 #include <linux/bug.h>
  45 #include <linux/vmalloc.h>
  46 #include <linux/module.h>
  47 #include <linux/gfp.h>
  48
  49 #include <asm/pgtable.h>
  50 #include <asm/tlbflush.h>
  51 #include <asm/fixmap.h>
  52 #include <asm/mmu_context.h>
  53 #include <asm/setup.h>
  54 #include <asm/paravirt.h>
  55 #include <asm/e820.h>
  56 #include <asm/linkage.h>
  57 #include <asm/page.h>
  58
  59 #include <asm/xen/hypercall.h>
  60 #include <asm/xen/hypervisor.h>
  61
  62 #include <xen/xen.h>
  63 #include <xen/page.h>
  64 #include <xen/interface/xen.h>
  65 #include <xen/interface/hvm/hvm_op.h>
  66 #include <xen/interface/version.h>
  67 #include <xen/interface/memory.h>
  68 #include <xen/hvc-console.h>
  69
  70 #include "multicalls.h"
  71 #include "mmu.h"
  72 #include "debugfs.h"
  73
/*
 * Number of buckets in mmu_stats.mmu_update_histo.  The histogram is
 * indexed by the update count of an mmu_update multicall entry (see
 * xen_extend_mmu_update()); it only exists with CONFIG_XEN_DEBUG_FS.
 */
#define MMU_UPDATE_HISTO        30

/*
 * Protects atomic reservation decrease/increase against concurrent increases.
 * Also protects non-atomic updates of current_pages and driver_pages, and
 * balloon lists.
 */
DEFINE_SPINLOCK(xen_reservation_lock);
  82
  83 #ifdef CONFIG_XEN_DEBUG_FS
  84
  85 static struct {
  86         u32 pgd_update;
  87         u32 pgd_update_pinned;
  88         u32 pgd_update_batched;
  89
  90         u32 pud_update;
  91         u32 pud_update_pinned;
  92         u32 pud_update_batched;
  93
  94         u32 pmd_update;
  95         u32 pmd_update_pinned;
  96         u32 pmd_update_batched;
  97
  98         u32 pte_update;
  99         u32 pte_update_pinned;
 100         u32 pte_update_batched;
 101
 102         u32 mmu_update;
 103         u32 mmu_update_extended;
 104         u32 mmu_update_histo[MMU_UPDATE_HISTO];
 105
 106         u32 prot_commit;
 107         u32 prot_commit_batched;
 108
 109         u32 set_pte_at;
 110         u32 set_pte_at_batched;
 111         u32 set_pte_at_pinned;
 112         u32 set_pte_at_current;
 113         u32 set_pte_at_kernel;
 114 } mmu_stats;
 115
 116 static u8 zero_stats;
 117
 118 static inline void check_zero(void)
 119 {
 120         if (unlikely(zero_stats)) {
 121                 memset(&mmu_stats, 0, sizeof(mmu_stats));
 122                 zero_stats = 0;
 123         }
 124 }
 125
 126 #define ADD_STATS(elem, val)                    \
 127         do { check_zero(); mmu_stats.elem += (val); } while(0)
 128
 129 #else  /* !CONFIG_XEN_DEBUG_FS */
 130
 131 #define ADD_STATS(elem, val)    do { (void)(val); } while(0)
 132
 133 #endif /* CONFIG_XEN_DEBUG_FS */
 134
 135
/*
 * Identity map, in addition to plain kernel map.  This needs to be
 * large enough to allocate page table pages to allocate the rest.
 * Each page can map 2MB.  Four pages' worth of ptes are reserved
 * here (PTRS_PER_PTE * 4 entries), page-aligned in bss.
 */
static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;

#ifdef CONFIG_X86_64
/* l3 pud for userspace vsyscall mapping */
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
#endif /* CONFIG_X86_64 */
 147
/*
 * Note about cr3 (pagetable base) values:
 *
 * xen_cr3 contains the current logical cr3 value; it contains the
 * last set cr3.  This may not be the current effective cr3, because
 * its update may be being lazily deferred.  However, a vcpu looking
 * at its own cr3 can use this value knowing that everything will
 * be self-consistent.
 *
 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
 * hypercall to set the vcpu cr3 is complete (so it may be a little
 * out of date, but it will never be set early).  If one vcpu is
 * looking at another vcpu's cr3 value, it should use this variable.
 */
DEFINE_PER_CPU(unsigned long, xen_cr3);  /* cr3 stored as physaddr */
DEFINE_PER_CPU(unsigned long, xen_current_cr3);  /* actual vcpu cr3 */
 164
 165
/*
 * Just beyond the highest usermode address.  STACK_TOP_MAX has a
 * redzone above it, so round it up to a PGD boundary.
 */
#define USER_LIMIT      ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)


/* Number of p2m entries (one unsigned long each) held by a single page. */
#define P2M_ENTRIES_PER_PAGE    (PAGE_SIZE / sizeof(unsigned long))
/* Number of leaf pages needed to cover MAX_DOMAIN_PAGES pfns. */
#define TOP_ENTRIES             (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
 175
/* Placeholder for holes in the address space */
static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_ENTRIES_PER_PAGE);

/* Array of pointers to pages containing p2m entries */
static RESERVE_BRK_ARRAY(unsigned long *, p2m_top, TOP_ENTRIES);

/* Arrays of p2m arrays expressed in mfns used for save/restore */
static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, TOP_ENTRIES);

/* mfns of the pages that make up p2m_top_mfn, one entry per page */
static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn_list,
			 (TOP_ENTRIES / P2M_ENTRIES_PER_PAGE));
 187
 188 static inline unsigned p2m_top_index(unsigned long pfn)
 189 {
 190         BUG_ON(pfn >= MAX_DOMAIN_PAGES);
 191         return pfn / P2M_ENTRIES_PER_PAGE;
 192 }
 193
 194 static inline unsigned p2m_index(unsigned long pfn)
 195 {
 196         return pfn % P2M_ENTRIES_PER_PAGE;
 197 }
 198
 199 /* Build the parallel p2m_top_mfn structures */
 200 void xen_build_mfn_list_list(void)
 201 {
 202         unsigned pfn, idx;
 203
 204         for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
 205                 unsigned topidx = p2m_top_index(pfn);
 206
 207                 p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
 208         }
 209
 210         for (idx = 0; idx < TOP_ENTRIES/P2M_ENTRIES_PER_PAGE; idx++) {
 211                 unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
 212                 p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
 213         }
 214 }
 215
/*
 * Publish the p2m frame list and the domain's page count through the
 * shared info page: pfn_to_mfn_frame_list_list gets the mfn of
 * p2m_top_mfn_list and max_pfn gets xen_start_info->nr_pages.
 *
 * Calling this while HYPERVISOR_shared_info still points at
 * xen_dummy_shared_info is a bug.
 */
void xen_setup_mfn_list_list(void)
{
	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);

	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
		virt_to_mfn(p2m_top_mfn_list);
	HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
}
 224
 225 /* Set up p2m_top to point to the domain-builder provided p2m pages */
 226 void __init xen_build_dynamic_phys_to_machine(void)
 227 {
 228         unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
 229         unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
 230         unsigned pfn;
 231         unsigned i;
 232
 233         p2m_missing = extend_brk(sizeof(*p2m_missing) * P2M_ENTRIES_PER_PAGE,
 234                                  PAGE_SIZE);
 235         for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
 236                 p2m_missing[i] = ~0UL;
 237
 238         p2m_top = extend_brk(sizeof(*p2m_top) * TOP_ENTRIES,
 239                              PAGE_SIZE);
 240         for (i = 0; i < TOP_ENTRIES; i++)
 241                 p2m_top[i] = p2m_missing;
 242
 243         p2m_top_mfn = extend_brk(sizeof(*p2m_top_mfn) * TOP_ENTRIES, PAGE_SIZE);
 244         p2m_top_mfn_list = extend_brk(sizeof(*p2m_top_mfn_list) *
 245                                       (TOP_ENTRIES / P2M_ENTRIES_PER_PAGE),
 246                                       PAGE_SIZE);
 247
 248         for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
 249                 unsigned topidx = p2m_top_index(pfn);
 250
 251                 p2m_top[topidx] = &mfn_list[pfn];
 252         }
 253
 254         xen_build_mfn_list_list();
 255 }
 256
 257 unsigned long get_phys_to_machine(unsigned long pfn)
 258 {
 259         unsigned topidx, idx;
 260
 261         if (unlikely(pfn >= MAX_DOMAIN_PAGES))
 262                 return INVALID_P2M_ENTRY;
 263
 264         topidx = p2m_top_index(pfn);
 265         idx = p2m_index(pfn);
 266         return p2m_top[topidx][idx];
 267 }
 268 EXPORT_SYMBOL_GPL(get_phys_to_machine);
 269
 270 /* install a  new p2m_top page */
 271 bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
 272 {
 273         unsigned topidx = p2m_top_index(pfn);
 274         unsigned long **pfnp, *mfnp;
 275         unsigned i;
 276
 277         pfnp = &p2m_top[topidx];
 278         mfnp = &p2m_top_mfn[topidx];
 279
 280         for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
 281                 p[i] = INVALID_P2M_ENTRY;
 282
 283         if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
 284                 *mfnp = virt_to_mfn(p);
 285                 return true;
 286         }
 287
 288         return false;
 289 }
 290
 291 static void alloc_p2m(unsigned long pfn)
 292 {
 293         unsigned long *p;
 294
 295         p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
 296         BUG_ON(p == NULL);
 297
 298         if (!install_p2mtop_page(pfn, p))
 299                 free_page((unsigned long)p);
 300 }
 301
 302 /* Try to install p2m mapping; fail if intermediate bits missing */
 303 bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 304 {
 305         unsigned topidx, idx;
 306
 307         if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
 308                 BUG_ON(mfn != INVALID_P2M_ENTRY);
 309                 return true;
 310         }
 311
 312         topidx = p2m_top_index(pfn);
 313         if (p2m_top[topidx] == p2m_missing) {
 314                 if (mfn == INVALID_P2M_ENTRY)
 315                         return true;
 316                 return false;
 317         }
 318
 319         idx = p2m_index(pfn);
 320         p2m_top[topidx][idx] = mfn;
 321
 322         return true;
 323 }
 324
 325 void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 326 {
 327         if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
 328                 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
 329                 return;
 330         }
 331
 332         if (unlikely(!__set_phys_to_machine(pfn, mfn)))  {
 333                 alloc_p2m(pfn);
 334
 335                 if (!__set_phys_to_machine(pfn, mfn))
 336                         BUG();
 337         }
 338 }
 339
 340 unsigned long arbitrary_virt_to_mfn(void *vaddr)
 341 {
 342         xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
 343
 344         return PFN_DOWN(maddr.maddr);
 345 }
 346
 347 xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 348 {
 349         unsigned long address = (unsigned long)vaddr;
 350         unsigned int level;
 351         pte_t *pte;
 352         unsigned offset;
 353
 354         /*
 355          * if the PFN is in the linear mapped vaddr range, we can just use
 356          * the (quick) virt_to_machine() p2m lookup
 357          */
 358         if (virt_addr_valid(vaddr))
 359                 return virt_to_machine(vaddr);
 360
 361         /* otherwise we have to do a (slower) full page-table walk */
 362
 363         pte = lookup_address(address, &level);
 364         BUG_ON(pte == NULL);
 365         offset = address & ~PAGE_MASK;
 366         return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
 367 }
 368
 369 void make_lowmem_page_readonly(void *vaddr)
 370 {
 371         pte_t *pte, ptev;
 372         unsigned long address = (unsigned long)vaddr;
 373         unsigned int level;
 374
 375         pte = lookup_address(address, &level);
 376         BUG_ON(pte == NULL);
 377
 378         ptev = pte_wrprotect(*pte);
 379
 380         if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 381                 BUG();
 382 }
 383
 384 void make_lowmem_page_readwrite(void *vaddr)
 385 {
 386         pte_t *pte, ptev;
 387         unsigned long address = (unsigned long)vaddr;
 388         unsigned int level;
 389
 390         pte = lookup_address(address, &level);
 391         BUG_ON(pte == NULL);
 392
 393         ptev = pte_mkwrite(*pte);
 394
 395         if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 396                 BUG();
 397 }
 398
 399
 400 static bool xen_page_pinned(void *ptr)
 401 {
 402         struct page *page = virt_to_page(ptr);
 403
 404         return PagePinned(page);
 405 }
 406
 407 static bool xen_iomap_pte(pte_t pte)
 408 {
 409         return pte_flags(pte) & _PAGE_IOMAP;
 410 }
 411
 412 static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
 413 {
 414         struct multicall_space mcs;
 415         struct mmu_update *u;
 416
 417         mcs = xen_mc_entry(sizeof(*u));
 418         u = mcs.args;
 419
 420         /* ptep might be kmapped when using 32-bit HIGHPTE */
 421         u->ptr = arbitrary_virt_to_machine(ptep).maddr;
 422         u->val = pte_val_ma(pteval);
 423
 424         MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO);
 425
 426         xen_mc_issue(PARAVIRT_LAZY_MMU);
 427 }
 428
 429 static void xen_extend_mmu_update(const struct mmu_update *update)
 430 {
 431         struct multicall_space mcs;
 432         struct mmu_update *u;
 433
 434         mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
 435
 436         if (mcs.mc != NULL) {
 437                 ADD_STATS(mmu_update_extended, 1);
 438                 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
 439
 440                 mcs.mc->args[1]++;
 441
 442                 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
 443                         ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
 444                 else
 445                         ADD_STATS(mmu_update_histo[0], 1);
 446         } else {
 447                 ADD_STATS(mmu_update, 1);
 448                 mcs = __xen_mc_entry(sizeof(*u));
 449                 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
 450                 ADD_STATS(mmu_update_histo[1], 1);
 451         }
 452
 453         u = mcs.args;
 454         *u = *update;
 455 }
 456
 457 void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 458 {
 459         struct mmu_update u;
 460
 461         preempt_disable();
 462
 463         xen_mc_batch();
 464
 465         /* ptr may be ioremapped for 64-bit pagetable setup */
 466         u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 467         u.val = pmd_val_ma(val);
 468         xen_extend_mmu_update(&u);
 469
 470         ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 471
 472         xen_mc_issue(PARAVIRT_LAZY_MMU);
 473
 474         preempt_enable();
 475 }
 476
 477 void xen_set_pmd(pmd_t *ptr, pmd_t val)
 478 {
 479         ADD_STATS(pmd_update, 1);
 480
 481         /* If page is not pinned, we can just update the entry
 482            directly */
 483         if (!xen_page_pinned(ptr)) {
 484                 *ptr = val;
 485                 return;
 486         }
 487
 488         ADD_STATS(pmd_update_pinned, 1);
 489
 490         xen_set_pmd_hyper(ptr, val);
 491 }
 492
 493 /*
 494  * Associate a virtual page frame with a given physical page frame
 495  * and protection flags for that frame.
 496  */
 497 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 498 {
 499         set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
 500 }
 501
 502 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 503                     pte_t *ptep, pte_t pteval)
 504 {
 505         if (xen_iomap_pte(pteval)) {
 506                 xen_set_iomap_pte(ptep, pteval);
 507                 goto out;
 508         }
 509
 510         ADD_STATS(set_pte_at, 1);
 511 //      ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
 512         ADD_STATS(set_pte_at_current, mm == current->mm);
 513         ADD_STATS(set_pte_at_kernel, mm == &init_mm);
 514
 515         if (mm == current->mm || mm == &init_mm) {
 516                 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
 517                         struct multicall_space mcs;
 518                         mcs = xen_mc_entry(0);
 519
 520                         MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
 521                         ADD_STATS(set_pte_at_batched, 1);
 522                         xen_mc_issue(PARAVIRT_LAZY_MMU);
 523                         goto out;
 524                 } else
 525                         if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
 526                                 goto out;
 527         }
 528         xen_set_pte(ptep, pteval);
 529
 530 out:    return;
 531 }
 532
 533 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
 534                                  unsigned long addr, pte_t *ptep)
 535 {
 536         /* Just return the pte as-is.  We preserve the bits on commit */
 537         return *ptep;
 538 }
 539
 540 void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 541                                  pte_t *ptep, pte_t pte)
 542 {
 543         struct mmu_update u;
 544
 545         xen_mc_batch();
 546
 547         u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
 548         u.val = pte_val_ma(pte);
 549         xen_extend_mmu_update(&u);
 550
 551         ADD_STATS(prot_commit, 1);
 552         ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 553
 554         xen_mc_issue(PARAVIRT_LAZY_MMU);
 555 }
 556
 557 /* Assume pteval_t is equivalent to all the other *val_t types. */
 558 static pteval_t pte_mfn_to_pfn(pteval_t val)
 559 {
 560         if (val & _PAGE_PRESENT) {
 561                 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 562                 pteval_t flags = val & PTE_FLAGS_MASK;
 563                 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
 564         }
 565
 566         return val;
 567 }
 568
 569 static pteval_t pte_pfn_to_mfn(pteval_t val)
 570 {
 571         if (val & _PAGE_PRESENT) {
 572                 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 573                 pteval_t flags = val & PTE_FLAGS_MASK;
 574                 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
 575         }
 576
 577         return val;
 578 }
 579
 580 static pteval_t iomap_pte(pteval_t val)
 581 {
 582         if (val & _PAGE_PRESENT) {
 583                 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 584                 pteval_t flags = val & PTE_FLAGS_MASK;
 585
 586                 /* We assume the pte frame number is a MFN, so
 587                    just use it as-is. */
 588                 val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
 589         }
 590
 591         return val;
 592 }
 593
 594 pteval_t xen_pte_val(pte_t pte)
 595 {
 596         if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP))
 597                 return pte.pte;
 598
 599         return pte_mfn_to_pfn(pte.pte);
 600 }
 601 PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
 602
 603 pgdval_t xen_pgd_val(pgd_t pgd)
 604 {
 605         return pte_mfn_to_pfn(pgd.pgd);
 606 }
 607 PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
 608
 609 pte_t xen_make_pte(pteval_t pte)
 610 {
 611         phys_addr_t addr = (pte & PTE_PFN_MASK);
 612
 613         /*
 614          * Unprivileged domains are allowed to do IOMAPpings for
 615          * PCI passthrough, but not map ISA space.  The ISA
 616          * mappings are just dummy local mappings to keep other
 617          * parts of the kernel happy.
 618          */
 619         if (unlikely(pte & _PAGE_IOMAP) &&
 620             (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
 621                 pte = iomap_pte(pte);
 622         } else {
 623                 pte &= ~_PAGE_IOMAP;
 624                 pte = pte_pfn_to_mfn(pte);
 625         }
 626
 627         return native_make_pte(pte);
 628 }
 629 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
 630
 631 pgd_t xen_make_pgd(pgdval_t pgd)
 632 {
 633         pgd = pte_pfn_to_mfn(pgd);
 634         return native_make_pgd(pgd);
 635 }
 636 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
 637
 638 pmdval_t xen_pmd_val(pmd_t pmd)
 639 {
 640         return pte_mfn_to_pfn(pmd.pmd);
 641 }
 642 PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
 643
 644 void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 645 {
 646         struct mmu_update u;
 647
 648         preempt_disable();
 649
 650         xen_mc_batch();
 651
 652         /* ptr may be ioremapped for 64-bit pagetable setup */
 653         u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 654         u.val = pud_val_ma(val);
 655         xen_extend_mmu_update(&u);
 656
 657         ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 658
 659         xen_mc_issue(PARAVIRT_LAZY_MMU);
 660
 661         preempt_enable();
 662 }
 663
 664 void xen_set_pud(pud_t *ptr, pud_t val)
 665 {
 666         ADD_STATS(pud_update, 1);
 667
 668         /* If page is not pinned, we can just update the entry
 669            directly */
 670         if (!xen_page_pinned(ptr)) {
 671                 *ptr = val;
 672                 return;
 673         }
 674
 675         ADD_STATS(pud_update_pinned, 1);
 676
 677         xen_set_pud_hyper(ptr, val);
 678 }
 679
/*
 * Store a pte.  I/O mappings are handed to xen_set_iomap_pte();
 * everything else is written straight into *ptep.
 */
void xen_set_pte(pte_t *ptep, pte_t pte)
{
	if (xen_iomap_pte(pte)) {
		xen_set_iomap_pte(ptep, pte);
		return;
	}

	ADD_STATS(pte_update, 1);
//      ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
	ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);

#ifdef CONFIG_X86_PAE
	/*
	 * The entry is written as two halves: high word first, then a
	 * write barrier, then the low word.
	 */
	ptep->pte_high = pte.pte_high;
	smp_wmb();
	ptep->pte_low = pte.pte_low;
#else
	*ptep = pte;
#endif
}
 699
 700 #ifdef CONFIG_X86_PAE
 701 void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 702 {
 703         if (xen_iomap_pte(pte)) {
 704                 xen_set_iomap_pte(ptep, pte);
 705                 return;
 706         }
 707
 708         set_64bit((u64 *)ptep, native_pte_val(pte));
 709 }
 710
/*
 * Clear a PAE pte by zeroing its two halves in order: low word, write
 * barrier, high word.  mm and addr are not used.
 */
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	ptep->pte_low = 0;
	smp_wmb();              /* make sure low gets written first */
	ptep->pte_high = 0;
}
 717
 718 void xen_pmd_clear(pmd_t *pmdp)
 719 {
 720         set_pmd(pmdp, __pmd(0));
 721 }
 722 #endif  /* CONFIG_X86_PAE */
 723
 724 pmd_t xen_make_pmd(pmdval_t pmd)
 725 {
 726         pmd = pte_pfn_to_mfn(pmd);
 727         return native_make_pmd(pmd);
 728 }
 729 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
 730
 731 #if PAGETABLE_LEVELS == 4
 732 pudval_t xen_pud_val(pud_t pud)
 733 {
 734         return pte_mfn_to_pfn(pud.pud);
 735 }
 736 PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
 737
 738 pud_t xen_make_pud(pudval_t pud)
 739 {
 740         pud = pte_pfn_to_mfn(pud);
 741
 742         return native_make_pud(pud);
 743 }
 744 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
 745
 746 pgd_t *xen_get_user_pgd(pgd_t *pgd)
 747 {
 748         pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
 749         unsigned offset = pgd - pgd_page;
 750         pgd_t *user_ptr = NULL;
 751
 752         if (offset < pgd_index(USER_LIMIT)) {
 753                 struct page *page = virt_to_page(pgd_page);
 754                 user_ptr = (pgd_t *)page->private;
 755                 if (user_ptr)
 756                         user_ptr += offset;
 757         }
 758
 759         return user_ptr;
 760 }
 761
 762 static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 763 {
 764         struct mmu_update u;
 765
 766         u.ptr = virt_to_machine(ptr).maddr;
 767         u.val = pgd_val_ma(val);
 768         xen_extend_mmu_update(&u);
 769 }
 770
 771 /*
 772  * Raw hypercall-based set_pgd, intended for in early boot before
 773  * there's a page structure.  This implies:
 774  *  1. The only existing pagetable is the kernel's
 775  *  2. It is always pinned
 776  *  3. It has no user pagetable attached to it
 777  */
 778 void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 779 {
 780         preempt_disable();
 781
 782         xen_mc_batch();
 783
 784         __xen_set_pgd_hyper(ptr, val);
 785
 786         xen_mc_issue(PARAVIRT_LAZY_MMU);
 787
 788         preempt_enable();
 789 }
 790
 791 void xen_set_pgd(pgd_t *ptr, pgd_t val)
 792 {
 793         pgd_t *user_ptr = xen_get_user_pgd(ptr);
 794
 795         ADD_STATS(pgd_update, 1);
 796
 797         /* If page is not pinned, we can just update the entry
 798            directly */
 799         if (!xen_page_pinned(ptr)) {
 800                 *ptr = val;
 801                 if (user_ptr) {
 802                         WARN_ON(xen_page_pinned(user_ptr));
 803                         *user_ptr = val;
 804                 }
 805                 return;
 806         }
 807
 808         ADD_STATS(pgd_update_pinned, 1);
 809         ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 810
 811         /* If it's pinned, then we can at least batch the kernel and
 812            user updates together. */
 813         xen_mc_batch();
 814
 815         __xen_set_pgd_hyper(ptr, val);
 816         if (user_ptr)
 817                 __xen_set_pgd_hyper(user_ptr, val);
 818
 819         xen_mc_issue(PARAVIRT_LAZY_MMU);
 820 }
 821 #endif  /* PAGETABLE_LEVELS == 4 */
 822
/*
 * (Yet another) pagetable walker.  This one is intended for pinning a
 * pagetable.  This means that it walks a pagetable and calls the
 * callback function on each page it finds making up the page table,
 * at every level.  It walks the entire pagetable, but it only bothers
 * pinning pte pages which are below limit.  In the normal case this
 * will be STACK_TOP_MAX, but at boot we need to pin up to
 * FIXADDR_TOP.
 *
 * For 32-bit the important bit is that we don't pin beyond there,
 * because then we start getting into Xen's ptes.
 *
 * For 64-bit, we must skip the Xen hole in the middle of the address
 * space, just after the big x86-64 virtual hole.
 *
 * @mm:    passed through to the callback unchanged
 * @pgd:   top-level table to walk
 * @func:  callback invoked with the struct page of each pagetable
 *         page found and the level that page sits at
 * @limit: first address NOT covered by the walk (exclusive)
 *
 * Returns the bitwise OR of all callback return values, or 0 without
 * walking anything when XENFEAT_auto_translated_physmap is set.
 */
static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
			  int (*func)(struct mm_struct *mm, struct page *,
				      enum pt_level),
			  unsigned long limit)
{
	int flush = 0;
	unsigned hole_low, hole_high;
	unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
	unsigned pgdidx, pudidx, pmdidx;

	/* The limit is the last byte to be touched */
	limit--;
	BUG_ON(limit >= FIXADDR_TOP);

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	/*
	 * 64-bit has a great big hole in the middle of the address
	 * space, which contains the Xen mappings.  On 32-bit these
	 * will end up making a zero-sized hole and so is a no-op.
	 */
	hole_low = pgd_index(USER_LIMIT);
	hole_high = pgd_index(PAGE_OFFSET);

	/*
	 * Per-level index of the last entry to visit; a folded level
	 * has only index 0.
	 */
	pgdidx_limit = pgd_index(limit);
#if PTRS_PER_PUD > 1
	pudidx_limit = pud_index(limit);
#else
	pudidx_limit = 0;
#endif
#if PTRS_PER_PMD > 1
	pmdidx_limit = pmd_index(limit);
#else
	pmdidx_limit = 0;
#endif

	for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
		pud_t *pud;

		/* Skip pgd slots that fall inside the hole. */
		if (pgdidx >= hole_low && pgdidx < hole_high)
			continue;

		/* Nothing hangs off an empty pgd entry. */
		if (!pgd_val(pgd[pgdidx]))
			continue;

		pud = pud_offset(&pgd[pgdidx], 0);

		if (PTRS_PER_PUD > 1) /* not folded */
			flush |= (*func)(mm, virt_to_page(pud), PT_PUD);

		for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
			pmd_t *pmd;

			/* Past the limit within the last pgd slot: done. */
			if (pgdidx == pgdidx_limit &&
			    pudidx > pudidx_limit)
				goto out;

			if (pud_none(pud[pudidx]))
				continue;

			pmd = pmd_offset(&pud[pudidx], 0);

			if (PTRS_PER_PMD > 1) /* not folded */
				flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);

			for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
				struct page *pte;

				/* Past the limit within the last pud slot: done. */
				if (pgdidx == pgdidx_limit &&
				    pudidx == pudidx_limit &&
				    pmdidx > pmdidx_limit)
					goto out;

				if (pmd_none(pmd[pmdidx]))
					continue;

				/* The pte page this pmd entry points at. */
				pte = pmd_page(pmd[pmdidx]);
				flush |= (*func)(mm, pte, PT_PTE);
			}
		}
	}

out:
	/* Do the top level last, so that the callbacks can use it as
	   a cue to do final things like tlb flushes. */
	flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);

	return flush;
}
 928
 929 static int xen_pgd_walk(struct mm_struct *mm,
 930                         int (*func)(struct mm_struct *mm, struct page *,
 931                                     enum pt_level),
 932                         unsigned long limit)
 933 {
 934         return __xen_pgd_walk(mm, mm->pgd, func, limit);
 935 }
 936
 937 /* If we're using split pte locks, then take the page's lock and
 938    return a pointer to it.  Otherwise return NULL. */
 939 static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
 940 {
 941         spinlock_t *ptl = NULL;
 942
 943 #if USE_SPLIT_PTLOCKS
 944         ptl = __pte_lockptr(page);
 945         spin_lock_nest_lock(ptl, &mm->page_table_lock);
 946 #endif
 947
 948         return ptl;
 949 }
 950
 951 static void xen_pte_unlock(void *v)
 952 {
 953         spinlock_t *ptl = v;
 954         spin_unlock(ptl);
 955 }
 956
 957 static void xen_do_pin(unsigned level, unsigned long pfn)
 958 {
 959         struct mmuext_op *op;
 960         struct multicall_space mcs;
 961
 962         mcs = __xen_mc_entry(sizeof(*op));
 963         op = mcs.args;
 964         op->cmd = level;
 965         op->arg1.mfn = pfn_to_mfn(pfn);
 966         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 967 }
 968
 969 static int xen_pin_page(struct mm_struct *mm, struct page *page,
 970                         enum pt_level level)
 971 {
 972         unsigned pgfl = TestSetPagePinned(page);
 973         int flush;
 974
 975         if (pgfl)
 976                 flush = 0;              /* already pinned */
 977         else if (PageHighMem(page))
 978                 /* kmaps need flushing if we found an unpinned
 979                    highpage */
 980                 flush = 1;
 981         else {
 982                 void *pt = lowmem_page_address(page);
 983                 unsigned long pfn = page_to_pfn(page);
 984                 struct multicall_space mcs = __xen_mc_entry(0);
 985                 spinlock_t *ptl;
 986
 987                 flush = 0;
 988
 989                 /*
 990                  * We need to hold the pagetable lock between the time
 991                  * we make the pagetable RO and when we actually pin
 992                  * it.  If we don't, then other users may come in and
 993                  * attempt to update the pagetable by writing it,
 994                  * which will fail because the memory is RO but not
 995                  * pinned, so Xen won't do the trap'n'emulate.
 996                  *
 997                  * If we're using split pte locks, we can't hold the
 998                  * entire pagetable's worth of locks during the
 999                  * traverse, because we may wrap the preempt count (8
1000                  * bits).  The solution is to mark RO and pin each PTE
1001                  * page while holding the lock.  This means the number
1002                  * of locks we end up holding is never more than a
1003                  * batch size (~32 entries, at present).
1004                  *
1005                  * If we're not using split pte locks, we needn't pin
1006                  * the PTE pages independently, because we're
1007                  * protected by the overall pagetable lock.
1008                  */
1009                 ptl = NULL;
1010                 if (level == PT_PTE)
1011                         ptl = xen_pte_lock(page, mm);
1012
1013                 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
1014                                         pfn_pte(pfn, PAGE_KERNEL_RO),
1015                                         level == PT_PGD ? UVMF_TLB_FLUSH : 0);
1016
1017                 if (ptl) {
1018                         xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
1019
1020                         /* Queue a deferred unlock for when this batch
1021                            is completed. */
1022                         xen_mc_callback(xen_pte_unlock, ptl);
1023                 }
1024         }
1025
1026         return flush;
1027 }
1028
1029 /* This is called just after a mm has been created, but it has not
1030    been used yet.  We need to make sure that its pagetable is all
1031    read-only, and can be pinned. */
1032 static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
1033 {
1034         xen_mc_batch();
1035
1036         if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
1037                 /* re-enable interrupts for flushing */
1038                 xen_mc_issue(0);
1039
1040                 kmap_flush_unused();
1041
1042                 xen_mc_batch();
1043         }
1044
1045 #ifdef CONFIG_X86_64
1046         {
1047                 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1048
1049                 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
1050
1051                 if (user_pgd) {
1052                         xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
1053                         xen_do_pin(MMUEXT_PIN_L4_TABLE,
1054                                    PFN_DOWN(__pa(user_pgd)));
1055                 }
1056         }
1057 #else /* CONFIG_X86_32 */
1058 #ifdef CONFIG_X86_PAE
1059         /* Need to make sure unshared kernel PMD is pinnable */
1060         xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
1061                      PT_PMD);
1062 #endif
1063         xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
1064 #endif /* CONFIG_X86_64 */
1065         xen_mc_issue(0);
1066 }
1067
1068 static void xen_pgd_pin(struct mm_struct *mm)
1069 {
1070         __xen_pgd_pin(mm, mm->pgd);
1071 }
1072
1073 /*
1074  * On save, we need to pin all pagetables to make sure they get their
1075  * mfns turned into pfns.  Search the list for any unpinned pgds and pin
1076  * them (unpinned pgds are not currently in use, probably because the
1077  * process is under construction or destruction).
1078  *
1079  * Expected to be called in stop_machine() ("equivalent to taking
1080  * every spinlock in the system"), so the locking doesn't really
1081  * matter all that much.
1082  */
1083 void xen_mm_pin_all(void)
1084 {
1085         unsigned long flags;
1086         struct page *page;
1087
1088         spin_lock_irqsave(&pgd_lock, flags);
1089
1090         list_for_each_entry(page, &pgd_list, lru) {
1091                 if (!PagePinned(page)) {
1092                         __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
1093                         SetPageSavePinned(page);
1094                 }
1095         }
1096
1097         spin_unlock_irqrestore(&pgd_lock, flags);
1098 }
1099
1100 /*
1101  * The init_mm pagetable is really pinned as soon as its created, but
1102  * that's before we have page structures to store the bits.  So do all
1103  * the book-keeping now.
1104  */
1105 static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
1106                                   enum pt_level level)
1107 {
1108         SetPagePinned(page);
1109         return 0;
1110 }
1111
1112 static void __init xen_mark_init_mm_pinned(void)
1113 {
1114         xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
1115 }
1116
1117 static int xen_unpin_page(struct mm_struct *mm, struct page *page,
1118                           enum pt_level level)
1119 {
1120         unsigned pgfl = TestClearPagePinned(page);
1121
1122         if (pgfl && !PageHighMem(page)) {
1123                 void *pt = lowmem_page_address(page);
1124                 unsigned long pfn = page_to_pfn(page);
1125                 spinlock_t *ptl = NULL;
1126                 struct multicall_space mcs;
1127
1128                 /*
1129                  * Do the converse to pin_page.  If we're using split
1130                  * pte locks, we must be holding the lock for while
1131                  * the pte page is unpinned but still RO to prevent
1132                  * concurrent updates from seeing it in this
1133                  * partially-pinned state.
1134                  */
1135                 if (level == PT_PTE) {
1136                         ptl = xen_pte_lock(page, mm);
1137
1138                         if (ptl)
1139                                 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
1140                 }
1141
1142                 mcs = __xen_mc_entry(0);
1143
1144                 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
1145                                         pfn_pte(pfn, PAGE_KERNEL),
1146                                         level == PT_PGD ? UVMF_TLB_FLUSH : 0);
1147
1148                 if (ptl) {
1149                         /* unlock when batch completed */
1150                         xen_mc_callback(xen_pte_unlock, ptl);
1151                 }
1152         }
1153
1154         return 0;               /* never need to flush on unpin */
1155 }
1156
1157 /* Release a pagetables pages back as normal RW */
1158 static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
1159 {
1160         xen_mc_batch();
1161
1162         xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1163
1164 #ifdef CONFIG_X86_64
1165         {
1166                 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1167
1168                 if (user_pgd) {
1169                         xen_do_pin(MMUEXT_UNPIN_TABLE,
1170                                    PFN_DOWN(__pa(user_pgd)));
1171                         xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
1172                 }
1173         }
1174 #endif
1175
1176 #ifdef CONFIG_X86_PAE
1177         /* Need to make sure unshared kernel PMD is unpinned */
1178         xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
1179                        PT_PMD);
1180 #endif
1181
1182         __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
1183
1184         xen_mc_issue(0);
1185 }
1186
1187 static void xen_pgd_unpin(struct mm_struct *mm)
1188 {
1189         __xen_pgd_unpin(mm, mm->pgd);
1190 }
1191
1192 /*
1193  * On resume, undo any pinning done at save, so that the rest of the
1194  * kernel doesn't see any unexpected pinned pagetables.
1195  */
1196 void xen_mm_unpin_all(void)
1197 {
1198         unsigned long flags;
1199         struct page *page;
1200
1201         spin_lock_irqsave(&pgd_lock, flags);
1202
1203         list_for_each_entry(page, &pgd_list, lru) {
1204                 if (PageSavePinned(page)) {
1205                         BUG_ON(!PagePinned(page));
1206                         __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
1207                         ClearPageSavePinned(page);
1208                 }
1209         }
1210
1211         spin_unlock_irqrestore(&pgd_lock, flags);
1212 }
1213
1214 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
1215 {
1216         spin_lock(&next->page_table_lock);
1217         xen_pgd_pin(next);
1218         spin_unlock(&next->page_table_lock);
1219 }
1220
1221 void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1222 {
1223         spin_lock(&mm->page_table_lock);
1224         xen_pgd_pin(mm);
1225         spin_unlock(&mm->page_table_lock);
1226 }
1227
1228
1229 #ifdef CONFIG_SMP
1230 /* Another cpu may still have their %cr3 pointing at the pagetable, so
1231    we need to repoint it somewhere else before we can unpin it. */
1232 static void drop_other_mm_ref(void *info)
1233 {
1234         struct mm_struct *mm = info;
1235         struct mm_struct *active_mm;
1236
1237         active_mm = percpu_read(cpu_tlbstate.active_mm);
1238
1239         if (active_mm == mm)
1240                 leave_mm(smp_processor_id());
1241
1242         /* If this cpu still has a stale cr3 reference, then make sure
1243            it has been flushed. */
1244         if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
1245                 load_cr3(swapper_pg_dir);
1246 }
1247
1248 static void xen_drop_mm_ref(struct mm_struct *mm)
1249 {
1250         cpumask_var_t mask;
1251         unsigned cpu;
1252
1253         if (current->active_mm == mm) {
1254                 if (current->mm == mm)
1255                         load_cr3(swapper_pg_dir);
1256                 else
1257                         leave_mm(smp_processor_id());
1258         }
1259
1260         /* Get the "official" set of cpus referring to our pagetable. */
1261         if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1262                 for_each_online_cpu(cpu) {
1263                         if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1264                             && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1265                                 continue;
1266                         smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1267                 }
1268                 return;
1269         }
1270         cpumask_copy(mask, mm_cpumask(mm));
1271
1272         /* It's possible that a vcpu may have a stale reference to our
1273            cr3, because its in lazy mode, and it hasn't yet flushed
1274            its set of pending hypercalls yet.  In this case, we can
1275            look at its actual current cr3 value, and force it to flush
1276            if needed. */
1277         for_each_online_cpu(cpu) {
1278                 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1279                         cpumask_set_cpu(cpu, mask);
1280         }
1281
1282         if (!cpumask_empty(mask))
1283                 smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1284         free_cpumask_var(mask);
1285 }
1286 #else
1287 static void xen_drop_mm_ref(struct mm_struct *mm)
1288 {
1289         if (current->active_mm == mm)
1290                 load_cr3(swapper_pg_dir);
1291 }
1292 #endif
1293
1294 /*
1295  * While a process runs, Xen pins its pagetables, which means that the
1296  * hypervisor forces it to be read-only, and it controls all updates
1297  * to it.  This means that all pagetable updates have to go via the
1298  * hypervisor, which is moderately expensive.
1299  *
1300  * Since we're pulling the pagetable down, we switch to use init_mm,
1301  * unpin old process pagetable and mark it all read-write, which
1302  * allows further operations on it to be simple memory accesses.
1303  *
1304  * The only subtle point is that another CPU may be still using the
1305  * pagetable because of lazy tlb flushing.  This means we need need to
1306  * switch all CPUs off this pagetable before we can unpin it.
1307  */
1308 void xen_exit_mmap(struct mm_struct *mm)
1309 {
1310         get_cpu();              /* make sure we don't move around */
1311         xen_drop_mm_ref(mm);
1312         put_cpu();
1313
1314         spin_lock(&mm->page_table_lock);
1315
1316         /* pgd may not be pinned in the error exit path of execve */
1317         if (xen_page_pinned(mm->pgd))
1318                 xen_pgd_unpin(mm);
1319
1320         spin_unlock(&mm->page_table_lock);
1321 }
1322
1323 static __init void xen_pagetable_setup_start(pgd_t *base)
1324 {
1325 }
1326
1327 static void xen_post_allocator_init(void);
1328
1329 static __init void xen_pagetable_setup_done(pgd_t *base)
1330 {
1331         xen_setup_shared_info();
1332         xen_post_allocator_init();
1333 }
1334
1335 static void xen_write_cr2(unsigned long cr2)
1336 {
1337         percpu_read(xen_vcpu)->arch.cr2 = cr2;
1338 }
1339
1340 static unsigned long xen_read_cr2(void)
1341 {
1342         return percpu_read(xen_vcpu)->arch.cr2;
1343 }
1344
1345 unsigned long xen_read_cr2_direct(void)
1346 {
1347         return percpu_read(xen_vcpu_info.arch.cr2);
1348 }
1349
1350 static void xen_flush_tlb(void)
1351 {
1352         struct mmuext_op *op;
1353         struct multicall_space mcs;
1354
1355         preempt_disable();
1356
1357         mcs = xen_mc_entry(sizeof(*op));
1358
1359         op = mcs.args;
1360         op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1361         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1362
1363         xen_mc_issue(PARAVIRT_LAZY_MMU);
1364
1365         preempt_enable();
1366 }
1367
1368 static void xen_flush_tlb_single(unsigned long addr)
1369 {
1370         struct mmuext_op *op;
1371         struct multicall_space mcs;
1372
1373         preempt_disable();
1374
1375         mcs = xen_mc_entry(sizeof(*op));
1376         op = mcs.args;
1377         op->cmd = MMUEXT_INVLPG_LOCAL;
1378         op->arg1.linear_addr = addr & PAGE_MASK;
1379         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1380
1381         xen_mc_issue(PARAVIRT_LAZY_MMU);
1382
1383         preempt_enable();
1384 }
1385
1386 static void xen_flush_tlb_others(const struct cpumask *cpus,
1387                                  struct mm_struct *mm, unsigned long va)
1388 {
1389         struct {
1390                 struct mmuext_op op;
1391                 DECLARE_BITMAP(mask, NR_CPUS);
1392         } *args;
1393         struct multicall_space mcs;
1394
1395         if (cpumask_empty(cpus))
1396                 return;         /* nothing to do */
1397
1398         mcs = xen_mc_entry(sizeof(*args));
1399         args = mcs.args;
1400         args->op.arg2.vcpumask = to_cpumask(args->mask);
1401
1402         /* Remove us, and any offline CPUS. */
1403         cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1404         cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1405
1406         if (va == TLB_FLUSH_ALL) {
1407                 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1408         } else {
1409                 args->op.cmd = MMUEXT_INVLPG_MULTI;
1410                 args->op.arg1.linear_addr = va;
1411         }
1412
1413         MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1414
1415         xen_mc_issue(PARAVIRT_LAZY_MMU);
1416 }
1417
1418 static unsigned long xen_read_cr3(void)
1419 {
1420         return percpu_read(xen_cr3);
1421 }
1422
1423 static void set_current_cr3(void *v)
1424 {
1425         percpu_write(xen_current_cr3, (unsigned long)v);
1426 }
1427
1428 static void __xen_write_cr3(bool kernel, unsigned long cr3)
1429 {
1430         struct mmuext_op *op;
1431         struct multicall_space mcs;
1432         unsigned long mfn;
1433
1434         if (cr3)
1435                 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1436         else
1437                 mfn = 0;
1438
1439         WARN_ON(mfn == 0 && kernel);
1440
1441         mcs = __xen_mc_entry(sizeof(*op));
1442
1443         op = mcs.args;
1444         op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1445         op->arg1.mfn = mfn;
1446
1447         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1448
1449         if (kernel) {
1450                 percpu_write(xen_cr3, cr3);
1451
1452                 /* Update xen_current_cr3 once the batch has actually
1453                    been submitted. */
1454                 xen_mc_callback(set_current_cr3, (void *)cr3);
1455         }
1456 }
1457
1458 static void xen_write_cr3(unsigned long cr3)
1459 {
1460         BUG_ON(preemptible());
1461
1462         xen_mc_batch();  /* disables interrupts */
1463
1464         /* Update while interrupts are disabled, so its atomic with
1465            respect to ipis */
1466         percpu_write(xen_cr3, cr3);
1467
1468         __xen_write_cr3(true, cr3);
1469
1470 #ifdef CONFIG_X86_64
1471         {
1472                 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1473                 if (user_pgd)
1474                         __xen_write_cr3(false, __pa(user_pgd));
1475                 else
1476                         __xen_write_cr3(false, 0);
1477         }
1478 #endif
1479
1480         xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
1481 }
1482
1483 static int xen_pgd_alloc(struct mm_struct *mm)
1484 {
1485         pgd_t *pgd = mm->pgd;
1486         int ret = 0;
1487
1488         BUG_ON(PagePinned(virt_to_page(pgd)));
1489
1490 #ifdef CONFIG_X86_64
1491         {
1492                 struct page *page = virt_to_page(pgd);
1493                 pgd_t *user_pgd;
1494
1495                 BUG_ON(page->private != 0);
1496
1497                 ret = -ENOMEM;
1498
1499                 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1500                 page->private = (unsigned long)user_pgd;
1501
1502                 if (user_pgd != NULL) {
1503                         user_pgd[pgd_index(VSYSCALL_START)] =
1504                                 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1505                         ret = 0;
1506                 }
1507
1508                 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1509         }
1510 #endif
1511
1512         return ret;
1513 }
1514
1515 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1516 {
1517 #ifdef CONFIG_X86_64
1518         pgd_t *user_pgd = xen_get_user_pgd(pgd);
1519
1520         if (user_pgd)
1521                 free_page((unsigned long)user_pgd);
1522 #endif
1523 }
1524
1525 #ifdef CONFIG_X86_32
1526 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1527 {
1528         /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1529         if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1530                 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1531                                pte_val_ma(pte));
1532
1533         return pte;
1534 }
1535
1536 /* Init-time set_pte while constructing initial pagetables, which
1537    doesn't allow RO pagetable pages to be remapped RW */
1538 static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1539 {
1540         pte = mask_rw_pte(ptep, pte);
1541
1542         xen_set_pte(ptep, pte);
1543 }
1544 #endif
1545
1546 static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1547 {
1548         struct mmuext_op op;
1549         op.cmd = cmd;
1550         op.arg1.mfn = pfn_to_mfn(pfn);
1551         if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1552                 BUG();
1553 }
1554
1555 /* Early in boot, while setting up the initial pagetable, assume
1556    everything is pinned. */
1557 static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1558 {
1559 #ifdef CONFIG_FLATMEM
1560         BUG_ON(mem_map);        /* should only be used early */
1561 #endif
1562         make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1563         pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1564 }
1565
1566 /* Used for pmd and pud */
1567 static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1568 {
1569 #ifdef CONFIG_FLATMEM
1570         BUG_ON(mem_map);        /* should only be used early */
1571 #endif
1572         make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1573 }
1574
1575 /* Early release_pte assumes that all pts are pinned, since there's
1576    only init_mm and anything attached to that is pinned. */
1577 static __init void xen_release_pte_init(unsigned long pfn)
1578 {
1579         pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1580         make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1581 }
1582
1583 static __init void xen_release_pmd_init(unsigned long pfn)
1584 {
1585         make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1586 }
1587
1588 /* This needs to make sure the new pte page is pinned iff its being
1589    attached to a pinned pagetable. */
1590 static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1591 {
1592         struct page *page = pfn_to_page(pfn);
1593
1594         if (PagePinned(virt_to_page(mm->pgd))) {
1595                 SetPagePinned(page);
1596
1597                 if (!PageHighMem(page)) {
1598                         make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1599                         if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1600                                 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1601                 } else {
1602                         /* make sure there are no stray mappings of
1603                            this page */
1604                         kmap_flush_unused();
1605                 }
1606         }
1607 }
1608
1609 static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1610 {
1611         xen_alloc_ptpage(mm, pfn, PT_PTE);
1612 }
1613
1614 static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1615 {
1616         xen_alloc_ptpage(mm, pfn, PT_PMD);
1617 }
1618
1619 /* This should never happen until we're OK to use struct page */
1620 static void xen_release_ptpage(unsigned long pfn, unsigned level)
1621 {
1622         struct page *page = pfn_to_page(pfn);
1623
1624         if (PagePinned(page)) {
1625                 if (!PageHighMem(page)) {
1626                         if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1627                                 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1628                         make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1629                 }
1630                 ClearPagePinned(page);
1631         }
1632 }
1633
1634 static void xen_release_pte(unsigned long pfn)
1635 {
1636         xen_release_ptpage(pfn, PT_PTE);
1637 }
1638
1639 static void xen_release_pmd(unsigned long pfn)
1640 {
1641         xen_release_ptpage(pfn, PT_PMD);
1642 }
1643
1644 #if PAGETABLE_LEVELS == 4
1645 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1646 {
1647         xen_alloc_ptpage(mm, pfn, PT_PUD);
1648 }
1649
1650 static void xen_release_pud(unsigned long pfn)
1651 {
1652         xen_release_ptpage(pfn, PT_PUD);
1653 }
1654 #endif
1655
1656 void __init xen_reserve_top(void)
1657 {
1658 #ifdef CONFIG_X86_32
1659         unsigned long top = HYPERVISOR_VIRT_START;
1660         struct xen_platform_parameters pp;
1661
1662         if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1663                 top = pp.virt_start;
1664
1665         reserve_top_address(-top);
1666 #endif  /* CONFIG_X86_32 */
1667 }
1668
1669 /*
1670  * Like __va(), but returns address in the kernel mapping (which is
1671  * all we have until the physical memory mapping has been set up.
1672  */
1673 static void *__ka(phys_addr_t paddr)
1674 {
1675 #ifdef CONFIG_X86_64
1676         return (void *)(paddr + __START_KERNEL_map);
1677 #else
1678         return __va(paddr);
1679 #endif
1680 }
1681
1682 /* Convert a machine address to physical address */
1683 static unsigned long m2p(phys_addr_t maddr)
1684 {
1685         phys_addr_t paddr;
1686
1687         maddr &= PTE_PFN_MASK;
1688         paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1689
1690         return paddr;
1691 }
1692
1693 /* Convert a machine address to kernel virtual */
1694 static void *m2v(phys_addr_t maddr)
1695 {
1696         return __ka(m2p(maddr));
1697 }
1698
1699 static void set_page_prot(void *addr, pgprot_t prot)
1700 {
1701         unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1702         pte_t pte = pfn_pte(pfn, prot);
1703
1704         if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1705                 BUG();
1706 }
1707
1708 static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1709 {
1710         unsigned pmdidx, pteidx;
1711         unsigned ident_pte;
1712         unsigned long pfn;
1713
1714         ident_pte = 0;
1715         pfn = 0;
1716         for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1717                 pte_t *pte_page;
1718
1719                 /* Reuse or allocate a page of ptes */
1720                 if (pmd_present(pmd[pmdidx]))
1721                         pte_page = m2v(pmd[pmdidx].pmd);
1722                 else {
1723                         /* Check for free pte pages */
1724                         if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1725                                 break;
1726
1727                         pte_page = &level1_ident_pgt[ident_pte];
1728                         ident_pte += PTRS_PER_PTE;
1729
1730                         pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1731                 }
1732
1733                 /* Install mappings */
1734                 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1735                         pte_t pte;
1736
1737                         if (pfn > max_pfn_mapped)
1738                                 max_pfn_mapped = pfn;
1739
1740                         if (!pte_none(pte_page[pteidx]))
1741                                 continue;
1742
1743                         pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1744                         pte_page[pteidx] = pte;
1745                 }
1746         }
1747
1748         for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1749                 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1750
1751         set_page_prot(pmd, PAGE_KERNEL_RO);
1752 }
1753
1754 #ifdef CONFIG_X86_64
1755 static void convert_pfn_mfn(void *v)
1756 {
1757         pte_t *pte = v;
1758         int i;
1759
1760         /* All levels are converted the same way, so just treat them
1761            as ptes. */
1762         for (i = 0; i < PTRS_PER_PTE; i++)
1763                 pte[i] = xen_make_pte(pte[i].pte);
1764 }
1765
1766 /*
1767  * Set up the inital kernel pagetable.
1768  *
1769  * We can construct this by grafting the Xen provided pagetable into
1770  * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
1771  * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
1772  * means that only the kernel has a physical mapping to start with -
1773  * but that's enough to get __va working.  We need to fill in the rest
1774  * of the physical mapping once some sort of allocator has been set
1775  * up.
1776  */
1777 __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1778                                          unsigned long max_pfn)
1779 {
1780         pud_t *l3;
1781         pmd_t *l2;
1782
1783         /* Zap identity mapping */
1784         init_level4_pgt[0] = __pgd(0);
1785
1786         /* Pre-constructed entries are in pfn, so convert to mfn */
1787         convert_pfn_mfn(init_level4_pgt);
1788         convert_pfn_mfn(level3_ident_pgt);
1789         convert_pfn_mfn(level3_kernel_pgt);
1790
1791         l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1792         l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1793
1794         memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1795         memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1796
1797         l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1798         l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1799         memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1800
1801         /* Set up identity map */
1802         xen_map_identity_early(level2_ident_pgt, max_pfn);
1803
1804         /* Make pagetable pieces RO */
1805         set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1806         set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1807         set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1808         set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1809         set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1810         set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1811
1812         /* Pin down new L4 */
1813         pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1814                           PFN_DOWN(__pa_symbol(init_level4_pgt)));
1815
1816         /* Unpin Xen-provided one */
1817         pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1818
1819         /* Switch over */
1820         pgd = init_level4_pgt;
1821
1822         /*
1823          * At this stage there can be no user pgd, and no page
1824          * structure to attach it to, so make sure we just set kernel
1825          * pgd.
1826          */
1827         xen_mc_batch();
1828         __xen_write_cr3(true, __pa(pgd));
1829         xen_mc_issue(PARAVIRT_LAZY_CPU);
1830
1831         reserve_early(__pa(xen_start_info->pt_base),
1832                       __pa(xen_start_info->pt_base +
1833                            xen_start_info->nr_pt_frames * PAGE_SIZE),
1834                       "XEN PAGETABLES");
1835
1836         return pgd;
1837 }
1838 #else   /* !CONFIG_X86_64 */
/* Kernel pmd used in place of the one found in the Xen-provided pagetable. */
static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;

/*
 * Switch from the pagetable handed over by Xen to swapper_pg_dir
 * (!CONFIG_X86_64 variant).
 *
 * @pgd:     the Xen-provided pagetable currently in use
 * @max_pfn: passed through to xen_map_identity_early()
 *
 * Returns swapper_pg_dir, which has been loaded into cr3 and pinned by
 * the time this function returns.
 */
__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
                                         unsigned long max_pfn)
{
        pmd_t *kernel_pmd;

        /*
         * Record the mapped range as ending 512k past the last
         * Xen-built pagetable frame.
         */
        max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
                                  xen_start_info->nr_pt_frames * PAGE_SIZE +
                                  512*1024);

        /* Take a private copy of the kernel pmd referenced by the Xen pgd. */
        kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
        memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);

        xen_map_identity_early(level2_kernel_pgt, max_pfn);

        /*
         * Start swapper_pg_dir as a copy of the Xen pgd, then point its
         * kernel slot at our own pmd instead of Xen's.
         */
        memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
        set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
                        __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));

        /*
         * Pages that make up a pagetable must be mapped read-only (see
         * the comment at the top of this file); empty_zero_page is made
         * read-only here as well.
         */
        set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
        set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
        set_page_prot(empty_zero_page, PAGE_KERNEL_RO);

        /* Unpin the Xen-provided pagetable ... */
        pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

        /* ... switch over to ours ... */
        xen_write_cr3(__pa(swapper_pg_dir));

        /* ... and pin it as an L3 table. */
        pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));

        /* Reserve the physical range holding the Xen-built pagetable frames. */
        reserve_early(__pa(xen_start_info->pt_base),
                      __pa(xen_start_info->pt_base +
                           xen_start_info->nr_pt_frames * PAGE_SIZE),
                      "XEN PAGETABLES");

        return swapper_pg_dir;
}
1876 #endif  /* CONFIG_X86_64 */
1877
1878 static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1879 {
1880         pte_t pte;
1881
1882         phys >>= PAGE_SHIFT;
1883
1884         switch (idx) {
1885         case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1886 #ifdef CONFIG_X86_F00F_BUG
1887         case FIX_F00F_IDT:
1888 #endif
1889 #ifdef CONFIG_X86_32
1890         case FIX_WP_TEST:
1891         case FIX_VDSO:
1892 # ifdef CONFIG_HIGHMEM
1893         case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1894 # endif
1895 #else
1896         case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1897 #endif
1898 #ifdef CONFIG_X86_LOCAL_APIC
1899         case FIX_APIC_BASE:     /* maps dummy local APIC */
1900 #endif
1901         case FIX_TEXT_POKE0:
1902         case FIX_TEXT_POKE1:
1903                 /* All local page mappings */
1904                 pte = pfn_pte(phys, prot);
1905                 break;
1906
1907         case FIX_PARAVIRT_BOOTMAP:
1908                 /* This is an MFN, but it isn't an IO mapping from the
1909                    IO domain */
1910                 pte = mfn_pte(phys, prot);
1911                 break;
1912
1913         default:
1914                 /* By default, set_fixmap is used for hardware mappings */
1915                 pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
1916                 break;
1917         }
1918
1919         __native_set_fixmap(idx, pte);
1920
1921 #ifdef CONFIG_X86_64
1922         /* Replicate changes to map the vsyscall page into the user
1923            pagetable vsyscall mapping. */
1924         if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1925                 unsigned long vaddr = __fix_to_virt(idx);
1926                 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1927         }
1928 #endif
1929 }
1930
1931 static __init void xen_post_allocator_init(void)
1932 {
1933         pv_mmu_ops.set_pte = xen_set_pte;
1934         pv_mmu_ops.set_pmd = xen_set_pmd;
1935         pv_mmu_ops.set_pud = xen_set_pud;
1936 #if PAGETABLE_LEVELS == 4
1937         pv_mmu_ops.set_pgd = xen_set_pgd;
1938 #endif
1939
1940         /* This will work as long as patching hasn't happened yet
1941            (which it hasn't) */
1942         pv_mmu_ops.alloc_pte = xen_alloc_pte;
1943         pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1944         pv_mmu_ops.release_pte = xen_release_pte;
1945         pv_mmu_ops.release_pmd = xen_release_pmd;
1946 #if PAGETABLE_LEVELS == 4
1947         pv_mmu_ops.alloc_pud = xen_alloc_pud;
1948         pv_mmu_ops.release_pud = xen_release_pud;
1949 #endif
1950
1951 #ifdef CONFIG_X86_64
1952         SetPagePinned(virt_to_page(level3_user_vsyscall));
1953 #endif
1954         xen_mark_init_mm_pinned();
1955 }
1956
/*
 * Leave lazy MMU mode: flush the pending multicall batch, then run the
 * generic paravirt_leave_lazy_mmu().  Preemption is disabled around the
 * pair of calls.
 */
static void xen_leave_lazy_mmu(void)
{
        preempt_disable();
        xen_mc_flush();
        paravirt_leave_lazy_mmu();
        preempt_enable();
}
1964
/*
 * Xen implementation of the paravirt MMU operations.
 *
 * The table is __initdata: xen_init_mmu_ops() copies it by value into
 * pv_mmu_ops.  Several entries start out as boot-time variants (the
 * *_init and *_hyper functions) and are later replaced in pv_mmu_ops by
 * xen_post_allocator_init().
 */
static const struct pv_mmu_ops xen_mmu_ops __initdata = {
        .read_cr2 = xen_read_cr2,
        .write_cr2 = xen_write_cr2,

        .read_cr3 = xen_read_cr3,
        .write_cr3 = xen_write_cr3,

        /* User and kernel TLB flushes share one implementation. */
        .flush_tlb_user = xen_flush_tlb,
        .flush_tlb_kernel = xen_flush_tlb,
        .flush_tlb_single = xen_flush_tlb_single,
        .flush_tlb_others = xen_flush_tlb_others,

        .pte_update = paravirt_nop,
        .pte_update_defer = paravirt_nop,

        .pgd_alloc = xen_pgd_alloc,
        .pgd_free = xen_pgd_free,

        /* Boot-time alloc/release hooks; swapped by xen_post_allocator_init(). */
        .alloc_pte = xen_alloc_pte_init,
        .release_pte = xen_release_pte_init,
        .alloc_pmd = xen_alloc_pmd_init,
        .alloc_pmd_clone = paravirt_nop,
        .release_pmd = xen_release_pmd_init,

        /*
         * 64-bit uses xen_set_pte from the start; 32-bit uses a boot-time
         * variant until xen_post_allocator_init() installs xen_set_pte.
         */
#ifdef CONFIG_X86_64
        .set_pte = xen_set_pte,
#else
        .set_pte = xen_set_pte_init,
#endif
        .set_pte_at = xen_set_pte_at,
        .set_pmd = xen_set_pmd_hyper,

        .ptep_modify_prot_start = __ptep_modify_prot_start,
        .ptep_modify_prot_commit = __ptep_modify_prot_commit,

        .pte_val = PV_CALLEE_SAVE(xen_pte_val),
        .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),

        .make_pte = PV_CALLEE_SAVE(xen_make_pte),
        .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),

#ifdef CONFIG_X86_PAE
        .set_pte_atomic = xen_set_pte_atomic,
        .pte_clear = xen_pte_clear,
        .pmd_clear = xen_pmd_clear,
#endif  /* CONFIG_X86_PAE */
        .set_pud = xen_set_pud_hyper,

        .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
        .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),

#if PAGETABLE_LEVELS == 4
        .pud_val = PV_CALLEE_SAVE(xen_pud_val),
        .make_pud = PV_CALLEE_SAVE(xen_make_pud),
        .set_pgd = xen_set_pgd_hyper,

        /* At boot the pud hooks use the same helpers as the pmd hooks. */
        .alloc_pud = xen_alloc_pmd_init,
        .release_pud = xen_release_pmd_init,
#endif  /* PAGETABLE_LEVELS == 4 */

        .activate_mm = xen_activate_mm,
        .dup_mmap = xen_dup_mmap,
        .exit_mmap = xen_exit_mmap,

        /* Entering lazy mode is generic; leaving goes through xen_leave_lazy_mmu(). */
        .lazy_mode = {
                .enter = paravirt_enter_lazy_mmu,
                .leave = xen_leave_lazy_mmu,
        },

        .set_fixmap = xen_set_fixmap,
};
2036
2037 void __init xen_init_mmu_ops(void)
2038 {
2039         x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
2040         x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
2041         pv_mmu_ops = xen_mmu_ops;
2042
2043         vmap_lazy_unmap = false;
2044 }
2045
/*
 * Scratch array of frame numbers used by
 * xen_create_contiguous_region() and xen_destroy_contiguous_region();
 * both fill and read it while holding xen_reservation_lock.
 *
 * MAX_CONTIG_ORDER is the largest order those functions accept, so the
 * array has one slot per page of the largest region (1 << 9 pages).
 */
#define MAX_CONTIG_ORDER 9 /* 2MB */
static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2049
2050 #define VOID_PTE (mfn_pte(0, __pgprot(0)))
2051 static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2052                                 unsigned long *in_frames,
2053                                 unsigned long *out_frames)
2054 {
2055         int i;
2056         struct multicall_space mcs;
2057
2058         xen_mc_batch();
2059         for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2060                 mcs = __xen_mc_entry(0);
2061
2062                 if (in_frames)
2063                         in_frames[i] = virt_to_mfn(vaddr);
2064
2065                 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2066                 set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2067
2068                 if (out_frames)
2069                         out_frames[i] = virt_to_pfn(vaddr);
2070         }
2071         xen_mc_issue(0);
2072 }
2073
2074 /*
2075  * Update the pfn-to-mfn mappings for a virtual address range, either to
2076  * point to an array of mfns, or contiguously from a single starting
2077  * mfn.
2078  */
2079 static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2080                                      unsigned long *mfns,
2081                                      unsigned long first_mfn)
2082 {
2083         unsigned i, limit;
2084         unsigned long mfn;
2085
2086         xen_mc_batch();
2087
2088         limit = 1u << order;
2089         for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2090                 struct multicall_space mcs;
2091                 unsigned flags;
2092
2093                 mcs = __xen_mc_entry(0);
2094                 if (mfns)
2095                         mfn = mfns[i];
2096                 else
2097                         mfn = first_mfn + i;
2098
2099                 if (i < (limit - 1))
2100                         flags = 0;
2101                 else {
2102                         if (order == 0)
2103                                 flags = UVMF_INVLPG | UVMF_ALL;
2104                         else
2105                                 flags = UVMF_TLB_FLUSH | UVMF_ALL;
2106                 }
2107
2108                 MULTI_update_va_mapping(mcs.mc, vaddr,
2109                                 mfn_pte(mfn, PAGE_KERNEL), flags);
2110
2111                 set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2112         }
2113
2114         xen_mc_issue(0);
2115 }
2116
2117 /*
2118  * Perform the hypercall to exchange a region of our pfns to point to
2119  * memory with the required contiguous alignment.  Takes the pfns as
2120  * input, and populates mfns as output.
2121  *
2122  * Returns a success code indicating whether the hypervisor was able to
2123  * satisfy the request or not.
2124  */
2125 static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2126                                unsigned long *pfns_in,
2127                                unsigned long extents_out,
2128                                unsigned int order_out,
2129                                unsigned long *mfns_out,
2130                                unsigned int address_bits)
2131 {
2132         long rc;
2133         int success;
2134
2135         struct xen_memory_exchange exchange = {
2136                 .in = {
2137                         .nr_extents   = extents_in,
2138                         .extent_order = order_in,
2139                         .extent_start = pfns_in,
2140                         .domid        = DOMID_SELF
2141                 },
2142                 .out = {
2143                         .nr_extents   = extents_out,
2144                         .extent_order = order_out,
2145                         .extent_start = mfns_out,
2146                         .address_bits = address_bits,
2147                         .domid        = DOMID_SELF
2148                 }
2149         };
2150
2151         BUG_ON(extents_in << order_in != extents_out << order_out);
2152
2153         rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2154         success = (exchange.nr_exchanged == extents_in);
2155
2156         BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2157         BUG_ON(success && (rc != 0));
2158
2159         return success;
2160 }
2161
2162 int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
2163                                  unsigned int address_bits)
2164 {
2165         unsigned long *in_frames = discontig_frames, out_frame;
2166         unsigned long  flags;
2167         int            success;
2168
2169         /*
2170          * Currently an auto-translated guest will not perform I/O, nor will
2171          * it require PAE page directories below 4GB. Therefore any calls to
2172          * this function are redundant and can be ignored.
2173          */
2174
2175         if (xen_feature(XENFEAT_auto_translated_physmap))
2176                 return 0;
2177
2178         if (unlikely(order > MAX_CONTIG_ORDER))
2179                 return -ENOMEM;
2180
2181         memset((void *) vstart, 0, PAGE_SIZE << order);
2182
2183         spin_lock_irqsave(&xen_reservation_lock, flags);
2184
2185         /* 1. Zap current PTEs, remembering MFNs. */
2186         xen_zap_pfn_range(vstart, order, in_frames, NULL);
2187
2188         /* 2. Get a new contiguous memory extent. */
2189         out_frame = virt_to_pfn(vstart);
2190         success = xen_exchange_memory(1UL << order, 0, in_frames,
2191                                       1, order, &out_frame,
2192                                       address_bits);
2193
2194         /* 3. Map the new extent in place of old pages. */
2195         if (success)
2196                 xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2197         else
2198                 xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2199
2200         spin_unlock_irqrestore(&xen_reservation_lock, flags);
2201
2202         return success ? 0 : -ENOMEM;
2203 }
2204 EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2205
2206 void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
2207 {
2208         unsigned long *out_frames = discontig_frames, in_frame;
2209         unsigned long  flags;
2210         int success;
2211
2212         if (xen_feature(XENFEAT_auto_translated_physmap))
2213                 return;
2214
2215         if (unlikely(order > MAX_CONTIG_ORDER))
2216                 return;
2217
2218         memset((void *) vstart, 0, PAGE_SIZE << order);
2219
2220         spin_lock_irqsave(&xen_reservation_lock, flags);
2221
2222         /* 1. Find start MFN of contiguous extent. */
2223         in_frame = virt_to_mfn(vstart);
2224
2225         /* 2. Zap current PTEs. */
2226         xen_zap_pfn_range(vstart, order, NULL, out_frames);
2227
2228         /* 3. Do the exchange for non-contiguous MFNs. */
2229         success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2230                                         0, out_frames, 0);
2231
2232         /* 4. Map new pages in place of old pages. */
2233         if (success)
2234                 xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2235         else
2236                 xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2237
2238         spin_unlock_irqrestore(&xen_reservation_lock, flags);
2239 }
2240 EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
2241
2242 #ifdef CONFIG_XEN_PVHVM
2243 static void xen_hvm_exit_mmap(struct mm_struct *mm)
2244 {
2245         struct xen_hvm_pagetable_dying a;
2246         int rc;
2247
2248         a.domid = DOMID_SELF;
2249         a.gpa = __pa(mm->pgd);
2250         rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2251         WARN_ON_ONCE(rc < 0);
2252 }
2253
2254 static int is_pagetable_dying_supported(void)
2255 {
2256         struct xen_hvm_pagetable_dying a;
2257         int rc = 0;
2258
2259         a.domid = DOMID_SELF;
2260         a.gpa = 0x00;
2261         rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2262         if (rc < 0) {
2263                 printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2264                 return 0;
2265         }
2266         return 1;
2267 }
2268
2269 void __init xen_hvm_init_mmu_ops(void)
2270 {
2271         if (is_pagetable_dying_supported())
2272                 pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
2273 }
2274 #endif
2275
2276 #ifdef CONFIG_XEN_DEBUG_FS
2277
2278 static struct dentry *d_mmu_debug;
2279
2280 static int __init xen_mmu_debugfs(void)
2281 {
2282         struct dentry *d_xen = xen_init_debugfs();
2283
2284         if (d_xen == NULL)
2285                 return -ENOMEM;
2286
2287         d_mmu_debug = debugfs_create_dir("mmu", d_xen);
2288
2289         debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
2290
2291         debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
2292         debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
2293                            &mmu_stats.pgd_update_pinned);
2294         debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
2295                            &mmu_stats.pgd_update_pinned);
2296
2297         debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
2298         debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
2299                            &mmu_stats.pud_update_pinned);
2300         debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
2301                            &mmu_stats.pud_update_pinned);
2302
2303         debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
2304         debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
2305                            &mmu_stats.pmd_update_pinned);
2306         debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
2307                            &mmu_stats.pmd_update_pinned);
2308
2309         debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
2310 //      debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
2311 //                         &mmu_stats.pte_update_pinned);
2312         debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
2313                            &mmu_stats.pte_update_pinned);
2314
2315         debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
2316         debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
2317                            &mmu_stats.mmu_update_extended);
2318         xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
2319                                      mmu_stats.mmu_update_histo, 20);
2320
2321         debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
2322         debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
2323                            &mmu_stats.set_pte_at_batched);
2324         debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
2325                            &mmu_stats.set_pte_at_current);
2326         debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
2327                            &mmu_stats.set_pte_at_kernel);
2328
2329         debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
2330         debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
2331                            &mmu_stats.prot_commit_batched);
2332
2333         return 0;
2334 }
2335 fs_initcall(xen_mmu_debugfs);
2336
2337 #endif  /* CONFIG_XEN_DEBUG_FS */