/*
 * Xen mmu operations
 *
 * This file contains the various mmu fetch and update operations.
 * The most important job they must perform is the mapping between the
 * domain's pfn and the overall machine mfns.
 *
 * Xen allows guests to directly update the pagetable, in a controlled
 * fashion.  In other words, the guest modifies the same pagetable
 * that the CPU actually uses, which eliminates the overhead of having
 * a separate shadow pagetable.
 *
 * In order to allow this, it falls on the guest domain to map its
 * notion of a "physical" pfn - which is just a domain-local linear
 * address - into a real "machine address" which the CPU's MMU can
 * use.
 *
 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
 * inserted directly into the pagetable.  When creating a new
 * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
 * when reading the content back with __(pgd|pmd|pte)_val, it converts
 * the mfn back into a pfn.
 *
 * The other constraint is that all pages which make up a pagetable
 * must be mapped read-only in the guest.  This prevents uncontrolled
 * guest updates to the pagetable.  Xen strictly enforces this, and
 * will disallow any pagetable update which would end up mapping a
 * pagetable page RW, and will disallow using any writable page as a
 * pagetable.
 *
 * Naively, when loading %cr3 with the base of a new pagetable, Xen
 * would need to validate the whole pagetable before going on.
 * Naturally, this is quite slow.  The solution is to "pin" a
 * pagetable, which enforces all the constraints on the pagetable even
 * when it is not actively in use.  This means that Xen can be assured
 * that it is still valid when you do load it into %cr3, and doesn't
 * need to revalidate it.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
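
/*
 * An illustrative sketch of the pfn <-> mfn round-trip described
 * above, assuming the usual pfn_to_mfn()/mfn_to_pfn() helpers from
 * <xen/page.h> and an arbitrary pfn value:
 *
 *	unsigned long pfn = 0x1234;		an arbitrary guest frame
 *	unsigned long mfn = pfn_to_mfn(pfn);	p2m: pfn -> machine frame
 *	BUG_ON(mfn_to_pfn(mfn) != pfn);		m2p must invert the p2m
 *
 * The xen_make_*() and xen_*_val() functions below apply the same
 * translation to whole pagetable entries.
 */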
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/bug.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/paravirt.h>

#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>

#include <xen/page.h>
#include <xen/interface/xen.h>

#include "multicalls.h"
#include "mmu.h"

xmaddr_t arbitrary_virt_to_machine(unsigned long address)
{
        unsigned int level;
        pte_t *pte = lookup_address(address, &level);
        unsigned offset = address & ~PAGE_MASK;	/* offset within the page */

        BUG_ON(pte == NULL);

        return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
}
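
/*
 * A typical use (sketch): hypercall arguments that want machine
 * addresses can be translated from any mapped kernel virtual address,
 * e.g.
 *
 *	xmaddr_t maddr = arbitrary_virt_to_machine((unsigned long)ptr);
 *
 * For direct-mapped (lowmem) addresses the cheaper virt_to_machine()
 * suffices; this lookup_address()-based version also copes with
 * vmalloc and fixmap addresses.
 */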

void make_lowmem_page_readonly(void *vaddr)
{
        pte_t *pte, ptev;
        unsigned long address = (unsigned long)vaddr;
        unsigned int level;

        pte = lookup_address(address, &level);
        BUG_ON(pte == NULL);

        ptev = pte_wrprotect(*pte);

        if (HYPERVISOR_update_va_mapping(address, ptev, 0))
                BUG();
}

void make_lowmem_page_readwrite(void *vaddr)
{
        pte_t *pte, ptev;
        unsigned long address = (unsigned long)vaddr;
        unsigned int level;

        pte = lookup_address(address, &level);
        BUG_ON(pte == NULL);

        ptev = pte_mkwrite(*pte);

        if (HYPERVISOR_update_va_mapping(address, ptev, 0))
                BUG();
}
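
/*
 * Usage note (a sketch of intent, not from the original file): these
 * two helpers flip a direct-mapped page between RW and RO with a
 * single update_va_mapping hypercall.  Xen insists that pages it
 * interprets (pagetable pages, and structures such as the GDT) be
 * mapped read-only in the guest, so callers remap such pages RO
 * before handing them over, and RW again once Xen is done with them:
 *
 *	make_lowmem_page_readonly(va);
 *	... hand the page to the hypervisor ...
 *	make_lowmem_page_readwrite(va);
 */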

void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
        struct multicall_space mcs;
        struct mmu_update *u;

        preempt_disable();

        mcs = xen_mc_entry(sizeof(*u));
        u = mcs.args;
        u->ptr = virt_to_machine(ptr).maddr;
        u->val = pmd_val_ma(val);
        MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);

        xen_mc_issue(PARAVIRT_LAZY_MMU);

        preempt_enable();
}
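
/*
 * For reference, the unbatched equivalent of the above (a sketch,
 * assuming the standard HYPERVISOR_mmu_update() wrapper) would be a
 * single mmu_update hypercall:
 *
 *	struct mmu_update u = {
 *		.ptr = virt_to_machine(ptr).maddr,
 *		.val = pmd_val_ma(val),
 *	};
 *	HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
 *
 * Issuing it through the multicall queue instead lets lazy-MMU mode
 * batch many such updates into one trip into the hypervisor.
 */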

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        pgd = swapper_pg_dir + pgd_index(vaddr);
        if (pgd_none(*pgd)) {
                BUG();
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                BUG();
                return;
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                BUG();
                return;
        }
        pte = pte_offset_kernel(pmd, vaddr);
        /* <mfn,flags> stored as-is, to permit clearing entries */
        xen_set_pte(pte, mfn_pte(mfn, flags));

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}
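
/*
 * Set a pte in an arbitrary mm.  If the pte belongs to the current
 * pagetable (or the kernel's), the virtual address means something to
 * Xen, so we can hand the whole update to the hypervisor with
 * update_va_mapping - batched if we're already in lazy MMU mode.
 * Otherwise fall back to writing the pte directly via xen_set_pte().
 */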
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
                    pte_t *ptep, pte_t pteval)
{
        if (mm == current->mm || mm == &init_mm) {
                if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
                        struct multicall_space mcs;
                        mcs = xen_mc_entry(0);

                        MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
                        xen_mc_issue(PARAVIRT_LAZY_MMU);
                        return;
                } else if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
                        return;
        }
        xen_set_pte(ptep, pteval);
}

#ifdef CONFIG_X86_PAE
void xen_set_pud(pud_t *ptr, pud_t val)
{
        struct multicall_space mcs;
        struct mmu_update *u;

        preempt_disable();

        mcs = xen_mc_entry(sizeof(*u));
        u = mcs.args;
        u->ptr = virt_to_machine(ptr).maddr;
        u->val = pud_val_ma(val);
        MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);

        xen_mc_issue(PARAVIRT_LAZY_MMU);

        preempt_enable();
}

void xen_set_pte(pte_t *ptep, pte_t pte)
{
        ptep->pte_high = pte.pte_high;
        smp_wmb();
        ptep->pte_low = pte.pte_low;
}
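
/*
 * Ordering in the open-coded paths above and below follows the usual
 * PAE rules: the present bit lives in the low word, so a set writes
 * the high word first and a clear (xen_pte_clear() below) zeroes the
 * low word first.  Either way the entry is never marked present while
 * only half updated - assuming, per the generic set_pte rules, that a
 * live present pte isn't overwritten in place without being cleared
 * first.
 */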

void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
        set_64bit((u64 *)ptep, pte_val_ma(pte));
}

void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
        ptep->pte_low = 0;
        smp_wmb();		/* make sure low gets written first */
        ptep->pte_high = 0;
}

void xen_pmd_clear(pmd_t *pmdp)
{
        xen_set_pmd(pmdp, __pmd(0));
}

pteval_t xen_pte_val(pte_t pte)
{
        pteval_t ret = 0;

        if (pte.pte_low) {
                ret = ((pteval_t)pte.pte_high << 32) | pte.pte_low;
                ret = machine_to_phys(XMADDR(ret)).paddr | 1;
        }

        return ret;
}

pmdval_t xen_pmd_val(pmd_t pmd)
{
        pmdval_t ret = pmd.pmd;
        if (ret)
                ret = machine_to_phys(XMADDR(ret)).paddr | 1;
        return ret;
}

pgdval_t xen_pgd_val(pgd_t pgd)
{
        pgdval_t ret = pgd.pgd;
        if (ret)
                ret = machine_to_phys(XMADDR(ret)).paddr | 1;
        return ret;
}

pte_t xen_make_pte(pteval_t pte)
{
        if (pte & _PAGE_PRESENT) {
                pte = phys_to_machine(XPADDR(pte)).maddr;
                pte &= ~(_PAGE_PCD | _PAGE_PWT);
        }

        return (pte_t){ .pte = pte };
}

pmd_t xen_make_pmd(pmdval_t pmd)
{
        if (pmd & _PAGE_PRESENT)
                pmd = phys_to_machine(XPADDR(pmd)).maddr;

        return (pmd_t){ pmd };
}

pgd_t xen_make_pgd(pgdval_t pgd)
{
        if (pgd & _PAGE_PRESENT)
                pgd = phys_to_machine(XPADDR(pgd)).maddr;

        return (pgd_t){ pgd };
}
#else /* !PAE */
void xen_set_pte(pte_t *ptep, pte_t pte)
{
        *ptep = pte;
}

pteval_t xen_pte_val(pte_t pte)
{
        pteval_t ret = pte.pte_low;

        if (ret & _PAGE_PRESENT)
                ret = machine_to_phys(XMADDR(ret)).paddr;

        return ret;
}

pgdval_t xen_pgd_val(pgd_t pgd)
{
        pgdval_t ret = pgd.pgd;
        if (ret)
                ret = machine_to_phys(XMADDR(ret)).paddr | 1;
        return ret;
}

pte_t xen_make_pte(pteval_t pte)
{
        if (pte & _PAGE_PRESENT) {
                pte = phys_to_machine(XPADDR(pte)).maddr;
                pte &= ~(_PAGE_PCD | _PAGE_PWT);
        }

        return (pte_t){ pte };
}

pgd_t xen_make_pgd(pgdval_t pgd)
{
        if (pgd & _PAGE_PRESENT)
                pgd = phys_to_machine(XPADDR(pgd)).maddr;

        return (pgd_t){ pgd };
}
#endif /* CONFIG_X86_PAE */

/*
  (Yet another) pagetable walker.  This one is intended for pinning a
  pagetable.  This means that it walks a pagetable and calls the
  callback function on each page it finds making up the page table,
  at every level.  It walks the entire pagetable, but it only bothers
  pinning pte pages which are below limit.  In the normal case this
  will be TASK_SIZE, but at boot we need to pin up to FIXADDR_TOP.
  But the important bit is that we don't pin beyond there, because
  then we start getting into Xen's ptes.
*/
static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
                    unsigned long limit)
{
        pgd_t *pgd = pgd_base;
        int flush = 0;
        unsigned long addr = 0;
        unsigned long pgd_next;

        BUG_ON(limit > FIXADDR_TOP);

        if (xen_feature(XENFEAT_auto_translated_physmap))
                return 0;

        for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
                pud_t *pud;
                unsigned long pud_limit, pud_next;

                pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);

                if (!pgd_val(*pgd))
                        continue;

                pud = pud_offset(pgd, 0);

                if (PTRS_PER_PUD > 1) /* not folded */
                        flush |= (*func)(virt_to_page(pud), PT_PUD);

                for (; addr != pud_limit; pud++, addr = pud_next) {
                        pmd_t *pmd;
                        unsigned long pmd_limit;

                        pud_next = pud_addr_end(addr, pud_limit);

                        if (pud_next < limit)
                                pmd_limit = pud_next;
                        else
                                pmd_limit = limit;

                        if (pud_none(*pud))
                                continue;

                        pmd = pmd_offset(pud, 0);

                        if (PTRS_PER_PMD > 1) /* not folded */
                                flush |= (*func)(virt_to_page(pmd), PT_PMD);

                        for (; addr != pmd_limit; pmd++) {
                                addr += (PAGE_SIZE * PTRS_PER_PTE);
                                if ((pmd_limit-1) < (addr-1)) {
                                        addr = pmd_limit;
                                        break;
                                }

                                if (pmd_none(*pmd))
                                        continue;

                                flush |= (*func)(pmd_page(*pmd), PT_PTE);
                        }
                }
        }

        flush |= (*func)(virt_to_page(pgd_base), PT_PGD);

        return flush;
}
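
/*
 * Callback contract for pgd_walk(), inferred from its callers: each
 * callback returns nonzero if the kernel's kmaps may need flushing
 * (pin_page() below reports finding an unpinned highmem page this
 * way); pgd_walk() ORs the results together and xen_pgd_pin() acts
 * on the summary.
 */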

static spinlock_t *lock_pte(struct page *page)
{
        spinlock_t *ptl = NULL;

#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
        ptl = __pte_lockptr(page);
        spin_lock(ptl);
#endif

        return ptl;
}

static void do_unlock(void *v)
{
        spinlock_t *ptl = v;
        spin_unlock(ptl);
}

static void xen_do_pin(unsigned level, unsigned long pfn)
{
        struct mmuext_op *op;
        struct multicall_space mcs;

        mcs = __xen_mc_entry(sizeof(*op));
        op = mcs.args;
        op->cmd = level;
        op->arg1.mfn = pfn_to_mfn(pfn);
        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
}
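
/*
 * The "level" argument to xen_do_pin() is one of the MMUEXT_PIN_*
 * commands: L1 for pte pages (see pin_page() below) and, for the
 * top-level pgd, L3 under PAE or L2 otherwise (see xen_pgd_pin()),
 * with MMUEXT_UNPIN_TABLE to undo either.
 */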

static int pin_page(struct page *page, enum pt_level level)
{
        unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
        int flush;

        if (pgfl)
                flush = 0;		/* already pinned */
        else if (PageHighMem(page))
                /* kmaps need flushing if we found an unpinned
                   highpage */
                flush = 1;
        else {
                void *pt = lowmem_page_address(page);
                unsigned long pfn = page_to_pfn(page);
                struct multicall_space mcs = __xen_mc_entry(0);
                spinlock_t *ptl;

                flush = 0;

                ptl = NULL;
                if (level == PT_PTE)
                        ptl = lock_pte(page);

                MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
                                        pfn_pte(pfn, PAGE_KERNEL_RO),
                                        level == PT_PGD ? UVMF_TLB_FLUSH : 0);

                if (level == PT_PTE)
                        xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);

                if (ptl) {
                        /* Queue a deferred unlock for when this batch
                           is completed. */
                        xen_mc_callback(do_unlock, ptl);
                }
        }

        return flush;
}
3b827c1b | 458 | |
f4f97b3e JF |
459 | /* This is called just after a mm has been created, but it has not |
460 | been used yet. We need to make sure that its pagetable is all | |
461 | read-only, and can be pinned. */ | |
3b827c1b JF |
462 | void xen_pgd_pin(pgd_t *pgd) |
463 | { | |
74260714 | 464 | unsigned level; |
3b827c1b | 465 | |
f4f97b3e | 466 | xen_mc_batch(); |
3b827c1b | 467 | |
f87e4cac JF |
468 | if (pgd_walk(pgd, pin_page, TASK_SIZE)) { |
469 | /* re-enable interrupts for kmap_flush_unused */ | |
470 | xen_mc_issue(0); | |
f4f97b3e | 471 | kmap_flush_unused(); |
f87e4cac JF |
472 | xen_mc_batch(); |
473 | } | |
f4f97b3e | 474 | |
f4f97b3e | 475 | #ifdef CONFIG_X86_PAE |
74260714 | 476 | level = MMUEXT_PIN_L3_TABLE; |
3b827c1b | 477 | #else |
74260714 | 478 | level = MMUEXT_PIN_L2_TABLE; |
3b827c1b | 479 | #endif |
74260714 JF |
480 | |
481 | xen_do_pin(level, PFN_DOWN(__pa(pgd))); | |
f4f97b3e JF |
482 | |
483 | xen_mc_issue(0); | |
3b827c1b JF |
484 | } |

/* The init_mm pagetable is really pinned as soon as it's created, but
   that's before we have page structures to store the bits.  So do all
   the book-keeping now. */
static __init int mark_pinned(struct page *page, enum pt_level level)
{
        SetPagePinned(page);
        return 0;
}

void __init xen_mark_init_mm_pinned(void)
{
        pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
}
3b827c1b | 499 | |
74260714 | 500 | static int unpin_page(struct page *page, enum pt_level level) |
f4f97b3e JF |
501 | { |
502 | unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags); | |
3b827c1b | 503 | |
f4f97b3e JF |
504 | if (pgfl && !PageHighMem(page)) { |
505 | void *pt = lowmem_page_address(page); | |
506 | unsigned long pfn = page_to_pfn(page); | |
74260714 JF |
507 | spinlock_t *ptl = NULL; |
508 | struct multicall_space mcs; | |
509 | ||
510 | if (level == PT_PTE) { | |
511 | ptl = lock_pte(page); | |
512 | ||
513 | xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); | |
514 | } | |
515 | ||
516 | mcs = __xen_mc_entry(0); | |
f4f97b3e JF |
517 | |
518 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, | |
519 | pfn_pte(pfn, PAGE_KERNEL), | |
74260714 JF |
520 | level == PT_PGD ? UVMF_TLB_FLUSH : 0); |
521 | ||
522 | if (ptl) { | |
523 | /* unlock when batch completed */ | |
524 | xen_mc_callback(do_unlock, ptl); | |
525 | } | |
f4f97b3e JF |
526 | } |
527 | ||
528 | return 0; /* never need to flush on unpin */ | |
3b827c1b JF |
529 | } |

/* Release a pagetable's pages back as normal RW */
static void xen_pgd_unpin(pgd_t *pgd)
{
        xen_mc_batch();

        xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

        pgd_walk(pgd, unpin_page, TASK_SIZE);

        xen_mc_issue(0);
}
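
/*
 * Note the ordering relative to xen_pgd_pin(): here the UNPIN op is
 * queued before the walk remaps the pages RW.  Per the rules in the
 * header comment, Xen will not allow any page of a still-pinned
 * pagetable to be mapped writable, so the table has to be unpinned
 * first.
 */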

void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
        spin_lock(&next->page_table_lock);
        xen_pgd_pin(next->pgd);
        spin_unlock(&next->page_table_lock);
}

void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
        spin_lock(&mm->page_table_lock);
        xen_pgd_pin(mm->pgd);
        spin_unlock(&mm->page_table_lock);
}


#ifdef CONFIG_SMP
/* Another cpu may still have its %cr3 pointing at the pagetable, so
   we need to repoint it somewhere else before we can unpin it. */
static void drop_other_mm_ref(void *info)
{
        struct mm_struct *mm = info;

        if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
                leave_mm(smp_processor_id());

        /* If this cpu still has a stale cr3 reference, then make sure
           it has been flushed. */
        if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
                load_cr3(swapper_pg_dir);
                arch_flush_lazy_cpu_mode();
        }
}

static void drop_mm_ref(struct mm_struct *mm)
{
        cpumask_t mask;
        unsigned cpu;

        if (current->active_mm == mm) {
                if (current->mm == mm)
                        load_cr3(swapper_pg_dir);
                else
                        leave_mm(smp_processor_id());
                arch_flush_lazy_cpu_mode();
        }

        /* Get the "official" set of cpus referring to our pagetable. */
        mask = mm->cpu_vm_mask;

        /* It's possible that a vcpu may have a stale reference to our
           cr3, because it's in lazy mode and hasn't yet flushed its
           set of pending hypercalls.  In this case, we can look at
           its actual current cr3 value, and force it to flush if
           needed. */
        for_each_online_cpu(cpu) {
                if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
                        cpu_set(cpu, mask);
        }

        if (!cpus_empty(mask))
                xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
}
#else
static void drop_mm_ref(struct mm_struct *mm)
{
        if (current->active_mm == mm)
                load_cr3(swapper_pg_dir);
}
#endif

/*
 * While a process runs, Xen pins its pagetables, which means that the
 * hypervisor forces it to be read-only, and it controls all updates
 * to it.  This means that all pagetable updates have to go via the
 * hypervisor, which is moderately expensive.
 *
 * Since we're pulling the pagetable down, we switch to use init_mm,
 * unpin the old process pagetable and mark it all read-write, which
 * allows further operations on it to be simple memory accesses.
 *
 * The only subtle point is that another CPU may be still using the
 * pagetable because of lazy tlb flushing.  This means we need to
 * switch all CPUs off this pagetable before we can unpin it.
 */
void xen_exit_mmap(struct mm_struct *mm)
{
        get_cpu();		/* make sure we don't move around */
        drop_mm_ref(mm);
        put_cpu();

        spin_lock(&mm->page_table_lock);

        /* pgd may not be pinned in the error exit path of execve */
        if (PagePinned(virt_to_page(mm->pgd)))
                xen_pgd_unpin(mm->pgd);

        spin_unlock(&mm->page_table_lock);
}