[mirror_ubuntu-zesty-kernel.git] / arch / x86 / xen / mmu.c

/*
 * Xen mmu operations
 *
 * This file contains the various mmu fetch and update operations.
 * The most important job they must perform is the mapping between the
 * domain's pfn and the overall machine mfns.
 *
 * Xen allows guests to directly update the pagetable, in a controlled
 * fashion.  In other words, the guest modifies the same pagetable
 * that the CPU actually uses, which eliminates the overhead of having
 * a separate shadow pagetable.
 *
 * In order to allow this, it falls on the guest domain to map its
 * notion of a "physical" pfn - which is just a domain-local linear
 * address - into a real "machine address" which the CPU's MMU can
 * use.
 *
 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
 * inserted directly into the pagetable.  When creating a new
 * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
 * when reading the content back with __(pgd|pmd|pte)_val, it converts
 * the mfn back into a pfn.
 *
 * The other constraint is that all pages which make up a pagetable
 * must be mapped read-only in the guest.  This prevents uncontrolled
 * guest updates to the pagetable.  Xen strictly enforces this, and
 * will disallow any pagetable update which will end up mapping a
 * pagetable page RW, and will disallow using any writable page as a
 * pagetable.
 *
 * Naively, when loading %cr3 with the base of a new pagetable, Xen
 * would need to validate the whole pagetable before going on.
 * Naturally, this is quite slow.  The solution is to "pin" a
 * pagetable, which enforces all the constraints on the pagetable even
 * when it is not actively in use.  This means that Xen can be assured
 * that it is still valid when you do load it into %cr3, and doesn't
 * need to revalidate it.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/bug.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/paravirt.h>

#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>

#include <xen/page.h>
#include <xen/interface/xen.h>

#include "multicalls.h"
#include "mmu.h"

/*
 * Translate an arbitrary kernel virtual address into a machine address
 * by looking up its pte: the result is the machine frame named by the
 * pte plus the offset of @address within its page.
 *
 * BUGs if lookup_address() finds no pte for @address.
 */
xmaddr_t arbitrary_virt_to_machine(unsigned long address)
{
	unsigned int level;
	pte_t *pte = lookup_address(address, &level);
	/* PAGE_MASK keeps the frame bits, so the in-page offset is
	   what its complement keeps */
	unsigned offset = address & ~PAGE_MASK;
	unsigned long long maddr;

	BUG_ON(pte == NULL);

	/* widen before shifting so that frames above 4GB are not
	   truncated when unsigned long is only 32 bits wide */
	maddr = (unsigned long long)pte_mfn(*pte) << PAGE_SHIFT;

	return XMADDR(maddr + offset);
}

/*
 * Write-protect the kernel mapping of @vaddr.  The existing pte is
 * looked up, passed through pte_wrprotect(), and the result installed
 * with an update_va_mapping hypercall.  BUGs if no pte is found or the
 * hypercall returns non-zero.
 */
void make_lowmem_page_readonly(void *vaddr)
{
	unsigned long va = (unsigned long)vaddr;
	unsigned int level;
	pte_t *ptep;
	pte_t ro_pte;

	ptep = lookup_address(va, &level);
	BUG_ON(ptep == NULL);

	ro_pte = pte_wrprotect(*ptep);

	if (HYPERVISOR_update_va_mapping(va, ro_pte, 0) != 0)
		BUG();
}

/*
 * Make the kernel mapping of @vaddr writable again.  The existing pte
 * is looked up, passed through pte_mkwrite(), and the result installed
 * with an update_va_mapping hypercall.  BUGs if no pte is found or the
 * hypercall returns non-zero.
 */
void make_lowmem_page_readwrite(void *vaddr)
{
	unsigned long va = (unsigned long)vaddr;
	unsigned int level;
	pte_t *ptep;
	pte_t rw_pte;

	ptep = lookup_address(va, &level);
	BUG_ON(ptep == NULL);

	rw_pte = pte_mkwrite(*ptep);

	if (HYPERVISOR_update_va_mapping(va, rw_pte, 0) != 0)
		BUG();
}


/*
 * Install @val into the pmd slot at @ptr.  A single mmu_update request
 * (machine address of the slot, machine value of the entry) is placed
 * in a multicall entry and handed to xen_mc_issue(PARAVIRT_LAZY_MMU).
 * Preemption stays disabled from reserving the entry until it has been
 * issued.
 */
void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
	struct mmu_update *req;
	struct multicall_space slot;

	preempt_disable();

	slot = xen_mc_entry(sizeof(*req));
	req = slot.args;

	req->val = pmd_val_ma(val);
	req->ptr = virt_to_machine(ptr).maddr;
	MULTI_mmu_update(slot.mc, req, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

/*
 * Map the kernel virtual address @vaddr to machine frame @mfn with
 * protection @flags.  Only the final pte is written: the pgd, pud and
 * pmd entries covering @vaddr in swapper_pg_dir must already be
 * populated, otherwise this BUGs.
 */
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
	pgd_t *pgd = &swapper_pg_dir[pgd_index(vaddr)];
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (pgd_none(*pgd)) {
		BUG();
		return;
	}

	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		BUG();
		return;
	}

	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		BUG();
		return;
	}

	pte = pte_offset_kernel(pmd, vaddr);

	/* the <mfn,flags> pair goes in untranslated, which also lets
	   callers clear an entry */
	xen_set_pte(pte, mfn_pte(mfn, flags));

	/*
	 * Flushing just this one address is sufficient.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

/*
 * Set the pte at @ptep, which maps @addr in @mm, to @pteval.
 *
 * When @mm is the current mm or init_mm the update goes through an
 * update_va_mapping hypercall: batched as a multicall in lazy MMU
 * mode, issued directly otherwise.  In every other case, and when the
 * direct hypercall returns non-zero, the pte is stored with
 * xen_set_pte().
 */
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, pte_t pteval)
{
	const int is_init_mm = (mm == &init_mm);
	int updated = 0;

	/* init_mm may be updated without a lock; keep preemption off
	   for the duration of our own update */
	if (is_init_mm)
		preempt_disable();

	if (is_init_mm || mm == current->mm) {
		if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
			struct multicall_space mcs = xen_mc_entry(0);

			MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
			xen_mc_issue(PARAVIRT_LAZY_MMU);
			updated = 1;
		} else if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0) {
			updated = 1;
		}
	}

	/* some other mm, or the direct hypercall returned non-zero */
	if (!updated)
		xen_set_pte(ptep, pteval);

	if (is_init_mm)
		preempt_enable();
}

/*
 * Return the value held in @pte.  A present entry is run through
 * machine_to_phys() first (with _PAGE_PRESENT or-ed back in); a
 * non-present entry is returned exactly as stored.
 */
pteval_t xen_pte_val(pte_t pte)
{
	pteval_t raw = pte.pte;

	if (!(raw & _PAGE_PRESENT))
		return raw;

	return machine_to_phys(XMADDR(raw)).paddr | _PAGE_PRESENT;
}

/*
 * Return the value held in @pgd.  A present entry is run through
 * machine_to_phys() first (with _PAGE_PRESENT or-ed back in); a
 * non-present entry is returned exactly as stored.
 */
pgdval_t xen_pgd_val(pgd_t pgd)
{
	pgdval_t raw = pgd.pgd;

	if (!(raw & _PAGE_PRESENT))
		return raw;

	return machine_to_phys(XMADDR(raw)).paddr | _PAGE_PRESENT;
}

/*
 * Build a pte_t from the raw value @pte.  A present value is converted
 * with phys_to_machine() and has _PAGE_PCD and _PAGE_PWT cleared; a
 * non-present value is stored unchanged.
 */
pte_t xen_make_pte(pteval_t pte)
{
	pte_t result;

	if (pte & _PAGE_PRESENT) {
		pte = phys_to_machine(XPADDR(pte)).maddr;
		pte &= ~(_PAGE_PCD | _PAGE_PWT);
	}

	result.pte = pte;
	return result;
}

/*
 * Build a pgd_t from the raw value @pgd.  A present value is converted
 * with phys_to_machine(); a non-present value is stored unchanged.
 */
pgd_t xen_make_pgd(pgdval_t pgd)
{
	pgd_t entry;

	if (pgd & _PAGE_PRESENT)
		pgd = phys_to_machine(XPADDR(pgd)).maddr;

	entry.pgd = pgd;
	return entry;
}

/*
 * Return the value held in @pmd.  A present entry is run through
 * machine_to_phys() first (with _PAGE_PRESENT or-ed back in); a
 * non-present entry is returned exactly as stored.
 */
pmdval_t xen_pmd_val(pmd_t pmd)
{
	pmdval_t raw = native_pmd_val(pmd);

	if (!(raw & _PAGE_PRESENT))
		return raw;

	return machine_to_phys(XMADDR(raw)).paddr | _PAGE_PRESENT;
}

/*
 * Install @val into the pud slot at @ptr.  A single mmu_update request
 * (machine address of the slot, machine value of the entry) is placed
 * in a multicall entry and handed to xen_mc_issue(PARAVIRT_LAZY_MMU).
 * Preemption stays disabled from reserving the entry until it has been
 * issued.
 */
void xen_set_pud(pud_t *ptr, pud_t val)
{
	struct mmu_update *req;
	struct multicall_space slot;

	preempt_disable();

	slot = xen_mc_entry(sizeof(*req));
	req = slot.args;

	req->val = pud_val_ma(val);
	req->ptr = virt_to_machine(ptr).maddr;
	MULTI_mmu_update(slot.mc, req, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

/*
 * Store @pte into *@ptep as two separate writes of its halves: the
 * high half first, then a write barrier, then the low half.
 * NOTE(review): presumably the low half goes last because it carries
 * the present bit, so the entry only becomes valid once complete -
 * confirm against the pte layout.
 */
void xen_set_pte(pte_t *ptep, pte_t pte)
{
	ptep->pte_high = pte.pte_high;
	smp_wmb();		/* high half must be written before the low half */
	ptep->pte_low = pte.pte_low;
}

/*
 * Store @pte into *@ptep with one set_64bit() write of the value
 * returned by pte_val_ma().
 */
void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
	u64 *slot = (u64 *)ptep;

	set_64bit(slot, pte_val_ma(pte));
}

/*
 * Clear the pte at @ptep by zeroing its two halves separately: the low
 * half first, then a write barrier, then the high half.  @mm and @addr
 * are not used.
 */
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	ptep->pte_low = 0;
	smp_wmb();		/* make sure low gets written first */
	ptep->pte_high = 0;
}

/* Clear the pmd entry at @pmdp by setting it to a zero entry via xen_set_pmd(). */
void xen_pmd_clear(pmd_t *pmdp)
{
	pmd_t empty = __pmd(0);

	xen_set_pmd(pmdp, empty);
}

/*
 * Build a pmd_t from the raw value @pmd.  A present value is converted
 * with phys_to_machine() before being wrapped by native_make_pmd(); a
 * non-present value is wrapped unchanged.
 */
pmd_t xen_make_pmd(pmdval_t pmd)
{
	pmdval_t val = pmd;

	if (val & _PAGE_PRESENT)
		val = phys_to_machine(XPADDR(val)).maddr;

	return native_make_pmd(val);
}

/*
  (Yet another) pagetable walker.  This one is intended for pinning a
  pagetable.  This means that it walks a pagetable and calls the
  callback function on each page it finds making up the page table,
  at every level.  It walks the entire pagetable up to FIXADDR_TOP,
  but it only calls the callback on pte pages whose range ends at or
  below 'limit'.  In the normal case this will be TASK_SIZE, but at
  boot we need to go up to FIXADDR_TOP.  But the important bit is
  that we don't go beyond there, because then we start getting into
  Xen's ptes.

  The callback is invoked on pud and pmd pages only when that level
  is not folded, on the pte page of every populated pmd entry within
  the limit, and last of all on the pgd page itself.  The return
  value is the OR of all callback results.  Nothing is walked, and 0
  is returned, when XENFEAT_auto_translated_physmap is set.
*/
static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
		    unsigned long limit)
{
	pgd_t *pgd = pgd_base;
	int flush = 0;		/* OR of every callback's return value */
	unsigned long addr = 0;
	unsigned long pgd_next;

	BUG_ON(limit > FIXADDR_TOP);

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
		pud_t *pud;
		unsigned long pud_limit, pud_next;

		pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);

		/* skip empty pgd slots */
		if (!pgd_val(*pgd))
			continue;

		pud = pud_offset(pgd, 0);

		if (PTRS_PER_PUD > 1) /* not folded */
			flush |= (*func)(virt_to_page(pud), PT_PUD);

		for (; addr != pud_limit; pud++, addr = pud_next) {
			pmd_t *pmd;
			unsigned long pmd_limit;

			pud_next = pud_addr_end(addr, pud_limit);

			/* pte pages are only visited up to 'limit' */
			if (pud_next < limit)
				pmd_limit = pud_next;
			else
				pmd_limit = limit;

			if (pud_none(*pud))
				continue;

			pmd = pmd_offset(pud, 0);

			if (PTRS_PER_PMD > 1) /* not folded */
				flush |= (*func)(virt_to_page(pmd), PT_PMD);

			for (; addr != pmd_limit; pmd++) {
				/* step addr to the end of the range this
				   pmd entry covers before looking at it */
				addr += (PAGE_SIZE * PTRS_PER_PTE);
				/* stop once that end lies beyond pmd_limit.
				   NOTE(review): comparing the "-1" values
				   presumably keeps this correct when addr
				   has wrapped to 0 - confirm */
				if ((pmd_limit-1) < (addr-1)) {
					addr = pmd_limit;
					break;
				}

				if (pmd_none(*pmd))
					continue;

				flush |= (*func)(pmd_page(*pmd), PT_PTE);
			}
		}
	}

	/* the pgd page itself is handled last */
	flush |= (*func)(virt_to_page(pgd_base), PT_PGD);

	return flush;
}

/*
 * Take the pte lock of pagetable page @page when split pte locks are
 * configured (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS).  Returns the lock
 * that was taken, or NULL when nothing was locked.
 */
static spinlock_t *lock_pte(struct page *page)
{
#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
	spinlock_t *ptl = __pte_lockptr(page);

	spin_lock(ptl);
	return ptl;
#else
	return NULL;
#endif
}

static void do_unlock(void *v)
{
	spinlock_t *ptl = v;
	spin_unlock(ptl);
}

static void xen_do_pin(unsigned level, unsigned long pfn)
{
	struct mmuext_op *op;
	struct multicall_space mcs;

	mcs = __xen_mc_entry(sizeof(*op));
	op = mcs.args;
	op->cmd = level;
	op->arg1.mfn = pfn_to_mfn(pfn);
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
}

/*
 * pgd_walk() callback used when pinning.  Marks @page pinned and, for
 * a lowmem page that was not already pinned, queues a multicall that
 * remaps it with PAGE_KERNEL_RO.  A pte page additionally gets an
 * MMUEXT_PIN_L1_TABLE request and is handled under its pte lock, whose
 * release is queued as a callback for when the batch completes.
 *
 * Returns 1 if an unpinned highmem page was found (kmaps then need
 * flushing), 0 otherwise.
 */
static int pin_page(struct page *page, enum pt_level level)
{
	struct multicall_space mcs;
	spinlock_t *ptl = NULL;
	unsigned long pfn;
	void *pt;

	if (TestSetPagePinned(page))
		return 0;		/* already pinned */

	if (PageHighMem(page))
		return 1;		/* unpinned highpage: kmaps need flushing */

	pt = lowmem_page_address(page);
	pfn = page_to_pfn(page);
	mcs = __xen_mc_entry(0);

	if (level == PT_PTE)
		ptl = lock_pte(page);

	MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
				pfn_pte(pfn, PAGE_KERNEL_RO),
				level == PT_PGD ? UVMF_TLB_FLUSH : 0);

	if (level == PT_PTE)
		xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);

	/* the unlock is deferred until this batch has completed */
	if (ptl != NULL)
		xen_mc_callback(do_unlock, ptl);

	return 0;
}

/* Called right after a mm has been created and before it has been
   used.  Every page of its pagetable below TASK_SIZE is handed to
   pin_page(), and then the pgd itself is pinned as an L3 table. */
void xen_pgd_pin(pgd_t *pgd)
{
	int need_kmap_flush;

	xen_mc_batch();

	need_kmap_flush = pgd_walk(pgd, pin_page, TASK_SIZE);
	if (need_kmap_flush) {
		/* issue the batch to re-enable interrupts for
		   kmap_flush_unused, then open a new one */
		xen_mc_issue(0);
		kmap_flush_unused();
		xen_mc_batch();
	}

	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
	xen_mc_issue(0);
}

/* The init_mm pagetable is really pinned as soon as it's created, but
   that's before we have page structures to store the bits.  So do all
   the book-keeping now. */
/* pgd_walk() callback: set the Pinned flag on @page.  @level is not
   used.  Always returns 0, so it never requests a flush. */
static __init int mark_pinned(struct page *page, enum pt_level level)
{
	SetPagePinned(page);
	return 0;
}

/* Walk init_mm's pagetable up to FIXADDR_TOP and flag every page that
   makes it up as pinned, using mark_pinned() as the walker callback. */
void __init xen_mark_init_mm_pinned(void)
{
	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
}

/*
 * pgd_walk() callback used when unpinning.  Clears the Pinned flag on
 * @page and, for a lowmem page that was pinned, queues a multicall
 * that remaps it with PAGE_KERNEL.  A pte page is first locked and
 * given an MMUEXT_UNPIN_TABLE request; its unlock is queued as a
 * callback for when the batch completes.
 *
 * Always returns 0.
 */
static int unpin_page(struct page *page, enum pt_level level)
{
	struct multicall_space mcs;
	spinlock_t *ptl = NULL;
	unsigned long pfn;
	void *pt;

	/* only previously pinned lowmem pages need any work */
	if (!TestClearPagePinned(page) || PageHighMem(page))
		return 0;

	pt = lowmem_page_address(page);
	pfn = page_to_pfn(page);

	if (level == PT_PTE) {
		ptl = lock_pte(page);
		xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
	}

	mcs = __xen_mc_entry(0);
	MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
				pfn_pte(pfn, PAGE_KERNEL),
				level == PT_PGD ? UVMF_TLB_FLUSH : 0);

	/* unlock when batch completed */
	if (ptl != NULL)
		xen_mc_callback(do_unlock, ptl);

	return 0;		/* never need to flush on unpin */
}

/* Release a pagetable's pages back as normal RW: the pgd is unpinned
   first, then every page below TASK_SIZE goes through unpin_page(),
   all within a single multicall batch. */
static void xen_pgd_unpin(pgd_t *pgd)
{
	const unsigned long pgd_pfn = PFN_DOWN(__pa(pgd));

	xen_mc_batch();

	xen_do_pin(MMUEXT_UNPIN_TABLE, pgd_pfn);
	pgd_walk(pgd, unpin_page, TASK_SIZE);

	xen_mc_issue(0);
}

/* Pin @next's pagetable while holding @next's page_table_lock.  @prev
   is not used. */
void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
	spin_lock(&next->page_table_lock);
	xen_pgd_pin(next->pgd);
	spin_unlock(&next->page_table_lock);
}

/* Pin the pagetable of the new @mm while holding its page_table_lock.
   @oldmm is not used. */
void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
	spin_lock(&mm->page_table_lock);
	xen_pgd_pin(mm->pgd);
	spin_unlock(&mm->page_table_lock);
}


#ifdef CONFIG_SMP
/* Another cpu may still have its %cr3 pointing at the pagetable, so
   we need to repoint it somewhere else before we can unpin it.  This
   is the cross-cpu callback that does so; @info is the mm in
   question. */
static void drop_other_mm_ref(void *info)
{
	struct mm_struct *dying = info;

	if (__get_cpu_var(cpu_tlbstate).active_mm == dying)
		leave_mm(smp_processor_id());

	/* Done, unless this cpu's recorded cr3 is still a stale
	   reference to the pagetable. */
	if (x86_read_percpu(xen_current_cr3) != __pa(dying->pgd))
		return;

	/* Make sure the stale reference has been flushed. */
	load_cr3(swapper_pg_dir);
	arch_flush_lazy_cpu_mode();
}

static void drop_mm_ref(struct mm_struct *mm)
{
	cpumask_t mask;
	unsigned cpu;

	if (current->active_mm == mm) {
		if (current->mm == mm)
			load_cr3(swapper_pg_dir);
		else
			leave_mm(smp_processor_id());
		arch_flush_lazy_cpu_mode();
	}

	/* Get the "official" set of cpus referring to our pagetable. */
	mask = mm->cpu_vm_mask;

	/* It's possible that a vcpu may have a stale reference to our
	   cr3, because its in lazy mode, and it hasn't yet flushed
	   its set of pending hypercalls yet.  In this case, we can
	   look at its actual current cr3 value, and force it to flush
	   if needed. */
	for_each_online_cpu(cpu) {
		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
			cpu_set(cpu, mask);
	}

	if (!cpus_empty(mask))
		xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
}
#else
/* !CONFIG_SMP variant: there are no other cpus to chase, so just load
   swapper_pg_dir if @mm is our active mm. */
static void drop_mm_ref(struct mm_struct *mm)
{
	if (current->active_mm != mm)
		return;

	load_cr3(swapper_pg_dir);
}
#endif

/*
 * While a process runs, Xen pins its pagetables, which means that the
 * hypervisor forces it to be read-only, and it controls all updates
 * to it.  This means that all pagetable updates have to go via the
 * hypervisor, which is moderately expensive.
 *
 * Since we're pulling the pagetable down, we switch to use init_mm,
 * unpin old process pagetable and mark it all read-write, which
 * allows further operations on it to be simple memory accesses.
 *
 * The only subtle point is that another CPU may be still using the
 * pagetable because of lazy tlb flushing.  This means we need to
 * switch all CPUs off this pagetable before we can unpin it.
 */
void xen_exit_mmap(struct mm_struct *mm)
{
	int pinned;

	/* stay on this cpu while references to the mm are dropped */
	get_cpu();
	drop_mm_ref(mm);
	put_cpu();

	spin_lock(&mm->page_table_lock);

	/* the error exit path of execve can get here with a pgd that
	   was never pinned */
	pinned = PagePinned(virt_to_page(mm->pgd));
	if (pinned)
		xen_pgd_unpin(mm->pgd);

	spin_unlock(&mm->page_table_lock);
}
Commit	Line	Data
3b827c1b JF	1	/*
	2	* Xen mmu operations
	3	*
	4	* This file contains the various mmu fetch and update operations.
	5	* The most important job they must perform is the mapping between the
	6	* domain's pfn and the overall machine mfns.
	7	*
	8	* Xen allows guests to directly update the pagetable, in a controlled
	9	* fashion. In other words, the guest modifies the same pagetable
	10	* that the CPU actually uses, which eliminates the overhead of having
	11	* a separate shadow pagetable.
	12	*
	13	* In order to allow this, it falls on the guest domain to map its
	14	* notion of a "physical" pfn - which is just a domain-local linear
	15	* address - into a real "machine address" which the CPU's MMU can
	16	* use.
	17	*
	18	* A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
	19	* inserted directly into the pagetable. When creating a new
	20	* pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
	21	* when reading the content back with __(pgd\|pmd\|pte)_val, it converts
	22	* the mfn back into a pfn.
	23	*
	24	* The other constraint is that all pages which make up a pagetable
	25	* must be mapped read-only in the guest. This prevents uncontrolled
	26	* guest updates to the pagetable. Xen strictly enforces this, and
	27	* will disallow any pagetable update which will end up mapping a
	28	* pagetable page RW, and will disallow using any writable page as a
	29	* pagetable.
	30	*
	31	* Naively, when loading %cr3 with the base of a new pagetable, Xen
	32	* would need to validate the whole pagetable before going on.
	33	* Naturally, this is quite slow. The solution is to "pin" a
	34	* pagetable, which enforces all the constraints on the pagetable even
	35	* when it is not actively in use. This menas that Xen can be assured
	36	* that it is still valid when you do load it into %cr3, and doesn't
	37	* need to revalidate it.
	38	*
	39	* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
	40	*/
f120f13e	41	#include <linux/sched.h>
f4f97b3e	42	#include <linux/highmem.h>
3b827c1b	43	#include <linux/bug.h>
3b827c1b JF	44
	45	#include <asm/pgtable.h>
	46	#include <asm/tlbflush.h>
	47	#include <asm/mmu_context.h>
f4f97b3e	48	#include <asm/paravirt.h>
3b827c1b JF	49
3b827c1b JF	50	#include <asm/xen/hypercall.h>
f4f97b3e	51	#include <asm/xen/hypervisor.h>
3b827c1b JF	52
	53	#include <xen/page.h>
	54	#include <xen/interface/xen.h>
	55
f4f97b3e	56	#include "multicalls.h"
3b827c1b JF	57	#include "mmu.h"
	58
	59	xmaddr_t arbitrary_virt_to_machine(unsigned long address)
	60	{
da7bfc50	61	unsigned int level;
f0646e43	62	pte_t *pte = lookup_address(address, &level);
3b827c1b JF	63	unsigned offset = address & PAGE_MASK;
	64
	65	BUG_ON(pte == NULL);
	66
	67	return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
	68	}
	69
	70	void make_lowmem_page_readonly(void *vaddr)
	71	{
	72	pte_t *pte, ptev;
	73	unsigned long address = (unsigned long)vaddr;
da7bfc50	74	unsigned int level;
3b827c1b	75
f0646e43	76	pte = lookup_address(address, &level);
3b827c1b JF	77	BUG_ON(pte == NULL);
	78
	79	ptev = pte_wrprotect(*pte);
	80
	81	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
	82	BUG();
	83	}
	84
	85	void make_lowmem_page_readwrite(void *vaddr)
	86	{
	87	pte_t *pte, ptev;
	88	unsigned long address = (unsigned long)vaddr;
da7bfc50	89	unsigned int level;
3b827c1b	90
f0646e43	91	pte = lookup_address(address, &level);
3b827c1b JF	92	BUG_ON(pte == NULL);
	93
	94	ptev = pte_mkwrite(*pte);
	95
	96	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
	97	BUG();
	98	}
	99
	100
3b827c1b JF	101	void xen_set_pmd(pmd_t *ptr, pmd_t val)
3b827c1b JF	102	{
d66bf8fc JF	103	struct multicall_space mcs;
d66bf8fc JF	104	struct mmu_update *u;
3b827c1b	105
d66bf8fc JF	106	preempt_disable();
	107
	108	mcs = xen_mc_entry(sizeof(*u));
	109	u = mcs.args;
	110	u->ptr = virt_to_machine(ptr).maddr;
	111	u->val = pmd_val_ma(val);
	112	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
	113
	114	xen_mc_issue(PARAVIRT_LAZY_MMU);
	115
	116	preempt_enable();
3b827c1b JF	117	}
3b827c1b JF	118
3b827c1b JF	119	/*
	120	* Associate a virtual page frame with a given physical page frame
	121	* and protection flags for that frame.
	122	*/
	123	void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
	124	{
	125	pgd_t *pgd;
	126	pud_t *pud;
	127	pmd_t *pmd;
	128	pte_t *pte;
	129
	130	pgd = swapper_pg_dir + pgd_index(vaddr);
	131	if (pgd_none(*pgd)) {
	132	BUG();
	133	return;
	134	}
	135	pud = pud_offset(pgd, vaddr);
	136	if (pud_none(*pud)) {
	137	BUG();
	138	return;
	139	}
	140	pmd = pmd_offset(pud, vaddr);
	141	if (pmd_none(*pmd)) {
	142	BUG();
	143	return;
	144	}
	145	pte = pte_offset_kernel(pmd, vaddr);
	146	/* <mfn,flags> stored as-is, to permit clearing entries */
	147	xen_set_pte(pte, mfn_pte(mfn, flags));
	148
	149	/*
	150	* It's enough to flush this one mapping.
	151	* (PGE mappings get flushed as well)
	152	*/
	153	__flush_tlb_one(vaddr);
	154	}
	155
	156	void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
	157	pte_t *ptep, pte_t pteval)
	158	{
2bd50036 JF	159	/* updates to init_mm may be done without lock */
	160	if (mm == &init_mm)
	161	preempt_disable();
	162
d66bf8fc	163	if (mm == current->mm \|\| mm == &init_mm) {
8965c1c0	164	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
d66bf8fc JF	165	struct multicall_space mcs;
	166	mcs = xen_mc_entry(0);
	167
	168	MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
	169	xen_mc_issue(PARAVIRT_LAZY_MMU);
2bd50036	170	goto out;
d66bf8fc JF	171	} else
d66bf8fc JF	172	if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
2bd50036	173	goto out;
d66bf8fc JF	174	}
d66bf8fc JF	175	xen_set_pte(ptep, pteval);
2bd50036 JF	176
	177	out:
	178	if (mm == &init_mm)
	179	preempt_enable();
3b827c1b JF	180	}
3b827c1b JF	181
947a69c9 JF	182	pteval_t xen_pte_val(pte_t pte)
	183	{
	184	pteval_t ret = pte.pte;
	185
	186	if (ret & _PAGE_PRESENT)
	187	ret = machine_to_phys(XMADDR(ret)).paddr \| _PAGE_PRESENT;
	188
	189	return ret;
	190	}
	191
	192	pgdval_t xen_pgd_val(pgd_t pgd)
	193	{
	194	pgdval_t ret = pgd.pgd;
	195	if (ret & _PAGE_PRESENT)
	196	ret = machine_to_phys(XMADDR(ret)).paddr \| _PAGE_PRESENT;
	197	return ret;
	198	}
	199
	200	pte_t xen_make_pte(pteval_t pte)
	201	{
	202	if (pte & _PAGE_PRESENT) {
	203	pte = phys_to_machine(XPADDR(pte)).maddr;
	204	pte &= ~(_PAGE_PCD \| _PAGE_PWT);
	205	}
	206
	207	return (pte_t){ .pte = pte };
	208	}
	209
	210	pgd_t xen_make_pgd(pgdval_t pgd)
	211	{
	212	if (pgd & _PAGE_PRESENT)
	213	pgd = phys_to_machine(XPADDR(pgd)).maddr;
	214
	215	return (pgd_t){ pgd };
	216	}
	217
	218	pmdval_t xen_pmd_val(pmd_t pmd)
	219	{
	220	pmdval_t ret = native_pmd_val(pmd);
	221	if (ret & _PAGE_PRESENT)
	222	ret = machine_to_phys(XMADDR(ret)).paddr \| _PAGE_PRESENT;
	223	return ret;
	224	}
3843fc25	225
f4f97b3e JF	226	void xen_set_pud(pud_t *ptr, pud_t val)
f4f97b3e JF	227	{
d66bf8fc JF	228	struct multicall_space mcs;
d66bf8fc JF	229	struct mmu_update *u;
f4f97b3e	230
d66bf8fc JF	231	preempt_disable();
	232
	233	mcs = xen_mc_entry(sizeof(*u));
	234	u = mcs.args;
	235	u->ptr = virt_to_machine(ptr).maddr;
	236	u->val = pud_val_ma(val);
	237	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
	238
	239	xen_mc_issue(PARAVIRT_LAZY_MMU);
	240
	241	preempt_enable();
f4f97b3e JF	242	}
	243
	244	void xen_set_pte(pte_t *ptep, pte_t pte)
	245	{
	246	ptep->pte_high = pte.pte_high;
	247	smp_wmb();
	248	ptep->pte_low = pte.pte_low;
	249	}
	250
3b827c1b JF	251	void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
	252	{
	253	set_64bit((u64 *)ptep, pte_val_ma(pte));
	254	}
	255
	256	void xen_pte_clear(struct mm_struct mm, unsigned long addr, pte_t ptep)
	257	{
	258	ptep->pte_low = 0;
	259	smp_wmb(); /* make sure low gets written first */
	260	ptep->pte_high = 0;
	261	}
	262
	263	void xen_pmd_clear(pmd_t *pmdp)
	264	{
	265	xen_set_pmd(pmdp, __pmd(0));
	266	}
	267
abf33038	268	pmd_t xen_make_pmd(pmdval_t pmd)
3b827c1b	269	{
430442e3	270	if (pmd & _PAGE_PRESENT)
3b827c1b JF	271	pmd = phys_to_machine(XPADDR(pmd)).maddr;
3b827c1b JF	272
947a69c9	273	return native_make_pmd(pmd);
3b827c1b	274	}
3b827c1b	275
f4f97b3e JF	276	/*
	277	(Yet another) pagetable walker. This one is intended for pinning a
	278	pagetable. This means that it walks a pagetable and calls the
	279	callback function on each page it finds making up the page table,
	280	at every level. It walks the entire pagetable, but it only bothers
	281	pinning pte pages which are below pte_limit. In the normal case
	282	this will be TASK_SIZE, but at boot we need to pin up to
	283	FIXADDR_TOP. But the important bit is that we don't pin beyond
	284	there, because then we start getting into Xen's ptes.
	285	*/
74260714	286	static int pgd_walk(pgd_t pgd_base, int (func)(struct page *, enum pt_level),
f4f97b3e	287	unsigned long limit)
3b827c1b JF	288	{
3b827c1b JF	289	pgd_t *pgd = pgd_base;
f4f97b3e JF	290	int flush = 0;
	291	unsigned long addr = 0;
	292	unsigned long pgd_next;
	293
	294	BUG_ON(limit > FIXADDR_TOP);
3b827c1b JF	295
3b827c1b JF	296	if (xen_feature(XENFEAT_auto_translated_physmap))
f4f97b3e JF	297	return 0;
	298
	299	for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
	300	pud_t *pud;
	301	unsigned long pud_limit, pud_next;
3b827c1b	302
f4f97b3e JF	303	pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
	304
	305	if (!pgd_val(*pgd))
3b827c1b	306	continue;
f4f97b3e	307
3b827c1b JF	308	pud = pud_offset(pgd, 0);
	309
	310	if (PTRS_PER_PUD > 1) /* not folded */
74260714	311	flush \|= (*func)(virt_to_page(pud), PT_PUD);
f4f97b3e JF	312
	313	for (; addr != pud_limit; pud++, addr = pud_next) {
	314	pmd_t *pmd;
	315	unsigned long pmd_limit;
	316
	317	pud_next = pud_addr_end(addr, pud_limit);
	318
	319	if (pud_next < limit)
	320	pmd_limit = pud_next;
	321	else
	322	pmd_limit = limit;
3b827c1b	323
3b827c1b JF	324	if (pud_none(*pud))
3b827c1b JF	325	continue;
f4f97b3e	326
3b827c1b JF	327	pmd = pmd_offset(pud, 0);
	328
	329	if (PTRS_PER_PMD > 1) /* not folded */
74260714	330	flush \|= (*func)(virt_to_page(pmd), PT_PMD);
f4f97b3e JF	331
	332	for (; addr != pmd_limit; pmd++) {
	333	addr += (PAGE_SIZE * PTRS_PER_PTE);
	334	if ((pmd_limit-1) < (addr-1)) {
	335	addr = pmd_limit;
	336	break;
	337	}
3b827c1b	338
3b827c1b JF	339	if (pmd_none(*pmd))
	340	continue;
	341
74260714	342	flush \|= (func)(pmd_page(pmd), PT_PTE);
3b827c1b JF	343	}
	344	}
	345	}
	346
74260714	347	flush \|= (*func)(virt_to_page(pgd_base), PT_PGD);
f4f97b3e JF	348
f4f97b3e JF	349	return flush;
3b827c1b JF	350	}
3b827c1b JF	351
74260714 JF	352	static spinlock_t lock_pte(struct page page)
	353	{
	354	spinlock_t *ptl = NULL;
	355
	356	#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
	357	ptl = __pte_lockptr(page);
	358	spin_lock(ptl);
	359	#endif
	360
	361	return ptl;
	362	}
	363
	364	static void do_unlock(void *v)
	365	{
	366	spinlock_t *ptl = v;
	367	spin_unlock(ptl);
	368	}
	369
	370	static void xen_do_pin(unsigned level, unsigned long pfn)
	371	{
	372	struct mmuext_op *op;
	373	struct multicall_space mcs;
	374
	375	mcs = __xen_mc_entry(sizeof(*op));
	376	op = mcs.args;
	377	op->cmd = level;
	378	op->arg1.mfn = pfn_to_mfn(pfn);
	379	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
	380	}
	381
	382	static int pin_page(struct page *page, enum pt_level level)
f4f97b3e	383	{
d60cd46b	384	unsigned pgfl = TestSetPagePinned(page);
f4f97b3e JF	385	int flush;
	386
	387	if (pgfl)
	388	flush = 0; /* already pinned */
	389	else if (PageHighMem(page))
	390	/* kmaps need flushing if we found an unpinned
	391	highpage */
	392	flush = 1;
	393	else {
	394	void *pt = lowmem_page_address(page);
	395	unsigned long pfn = page_to_pfn(page);
	396	struct multicall_space mcs = __xen_mc_entry(0);
74260714	397	spinlock_t *ptl;
f4f97b3e JF	398
	399	flush = 0;
	400
74260714 JF	401	ptl = NULL;
	402	if (level == PT_PTE)
	403	ptl = lock_pte(page);
	404
f4f97b3e JF	405	MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
f4f97b3e JF	406	pfn_pte(pfn, PAGE_KERNEL_RO),
74260714 JF	407	level == PT_PGD ? UVMF_TLB_FLUSH : 0);
	408
	409	if (level == PT_PTE)
	410	xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
	411
	412	if (ptl) {
	413	/* Queue a deferred unlock for when this batch
	414	is completed. */
	415	xen_mc_callback(do_unlock, ptl);
	416	}
f4f97b3e JF	417	}
	418
	419	return flush;
	420	}
3b827c1b	421
f4f97b3e JF	422	/* This is called just after a mm has been created, but it has not
	423	been used yet. We need to make sure that its pagetable is all
	424	read-only, and can be pinned. */
3b827c1b JF	425	void xen_pgd_pin(pgd_t *pgd)
3b827c1b JF	426	{
f4f97b3e	427	xen_mc_batch();
3b827c1b	428
f87e4cac JF	429	if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
	430	/* re-enable interrupts for kmap_flush_unused */
	431	xen_mc_issue(0);
f4f97b3e	432	kmap_flush_unused();
f87e4cac JF	433	xen_mc_batch();
f87e4cac JF	434	}
f4f97b3e	435
3843fc25	436	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
f4f97b3e	437	xen_mc_issue(0);
3b827c1b JF	438	}
3b827c1b JF	439
f4f97b3e JF	440	/* The init_mm pagetable is really pinned as soon as its created, but
	441	that's before we have page structures to store the bits. So do all
	442	the book-keeping now. */
74260714	443	static __init int mark_pinned(struct page *page, enum pt_level level)
3b827c1b	444	{
f4f97b3e JF	445	SetPagePinned(page);
	446	return 0;
	447	}
3b827c1b	448
f4f97b3e JF	449	void __init xen_mark_init_mm_pinned(void)
	450	{
	451	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
	452	}
3b827c1b	453
74260714	454	static int unpin_page(struct page *page, enum pt_level level)
f4f97b3e	455	{
d60cd46b	456	unsigned pgfl = TestClearPagePinned(page);
3b827c1b	457
f4f97b3e JF	458	if (pgfl && !PageHighMem(page)) {
	459	void *pt = lowmem_page_address(page);
	460	unsigned long pfn = page_to_pfn(page);
74260714 JF	461	spinlock_t *ptl = NULL;
	462	struct multicall_space mcs;
	463
	464	if (level == PT_PTE) {
	465	ptl = lock_pte(page);
	466
	467	xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
	468	}
	469
	470	mcs = __xen_mc_entry(0);
f4f97b3e JF	471
	472	MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
	473	pfn_pte(pfn, PAGE_KERNEL),
74260714 JF	474	level == PT_PGD ? UVMF_TLB_FLUSH : 0);
	475
	476	if (ptl) {
	477	/* unlock when batch completed */
	478	xen_mc_callback(do_unlock, ptl);
	479	}
f4f97b3e JF	480	}
	481
	482	return 0; /* never need to flush on unpin */
3b827c1b JF	483	}
3b827c1b JF	484
f4f97b3e JF	485	/* Release a pagetables pages back as normal RW */
	486	static void xen_pgd_unpin(pgd_t *pgd)
	487	{
f4f97b3e JF	488	xen_mc_batch();
f4f97b3e JF	489
74260714	490	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
f4f97b3e JF	491
	492	pgd_walk(pgd, unpin_page, TASK_SIZE);
	493
	494	xen_mc_issue(0);
	495	}
3b827c1b JF	496
	497	void xen_activate_mm(struct mm_struct prev, struct mm_struct next)
	498	{
f4f97b3e	499	spin_lock(&next->page_table_lock);
3b827c1b	500	xen_pgd_pin(next->pgd);
f4f97b3e	501	spin_unlock(&next->page_table_lock);
3b827c1b JF	502	}
	503
	504	void xen_dup_mmap(struct mm_struct oldmm, struct mm_struct mm)
	505	{
f4f97b3e	506	spin_lock(&mm->page_table_lock);
3b827c1b	507	xen_pgd_pin(mm->pgd);
f4f97b3e	508	spin_unlock(&mm->page_table_lock);
3b827c1b JF	509	}
3b827c1b JF	510
3b827c1b	511
f87e4cac JF	512	#ifdef CONFIG_SMP
	513	/* Another cpu may still have their %cr3 pointing at the pagetable, so
	514	we need to repoint it somewhere else before we can unpin it. */
	515	static void drop_other_mm_ref(void *info)
	516	{
	517	struct mm_struct *mm = info;
3b827c1b	518
f87e4cac JF	519	if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
f87e4cac JF	520	leave_mm(smp_processor_id());
9f79991d JF	521
	522	/* If this cpu still has a stale cr3 reference, then make sure
	523	it has been flushed. */
	524	if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
	525	load_cr3(swapper_pg_dir);
	526	arch_flush_lazy_cpu_mode();
	527	}
f87e4cac	528	}
3b827c1b	529
f87e4cac JF	530	static void drop_mm_ref(struct mm_struct *mm)
f87e4cac JF	531	{
9f79991d JF	532	cpumask_t mask;
	533	unsigned cpu;
	534
f87e4cac JF	535	if (current->active_mm == mm) {
	536	if (current->mm == mm)
	537	load_cr3(swapper_pg_dir);
	538	else
	539	leave_mm(smp_processor_id());
9f79991d JF	540	arch_flush_lazy_cpu_mode();
	541	}
	542
	543	/* Get the "official" set of cpus referring to our pagetable. */
	544	mask = mm->cpu_vm_mask;
	545
	546	/* It's possible that a vcpu may have a stale reference to our
	547	cr3, because its in lazy mode, and it hasn't yet flushed
	548	its set of pending hypercalls yet. In this case, we can
	549	look at its actual current cr3 value, and force it to flush
	550	if needed. */
	551	for_each_online_cpu(cpu) {
	552	if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
	553	cpu_set(cpu, mask);
3b827c1b JF	554	}
3b827c1b JF	555
9f79991d JF	556	if (!cpus_empty(mask))
9f79991d JF	557	xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
f87e4cac JF	558	}
	559	#else
	560	static void drop_mm_ref(struct mm_struct *mm)
	561	{
	562	if (current->active_mm == mm)
	563	load_cr3(swapper_pg_dir);
	564	}
	565	#endif
	566
	567	/*
	568	* While a process runs, Xen pins its pagetables, which means that the
	569	* hypervisor forces it to be read-only, and it controls all updates
	570	* to it. This means that all pagetable updates have to go via the
	571	* hypervisor, which is moderately expensive.
	572	*
	573	* Since we're pulling the pagetable down, we switch to use init_mm,
	574	* unpin old process pagetable and mark it all read-write, which
	575	* allows further operations on it to be simple memory accesses.
	576	*
	577	* The only subtle point is that another CPU may be still using the
	578	* pagetable because of lazy tlb flushing. This means we need need to
	579	* switch all CPUs off this pagetable before we can unpin it.
	580	*/
	581	void xen_exit_mmap(struct mm_struct *mm)
	582	{
	583	get_cpu(); /* make sure we don't move around */
	584	drop_mm_ref(mm);
	585	put_cpu();
3b827c1b	586
f120f13e	587	spin_lock(&mm->page_table_lock);
df912ea4 JF	588
	589	/* pgd may not be pinned in the error exit path of execve */
	590	if (PagePinned(virt_to_page(mm->pgd)))
	591	xen_pgd_unpin(mm->pgd);
74260714	592
f120f13e	593	spin_unlock(&mm->page_table_lock);
3b827c1b	594	}