[mirror_ubuntu-zesty-kernel.git] / arch / x86 / xen / mmu.c

/*
 * Xen mmu operations
 *
 * This file contains the various mmu fetch and update operations.
 * The most important job they must perform is the mapping between the
 * domain's pfn and the overall machine mfns.
 *
 * Xen allows guests to directly update the pagetable, in a controlled
 * fashion.  In other words, the guest modifies the same pagetable
 * that the CPU actually uses, which eliminates the overhead of having
 * a separate shadow pagetable.
 *
 * In order to allow this, it falls on the guest domain to map its
 * notion of a "physical" pfn - which is just a domain-local linear
 * address - into a real "machine address" which the CPU's MMU can
 * use.
 *
 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
 * inserted directly into the pagetable.  When creating a new
 * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
 * when reading the content back with __(pgd|pmd|pte)_val, it converts
 * the mfn back into a pfn.
 *
 * The other constraint is that all pages which make up a pagetable
 * must be mapped read-only in the guest.  This prevents uncontrolled
 * guest updates to the pagetable.  Xen strictly enforces this, and
 * will disallow any pagetable update which will end up mapping a
 * pagetable page RW, and will disallow using any writable page as a
 * pagetable.
 *
 * Naively, when loading %cr3 with the base of a new pagetable, Xen
 * would need to validate the whole pagetable before going on.
 * Naturally, this is quite slow.  The solution is to "pin" a
 * pagetable, which enforces all the constraints on the pagetable even
 * when it is not actively in use.  This means that Xen can be assured
 * that it is still valid when you do load it into %cr3, and doesn't
 * need to revalidate it.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/bug.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/paravirt.h>

#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>

#include <xen/page.h>
#include <xen/interface/xen.h>

#include "multicalls.h"
#include "mmu.h"

/*
 * Translate an arbitrary kernel virtual address into a machine address
 * by looking up its pte and combining the pte's mfn with the offset of
 * the address inside its page.  BUGs if lookup_address() finds no pte.
 */
xmaddr_t arbitrary_virt_to_machine(unsigned long address)
{
	unsigned int level;
	pte_t *pte = lookup_address(address, &level);
	/* PAGE_MASK keeps the frame bits; the in-page offset is its complement */
	unsigned offset = address & ~PAGE_MASK;

	BUG_ON(pte == NULL);

	/* widen before shifting so a 32-bit unsigned long cannot truncate
	   machine addresses at or above 4GB */
	return XMADDR(((unsigned long long)pte_mfn(*pte) << PAGE_SHIFT) + offset);
}

/*
 * Take write permission away from the kernel mapping of vaddr.  The
 * write-protected pte is installed through the update_va_mapping
 * hypercall; a missing pte or a failing hypercall is fatal.
 */
void make_lowmem_page_readonly(void *vaddr)
{
	unsigned long va = (unsigned long)vaddr;
	unsigned int lvl;
	pte_t *ptep = lookup_address(va, &lvl);
	pte_t ro;

	BUG_ON(ptep == NULL);

	ro = pte_wrprotect(*ptep);
	if (HYPERVISOR_update_va_mapping(va, ro, 0) != 0)
		BUG();
}

/*
 * Give write permission back to the kernel mapping of vaddr.  The
 * writable pte is installed through the update_va_mapping hypercall;
 * a missing pte or a failing hypercall is fatal.
 */
void make_lowmem_page_readwrite(void *vaddr)
{
	unsigned long va = (unsigned long)vaddr;
	unsigned int lvl;
	pte_t *ptep = lookup_address(va, &lvl);
	pte_t rw;

	BUG_ON(ptep == NULL);

	rw = pte_mkwrite(*ptep);
	if (HYPERVISOR_update_va_mapping(va, rw, 0) != 0)
		BUG();
}


/*
 * Write a pmd entry by way of a single mmu_update multicall.  The
 * request names the machine address of the slot and the machine value
 * to store.  Preemption is off while the entry is built and issued.
 */
void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
	struct mmu_update *req;
	struct multicall_space space;

	preempt_disable();

	space = xen_mc_entry(sizeof(struct mmu_update));
	req = space.args;
	req->ptr = virt_to_machine(ptr).maddr;
	req->val = pmd_val_ma(val);
	MULTI_mmu_update(space.mc, req, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

/*
 * Map the virtual page at vaddr onto machine frame mfn with the given
 * protection flags, in the kernel pagetable (swapper_pg_dir).  Every
 * upper level of the pagetable covering vaddr must already be
 * populated; an empty level is treated as a bug.
 */
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
	pgd_t *pgdp = swapper_pg_dir + pgd_index(vaddr);
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	if (pgd_none(*pgdp)) {
		BUG();
		return;
	}

	pudp = pud_offset(pgdp, vaddr);
	if (pud_none(*pudp)) {
		BUG();
		return;
	}

	pmdp = pmd_offset(pudp, vaddr);
	if (pmd_none(*pmdp)) {
		BUG();
		return;
	}

	ptep = pte_offset_kernel(pmdp, vaddr);

	/* the <mfn,flags> pair goes in verbatim, so entries can be cleared */
	xen_set_pte(ptep, mfn_pte(mfn, flags));

	/*
	 * Only this one mapping changed, so flushing it alone is enough.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

/*
 * Install pteval for addr in mm.  When mm is the current mm or
 * init_mm the update goes through update_va_mapping: queued as a
 * multicall while in lazy MMU mode, issued as a direct hypercall
 * otherwise.  For any other mm, or when the direct hypercall returns
 * non-zero, the pte is written with xen_set_pte() instead.
 */
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, pte_t pteval)
{
	struct multicall_space space;

	/* neither the current mm nor init_mm: just write the entry */
	if (mm != current->mm && mm != &init_mm) {
		xen_set_pte(ptep, pteval);
		return;
	}

	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
		space = xen_mc_entry(0);
		MULTI_update_va_mapping(space.mc, addr, pteval, 0);
		xen_mc_issue(PARAVIRT_LAZY_MMU);
		return;
	}

	/* not lazy: try the hypercall, fall back to a plain write on failure */
	if (HYPERVISOR_update_va_mapping(addr, pteval, 0) != 0)
		xen_set_pte(ptep, pteval);
}

#ifdef CONFIG_X86_PAE
/*
 * PAE: write a pud entry by way of a single mmu_update multicall.  The
 * request names the machine address of the slot and the machine value
 * to store.  Preemption is off while the entry is built and issued.
 */
void xen_set_pud(pud_t *ptr, pud_t val)
{
	struct mmu_update *req;
	struct multicall_space space;

	preempt_disable();

	space = xen_mc_entry(sizeof(struct mmu_update));
	req = space.args;
	req->ptr = virt_to_machine(ptr).maddr;
	req->val = pud_val_ma(val);
	MULTI_mmu_update(space.mc, req, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

/*
 * PAE: store a pte as two separate halves, pte_high first and pte_low
 * last, with a write barrier in between.
 * NOTE(review): presumably the low half goes last because it carries
 * the present bit, so the entry never appears valid with a stale high
 * half -- confirm against the pte layout.
 */
void xen_set_pte(pte_t *ptep, pte_t pte)
{
	ptep->pte_high = pte.pte_high;
	smp_wmb();	/* high half must be written before the low half */
	ptep->pte_low = pte.pte_low;
}

/*
 * PAE: store the machine value of pte into *ptep with one set_64bit()
 * call rather than as two halves.
 */
void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
	u64 *slot = (u64 *)ptep;

	set_64bit(slot, pte_val_ma(pte));
}

/*
 * PAE: clear a pte by zeroing its two halves, low half first.  The mm
 * and addr arguments are not used by this implementation.
 */
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	ptep->pte_low = 0;
	smp_wmb();		/* make sure low gets written first */
	ptep->pte_high = 0;
}

/* PAE: clear a pmd entry by setting it to an all-zero pmd via xen_set_pmd(). */
void xen_pmd_clear(pmd_t *pmdp)
{
	pmd_t empty = __pmd(0);

	xen_set_pmd(pmdp, empty);
}

/*
 * PAE: return the value of a pte.  A present entry holds a machine
 * address, which is converted back to a pseudo-physical one; a
 * non-present entry is returned untouched.
 */
pteval_t xen_pte_val(pte_t pte)
{
	pteval_t raw = pte.pte;

	if (!(raw & _PAGE_PRESENT))
		return raw;

	return machine_to_phys(XMADDR(raw)).paddr | _PAGE_PRESENT;
}

/*
 * PAE: return the value of a pmd entry.  A present entry holds a
 * machine address, which is converted back to a pseudo-physical one; a
 * non-present entry is returned untouched.
 */
pmdval_t xen_pmd_val(pmd_t pmd)
{
	pmdval_t raw = pmd.pmd;

	if (!(raw & _PAGE_PRESENT))
		return raw;

	return machine_to_phys(XMADDR(raw)).paddr | _PAGE_PRESENT;
}

/*
 * PAE: return the value of a pgd entry.  A present entry holds a
 * machine address, which is converted back to a pseudo-physical one; a
 * non-present entry is returned untouched.
 */
pgdval_t xen_pgd_val(pgd_t pgd)
{
	pgdval_t raw = pgd.pgd;

	if (!(raw & _PAGE_PRESENT))
		return raw;

	return machine_to_phys(XMADDR(raw)).paddr | _PAGE_PRESENT;
}

/*
 * PAE: build a pte from a pseudo-physical value.  A present value is
 * converted to a machine address and has _PAGE_PCD and _PAGE_PWT
 * cleared; a non-present value is wrapped unchanged.
 */
pte_t xen_make_pte(pteval_t pte)
{
	if (!(pte & _PAGE_PRESENT))
		return (pte_t){ .pte = pte };

	pte = phys_to_machine(XPADDR(pte)).maddr;
	pte &= ~(_PAGE_PCD | _PAGE_PWT);

	return (pte_t){ .pte = pte };
}

/*
 * PAE: build a pmd entry from a pseudo-physical value, converting it
 * to a machine address when the present bit is set.
 */
pmd_t xen_make_pmd(pmdval_t pmd)
{
	pmdval_t entry = pmd;

	if (entry & _PAGE_PRESENT)
		entry = phys_to_machine(XPADDR(entry)).maddr;

	return (pmd_t){ entry };
}

/*
 * PAE: build a pgd entry from a pseudo-physical value, converting it
 * to a machine address when the present bit is set.
 */
pgd_t xen_make_pgd(pgdval_t pgd)
{
	pgdval_t entry = pgd;

	if (entry & _PAGE_PRESENT)
		entry = phys_to_machine(XPADDR(entry)).maddr;

	return (pgd_t){ entry };
}
#else  /* !PAE */
/* Non-PAE: store a pte with a single plain assignment to *ptep. */
void xen_set_pte(pte_t *ptep, pte_t pte)
{
	*ptep = pte;
}

/*
 * Non-PAE: return the value of a pte.  A present entry holds a machine
 * address, which is converted back to a pseudo-physical one; a
 * non-present entry is returned untouched.
 */
pteval_t xen_pte_val(pte_t pte)
{
	pteval_t raw = pte.pte;

	if (!(raw & _PAGE_PRESENT))
		return raw;

	return machine_to_phys(XMADDR(raw)).paddr;
}

/*
 * Non-PAE: return the value of a pgd entry.  A present entry holds a
 * machine address, which is converted back to a pseudo-physical one; a
 * non-present entry is returned untouched.
 */
pgdval_t xen_pgd_val(pgd_t pgd)
{
	/* pgdval_t, to match the return type and the PAE variant */
	pgdval_t ret = pgd.pgd;

	if (ret & _PAGE_PRESENT)
		ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;

	return ret;
}

/*
 * Non-PAE: build a pte from a pseudo-physical value.  A present value
 * is converted to a machine address and has _PAGE_PCD and _PAGE_PWT
 * cleared; a non-present value is wrapped unchanged.
 */
pte_t xen_make_pte(pteval_t pte)
{
	if (!(pte & _PAGE_PRESENT))
		return (pte_t){ pte };

	pte = phys_to_machine(XPADDR(pte)).maddr;
	pte &= ~(_PAGE_PCD | _PAGE_PWT);

	return (pte_t){ pte };
}

/*
 * Non-PAE: build a pgd entry from a pseudo-physical value, converting
 * it to a machine address when the present bit is set.
 */
pgd_t xen_make_pgd(pgdval_t pgd)
{
	pgdval_t entry = pgd;

	if (entry & _PAGE_PRESENT)
		entry = phys_to_machine(XPADDR(entry)).maddr;

	return (pgd_t){ entry };
}
#endif	/* CONFIG_X86_PAE */

/*
  (Yet another) pagetable walker.  This one is intended for pinning a
  pagetable.  It walks the pagetable rooted at pgd_base and calls func
  on each page it finds making up the page table, at every level,
  passing the page and its level (PT_PUD, PT_PMD, PT_PTE, and last of
  all PT_PGD for the root page itself).  The pgd loop always runs up to
  FIXADDR_TOP, but the pmd loop -- which is where pte pages are handed
  to func -- stops at limit.  In the normal case limit is TASK_SIZE,
  but at boot we need to go up to FIXADDR_TOP.  The important bit is
  that we don't go beyond there, because then we start getting into
  Xen's ptes.

  Returns the OR of everything func returned.  Returns 0 without
  walking anything when XENFEAT_auto_translated_physmap is set.
*/
static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
		    unsigned long limit)
{
	pgd_t *pgd = pgd_base;
	int flush = 0;
	unsigned long addr = 0;
	unsigned long pgd_next;

	/* callers may not ask for anything above FIXADDR_TOP */
	BUG_ON(limit > FIXADDR_TOP);

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
		pud_t *pud;
		unsigned long pud_limit, pud_next;

		/* end of the range covered by this pgd entry */
		pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);

		if (!pgd_val(*pgd))
			continue;

		pud = pud_offset(pgd, 0);

		if (PTRS_PER_PUD > 1) /* not folded */
			flush |= (*func)(virt_to_page(pud), PT_PUD);

		for (; addr != pud_limit; pud++, addr = pud_next) {
			pmd_t *pmd;
			unsigned long pmd_limit;

			pud_next = pud_addr_end(addr, pud_limit);

			/* the pmd loop ends at whichever comes first: the
			   end of this pud entry or the caller's limit */
			if (pud_next < limit)
				pmd_limit = pud_next;
			else
				pmd_limit = limit;

			if (pud_none(*pud))
				continue;

			pmd = pmd_offset(pud, 0);

			if (PTRS_PER_PMD > 1) /* not folded */
				flush |= (*func)(virt_to_page(pmd), PT_PMD);

			for (; addr != pmd_limit; pmd++) {
				/* step over the range mapped by one pte page */
				addr += (PAGE_SIZE * PTRS_PER_PTE);
				/* past the limit: clamp addr and stop without
				   visiting this pmd.  NOTE(review): the -1 on
				   both sides presumably keeps the comparison
				   right when addr wraps to 0 -- confirm */
				if ((pmd_limit-1) < (addr-1)) {
					addr = pmd_limit;
					break;
				}

				if (pmd_none(*pmd))
					continue;

				flush |= (*func)(pmd_page(*pmd), PT_PTE);
			}
		}
	}

	/* the root page is handed to func last, after every lower level */
	flush |= (*func)(virt_to_page(pgd_base), PT_PGD);

	return flush;
}

/*
 * Take the pte lock belonging to a pagetable page and return it.  The
 * lock is only taken when NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS;
 * otherwise nothing is locked and NULL is returned.
 */
static spinlock_t *lock_pte(struct page *page)
{
#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
	spinlock_t *lock = __pte_lockptr(page);

	spin_lock(lock);
	return lock;
#else
	return NULL;
#endif
}

static void do_unlock(void *v)
{
	spinlock_t *ptl = v;
	spin_unlock(ptl);
}

static void xen_do_pin(unsigned level, unsigned long pfn)
{
	struct mmuext_op *op;
	struct multicall_space mcs;

	mcs = __xen_mc_entry(sizeof(*op));
	op = mcs.args;
	op->cmd = level;
	op->arg1.mfn = pfn_to_mfn(pfn);
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
}

/*
 * pgd_walk callback used when pinning a pagetable.  Sets PG_pinned on
 * the page; for a lowmem page that was not already pinned it also
 * queues a multicall remapping the page's kernel address read-only
 * and, for pte pages, a MMUEXT_PIN_L1_TABLE operation.
 *
 * Returns 1 when a previously unpinned highmem page was found (the
 * caller then needs to flush kmaps), 0 otherwise.
 */
static int pin_page(struct page *page, enum pt_level level)
{
	unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
	int flush;

	if (pgfl)
		flush = 0;		/* already pinned */
	else if (PageHighMem(page))
		/* kmaps need flushing if we found an unpinned
		   highpage */
		flush = 1;
	else {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		/* multicall slot for the read-only remap, reserved up front */
		struct multicall_space mcs = __xen_mc_entry(0);
		spinlock_t *ptl;

		flush = 0;

		/* only pte pages have a pte lock to take */
		ptl = NULL;
		if (level == PT_PTE)
			ptl = lock_pte(page);

		/* remap read-only; a TLB flush is requested only for the pgd */
		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL_RO),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (level == PT_PTE)
			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);

		if (ptl) {
			/* Queue a deferred unlock for when this batch
			   is completed. */
			xen_mc_callback(do_unlock, ptl);
		}
	}

	return flush;
}

/* Called just after a mm has been created, before it has been used.
   Every page of its pagetable is run through pin_page so that it is
   all read-only, and then the pgd itself is pinned. */
void xen_pgd_pin(pgd_t *pgd)
{
#ifdef CONFIG_X86_PAE
	const unsigned pin_cmd = MMUEXT_PIN_L3_TABLE;
#else
	const unsigned pin_cmd = MMUEXT_PIN_L2_TABLE;
#endif
	int need_kmap_flush;

	xen_mc_batch();

	need_kmap_flush = pgd_walk(pgd, pin_page, TASK_SIZE);
	if (need_kmap_flush) {
		/* re-enable interrupts for kmap_flush_unused */
		xen_mc_issue(0);
		kmap_flush_unused();
		xen_mc_batch();
	}

	xen_do_pin(pin_cmd, PFN_DOWN(__pa(pgd)));

	xen_mc_issue(0);
}

/* The init_mm pagetable is really pinned as soon as it's created, but
   that's before we have page structures to store the bits.  So do all
   the book-keeping now.

   pgd_walk callback: just sets the pinned flag on the page.  The level
   argument is ignored and 0 (no flush needed) is always returned. */
static __init int mark_pinned(struct page *page, enum pt_level level)
{
	SetPagePinned(page);
	return 0;
}

/* Walk init_mm's pagetable up to FIXADDR_TOP and flag every page in it
   as pinned, using mark_pinned as the pgd_walk callback. */
void __init xen_mark_init_mm_pinned(void)
{
	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
}

static int unpin_page(struct page *page, enum pt_level level)
{
	unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);

	if (pgfl && !PageHighMem(page)) {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		spinlock_t *ptl = NULL;
		struct multicall_space mcs;

		if (level == PT_PTE) {
			ptl = lock_pte(page);

			xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
		}

		mcs = __xen_mc_entry(0);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (ptl) {
			/* unlock when batch completed */
			xen_mc_callback(do_unlock, ptl);
		}
	}

	return 0;		/* never need to flush on unpin */
}

/* Release a pagetable's pages back as normal RW: in one multicall
   batch, unpin the pgd first and then run unpin_page over every page
   of the pagetable. */
static void xen_pgd_unpin(pgd_t *pgd)
{
	unsigned long pgd_pfn = PFN_DOWN(__pa(pgd));

	xen_mc_batch();
	xen_do_pin(MMUEXT_UNPIN_TABLE, pgd_pfn);
	pgd_walk(pgd, unpin_page, TASK_SIZE);
	xen_mc_issue(0);
}

void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
	spin_lock(&next->page_table_lock);
	xen_pgd_pin(next->pgd);
	spin_unlock(&next->page_table_lock);
}

void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
	spin_lock(&mm->page_table_lock);
	xen_pgd_pin(mm->pgd);
	spin_unlock(&mm->page_table_lock);
}


#ifdef CONFIG_SMP
/* Another cpu may still have their %cr3 pointing at the pagetable, so
   we need to repoint it somewhere else before we can unpin it.

   Cross-cpu callback (handed to xen_smp_call_function_mask by
   drop_mm_ref); info is the mm_struct being torn down. */
static void drop_other_mm_ref(void *info)
{
	struct mm_struct *mm = info;

	/* this cpu's tlbstate still names mm as its active mm */
	if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
		leave_mm(smp_processor_id());

	/* If this cpu still has a stale cr3 reference, then make sure
	   it has been flushed. */
	if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
		load_cr3(swapper_pg_dir);
		arch_flush_lazy_cpu_mode();
	}
}

/* SMP: make sure no cpu is still using mm's pagetable.  The current
   cpu is switched away directly; every cpu in mm->cpu_vm_mask, plus
   any online cpu whose recorded xen_current_cr3 is mm's pgd, is asked
   to run drop_other_mm_ref.
   NOTE(review): the final argument 1 to xen_smp_call_function_mask is
   presumably "wait for completion" -- confirm. */
static void drop_mm_ref(struct mm_struct *mm)
{
	cpumask_t mask;
	unsigned cpu;

	if (current->active_mm == mm) {
		/* mm is our own mm: reload cr3 with the kernel pagetable;
		   otherwise it is only borrowed, so leave it */
		if (current->mm == mm)
			load_cr3(swapper_pg_dir);
		else
			leave_mm(smp_processor_id());
		arch_flush_lazy_cpu_mode();
	}

	/* Get the "official" set of cpus referring to our pagetable. */
	mask = mm->cpu_vm_mask;

	/* It's possible that a vcpu may have a stale reference to our
	   cr3, because it's in lazy mode, and it hasn't flushed its
	   set of pending hypercalls yet.  In this case, we can look
	   at its actual current cr3 value, and force it to flush if
	   needed. */
	for_each_online_cpu(cpu) {
		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
			cpu_set(cpu, mask);
	}

	if (!cpus_empty(mask))
		xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
}
#else
/* Uniprocessor: if the current task's active_mm is mm, load the kernel
   pagetable (swapper_pg_dir) into cr3 instead. */
static void drop_mm_ref(struct mm_struct *mm)
{
	if (current->active_mm == mm)
		load_cr3(swapper_pg_dir);
}
#endif

/*
 * While a process runs, Xen pins its pagetables, which means that the
 * hypervisor forces them to be read-only and controls all updates to
 * them.  Every pagetable update therefore has to go via the
 * hypervisor, which is moderately expensive.
 *
 * Since the pagetable is being pulled down here, we switch to init_mm,
 * unpin the old process pagetable and mark it all read-write, so that
 * further operations on it are simple memory accesses.
 *
 * The only subtle point is that another CPU may still be using the
 * pagetable because of lazy tlb flushing.  This means we need to
 * switch all CPUs off this pagetable before we can unpin it.
 */
void xen_exit_mmap(struct mm_struct *mm)
{
	struct page *pgd_page;

	/* make sure we don't move around while references are dropped */
	get_cpu();
	drop_mm_ref(mm);
	put_cpu();

	spin_lock(&mm->page_table_lock);

	/* pgd may not be pinned in the error exit path of execve */
	pgd_page = virt_to_page(mm->pgd);
	if (PagePinned(pgd_page))
		xen_pgd_unpin(mm->pgd);

	spin_unlock(&mm->page_table_lock);
}
Commit	Line	Data
3b827c1b JF	1	/*
	2	* Xen mmu operations
	3	*
	4	* This file contains the various mmu fetch and update operations.
	5	* The most important job they must perform is the mapping between the
	6	* domain's pfn and the overall machine mfns.
	7	*
	8	* Xen allows guests to directly update the pagetable, in a controlled
	9	* fashion. In other words, the guest modifies the same pagetable
	10	* that the CPU actually uses, which eliminates the overhead of having
	11	* a separate shadow pagetable.
	12	*
	13	* In order to allow this, it falls on the guest domain to map its
	14	* notion of a "physical" pfn - which is just a domain-local linear
	15	* address - into a real "machine address" which the CPU's MMU can
	16	* use.
	17	*
	18	* A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
	19	* inserted directly into the pagetable. When creating a new
	20	* pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
	21	* when reading the content back with __(pgd\|pmd\|pte)_val, it converts
	22	* the mfn back into a pfn.
	23	*
	24	* The other constraint is that all pages which make up a pagetable
	25	* must be mapped read-only in the guest. This prevents uncontrolled
	26	* guest updates to the pagetable. Xen strictly enforces this, and
	27	* will disallow any pagetable update which will end up mapping a
	28	* pagetable page RW, and will disallow using any writable page as a
	29	* pagetable.
	30	*
	31	* Naively, when loading %cr3 with the base of a new pagetable, Xen
	32	* would need to validate the whole pagetable before going on.
	33	* Naturally, this is quite slow. The solution is to "pin" a
	34	* pagetable, which enforces all the constraints on the pagetable even
	35	* when it is not actively in use. This menas that Xen can be assured
	36	* that it is still valid when you do load it into %cr3, and doesn't
	37	* need to revalidate it.
	38	*
	39	* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
	40	*/
f120f13e	41	#include <linux/sched.h>
f4f97b3e	42	#include <linux/highmem.h>
3b827c1b	43	#include <linux/bug.h>
3b827c1b JF	44
	45	#include <asm/pgtable.h>
	46	#include <asm/tlbflush.h>
	47	#include <asm/mmu_context.h>
f4f97b3e	48	#include <asm/paravirt.h>
3b827c1b JF	49
3b827c1b JF	50	#include <asm/xen/hypercall.h>
f4f97b3e	51	#include <asm/xen/hypervisor.h>
3b827c1b JF	52
	53	#include <xen/page.h>
	54	#include <xen/interface/xen.h>
	55
f4f97b3e	56	#include "multicalls.h"
3b827c1b JF	57	#include "mmu.h"
	58
	59	xmaddr_t arbitrary_virt_to_machine(unsigned long address)
	60	{
da7bfc50	61	unsigned int level;
f0646e43	62	pte_t *pte = lookup_address(address, &level);
3b827c1b JF	63	unsigned offset = address & PAGE_MASK;
	64
	65	BUG_ON(pte == NULL);
	66
	67	return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
	68	}
	69
	70	void make_lowmem_page_readonly(void *vaddr)
	71	{
	72	pte_t *pte, ptev;
	73	unsigned long address = (unsigned long)vaddr;
da7bfc50	74	unsigned int level;
3b827c1b	75
f0646e43	76	pte = lookup_address(address, &level);
3b827c1b JF	77	BUG_ON(pte == NULL);
	78
	79	ptev = pte_wrprotect(*pte);
	80
	81	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
	82	BUG();
	83	}
	84
	85	void make_lowmem_page_readwrite(void *vaddr)
	86	{
	87	pte_t *pte, ptev;
	88	unsigned long address = (unsigned long)vaddr;
da7bfc50	89	unsigned int level;
3b827c1b	90
f0646e43	91	pte = lookup_address(address, &level);
3b827c1b JF	92	BUG_ON(pte == NULL);
	93
	94	ptev = pte_mkwrite(*pte);
	95
	96	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
	97	BUG();
	98	}
	99
	100
3b827c1b JF	101	void xen_set_pmd(pmd_t *ptr, pmd_t val)
3b827c1b JF	102	{
d66bf8fc JF	103	struct multicall_space mcs;
d66bf8fc JF	104	struct mmu_update *u;
3b827c1b	105
d66bf8fc JF	106	preempt_disable();
	107
	108	mcs = xen_mc_entry(sizeof(*u));
	109	u = mcs.args;
	110	u->ptr = virt_to_machine(ptr).maddr;
	111	u->val = pmd_val_ma(val);
	112	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
	113
	114	xen_mc_issue(PARAVIRT_LAZY_MMU);
	115
	116	preempt_enable();
3b827c1b JF	117	}
3b827c1b JF	118
3b827c1b JF	119	/*
	120	* Associate a virtual page frame with a given physical page frame
	121	* and protection flags for that frame.
	122	*/
	123	void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
	124	{
	125	pgd_t *pgd;
	126	pud_t *pud;
	127	pmd_t *pmd;
	128	pte_t *pte;
	129
	130	pgd = swapper_pg_dir + pgd_index(vaddr);
	131	if (pgd_none(*pgd)) {
	132	BUG();
	133	return;
	134	}
	135	pud = pud_offset(pgd, vaddr);
	136	if (pud_none(*pud)) {
	137	BUG();
	138	return;
	139	}
	140	pmd = pmd_offset(pud, vaddr);
	141	if (pmd_none(*pmd)) {
	142	BUG();
	143	return;
	144	}
	145	pte = pte_offset_kernel(pmd, vaddr);
	146	/* <mfn,flags> stored as-is, to permit clearing entries */
	147	xen_set_pte(pte, mfn_pte(mfn, flags));
	148
	149	/*
	150	* It's enough to flush this one mapping.
	151	* (PGE mappings get flushed as well)
	152	*/
	153	__flush_tlb_one(vaddr);
	154	}
	155
	156	void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
	157	pte_t *ptep, pte_t pteval)
	158	{
d66bf8fc	159	if (mm == current->mm \|\| mm == &init_mm) {
8965c1c0	160	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
d66bf8fc JF	161	struct multicall_space mcs;
	162	mcs = xen_mc_entry(0);
	163
	164	MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
	165	xen_mc_issue(PARAVIRT_LAZY_MMU);
	166	return;
	167	} else
	168	if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
	169	return;
	170	}
	171	xen_set_pte(ptep, pteval);
3b827c1b JF	172	}
	173
	174	#ifdef CONFIG_X86_PAE
f4f97b3e JF	175	void xen_set_pud(pud_t *ptr, pud_t val)
f4f97b3e JF	176	{
d66bf8fc JF	177	struct multicall_space mcs;
d66bf8fc JF	178	struct mmu_update *u;
f4f97b3e	179
d66bf8fc JF	180	preempt_disable();
	181
	182	mcs = xen_mc_entry(sizeof(*u));
	183	u = mcs.args;
	184	u->ptr = virt_to_machine(ptr).maddr;
	185	u->val = pud_val_ma(val);
	186	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
	187
	188	xen_mc_issue(PARAVIRT_LAZY_MMU);
	189
	190	preempt_enable();
f4f97b3e JF	191	}
	192
	193	void xen_set_pte(pte_t *ptep, pte_t pte)
	194	{
	195	ptep->pte_high = pte.pte_high;
	196	smp_wmb();
	197	ptep->pte_low = pte.pte_low;
	198	}
	199
3b827c1b JF	200	void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
	201	{
	202	set_64bit((u64 *)ptep, pte_val_ma(pte));
	203	}
	204
	205	void xen_pte_clear(struct mm_struct mm, unsigned long addr, pte_t ptep)
	206	{
	207	ptep->pte_low = 0;
	208	smp_wmb(); /* make sure low gets written first */
	209	ptep->pte_high = 0;
	210	}
	211
	212	void xen_pmd_clear(pmd_t *pmdp)
	213	{
	214	xen_set_pmd(pmdp, __pmd(0));
	215	}
	216
abf33038	217	pteval_t xen_pte_val(pte_t pte)
3b827c1b	218	{
430442e3	219	pteval_t ret = pte.pte;
3b827c1b	220
430442e3 JF	221	if (ret & _PAGE_PRESENT)
430442e3 JF	222	ret = machine_to_phys(XMADDR(ret)).paddr \| _PAGE_PRESENT;
3b827c1b JF	223
	224	return ret;
	225	}
	226
abf33038	227	pmdval_t xen_pmd_val(pmd_t pmd)
3b827c1b	228	{
abf33038	229	pmdval_t ret = pmd.pmd;
430442e3 JF	230	if (ret & _PAGE_PRESENT)
430442e3 JF	231	ret = machine_to_phys(XMADDR(ret)).paddr \| _PAGE_PRESENT;
3b827c1b JF	232	return ret;
	233	}
	234
abf33038	235	pgdval_t xen_pgd_val(pgd_t pgd)
3b827c1b	236	{
abf33038	237	pgdval_t ret = pgd.pgd;
430442e3 JF	238	if (ret & _PAGE_PRESENT)
430442e3 JF	239	ret = machine_to_phys(XMADDR(ret)).paddr \| _PAGE_PRESENT;
3b827c1b JF	240	return ret;
	241	}
	242
abf33038	243	pte_t xen_make_pte(pteval_t pte)
3b827c1b	244	{
a89780f3	245	if (pte & _PAGE_PRESENT) {
3b827c1b	246	pte = phys_to_machine(XPADDR(pte)).maddr;
a89780f3 JF	247	pte &= ~(_PAGE_PCD \| _PAGE_PWT);
a89780f3 JF	248	}
3b827c1b	249
c8e5393a	250	return (pte_t){ .pte = pte };
3b827c1b JF	251	}
3b827c1b JF	252
abf33038	253	pmd_t xen_make_pmd(pmdval_t pmd)
3b827c1b	254	{
430442e3	255	if (pmd & _PAGE_PRESENT)
3b827c1b JF	256	pmd = phys_to_machine(XPADDR(pmd)).maddr;
	257
	258	return (pmd_t){ pmd };
	259	}
	260
abf33038	261	pgd_t xen_make_pgd(pgdval_t pgd)
3b827c1b JF	262	{
	263	if (pgd & _PAGE_PRESENT)
	264	pgd = phys_to_machine(XPADDR(pgd)).maddr;
	265
	266	return (pgd_t){ pgd };
	267	}
	268	#else /* !PAE */
f4f97b3e JF	269	void xen_set_pte(pte_t *ptep, pte_t pte)
	270	{
	271	*ptep = pte;
	272	}
	273
abf33038	274	pteval_t xen_pte_val(pte_t pte)
3b827c1b	275	{
430442e3	276	pteval_t ret = pte.pte;
3b827c1b JF	277
	278	if (ret & _PAGE_PRESENT)
	279	ret = machine_to_phys(XMADDR(ret)).paddr;
	280
	281	return ret;
	282	}
	283
abf33038	284	pgdval_t xen_pgd_val(pgd_t pgd)
3b827c1b	285	{
abf33038	286	pteval_t ret = pgd.pgd;
430442e3 JF	287	if (ret & _PAGE_PRESENT)
430442e3 JF	288	ret = machine_to_phys(XMADDR(ret)).paddr \| _PAGE_PRESENT;
3b827c1b JF	289	return ret;
	290	}
	291
abf33038	292	pte_t xen_make_pte(pteval_t pte)
3b827c1b	293	{
a89780f3	294	if (pte & _PAGE_PRESENT) {
3b827c1b	295	pte = phys_to_machine(XPADDR(pte)).maddr;
a89780f3 JF	296	pte &= ~(_PAGE_PCD \| _PAGE_PWT);
a89780f3 JF	297	}
2c80b01b	298
3b827c1b JF	299	return (pte_t){ pte };
	300	}
	301
abf33038	302	pgd_t xen_make_pgd(pgdval_t pgd)
3b827c1b JF	303	{
	304	if (pgd & _PAGE_PRESENT)
	305	pgd = phys_to_machine(XPADDR(pgd)).maddr;
	306
	307	return (pgd_t){ pgd };
	308	}
	309	#endif /* CONFIG_X86_PAE */
	310
f4f97b3e JF	311	/*
	312	(Yet another) pagetable walker. This one is intended for pinning a
	313	pagetable. This means that it walks a pagetable and calls the
	314	callback function on each page it finds making up the page table,
	315	at every level. It walks the entire pagetable, but it only bothers
	316	pinning pte pages which are below pte_limit. In the normal case
	317	this will be TASK_SIZE, but at boot we need to pin up to
	318	FIXADDR_TOP. But the important bit is that we don't pin beyond
	319	there, because then we start getting into Xen's ptes.
	320	*/
74260714	321	static int pgd_walk(pgd_t pgd_base, int (func)(struct page *, enum pt_level),
f4f97b3e	322	unsigned long limit)
3b827c1b JF	323	{
3b827c1b JF	324	pgd_t *pgd = pgd_base;
f4f97b3e JF	325	int flush = 0;
	326	unsigned long addr = 0;
	327	unsigned long pgd_next;
	328
	329	BUG_ON(limit > FIXADDR_TOP);
3b827c1b JF	330
3b827c1b JF	331	if (xen_feature(XENFEAT_auto_translated_physmap))
f4f97b3e JF	332	return 0;
	333
	334	for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
	335	pud_t *pud;
	336	unsigned long pud_limit, pud_next;
3b827c1b	337
f4f97b3e JF	338	pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
	339
	340	if (!pgd_val(*pgd))
3b827c1b	341	continue;
f4f97b3e	342
3b827c1b JF	343	pud = pud_offset(pgd, 0);
	344
	345	if (PTRS_PER_PUD > 1) /* not folded */
74260714	346	flush \|= (*func)(virt_to_page(pud), PT_PUD);
f4f97b3e JF	347
	348	for (; addr != pud_limit; pud++, addr = pud_next) {
	349	pmd_t *pmd;
	350	unsigned long pmd_limit;
	351
	352	pud_next = pud_addr_end(addr, pud_limit);
	353
	354	if (pud_next < limit)
	355	pmd_limit = pud_next;
	356	else
	357	pmd_limit = limit;
3b827c1b	358
3b827c1b JF	359	if (pud_none(*pud))
3b827c1b JF	360	continue;
f4f97b3e	361
3b827c1b JF	362	pmd = pmd_offset(pud, 0);
	363
	364	if (PTRS_PER_PMD > 1) /* not folded */
74260714	365	flush \|= (*func)(virt_to_page(pmd), PT_PMD);
f4f97b3e JF	366
	367	for (; addr != pmd_limit; pmd++) {
	368	addr += (PAGE_SIZE * PTRS_PER_PTE);
	369	if ((pmd_limit-1) < (addr-1)) {
	370	addr = pmd_limit;
	371	break;
	372	}
3b827c1b	373
3b827c1b JF	374	if (pmd_none(*pmd))
	375	continue;
	376
74260714	377	flush \|= (func)(pmd_page(pmd), PT_PTE);
3b827c1b JF	378	}
	379	}
	380	}
	381
74260714	382	flush \|= (*func)(virt_to_page(pgd_base), PT_PGD);
f4f97b3e JF	383
f4f97b3e JF	384	return flush;
3b827c1b JF	385	}
3b827c1b JF	386
74260714 JF	387	static spinlock_t lock_pte(struct page page)
	388	{
	389	spinlock_t *ptl = NULL;
	390
	391	#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
	392	ptl = __pte_lockptr(page);
	393	spin_lock(ptl);
	394	#endif
	395
	396	return ptl;
	397	}
	398
	399	static void do_unlock(void *v)
	400	{
	401	spinlock_t *ptl = v;
	402	spin_unlock(ptl);
	403	}
	404
	405	static void xen_do_pin(unsigned level, unsigned long pfn)
	406	{
	407	struct mmuext_op *op;
	408	struct multicall_space mcs;
	409
	410	mcs = __xen_mc_entry(sizeof(*op));
	411	op = mcs.args;
	412	op->cmd = level;
	413	op->arg1.mfn = pfn_to_mfn(pfn);
	414	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
	415	}
	416
	417	static int pin_page(struct page *page, enum pt_level level)
f4f97b3e JF	418	{
	419	unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
	420	int flush;
	421
	422	if (pgfl)
	423	flush = 0; /* already pinned */
	424	else if (PageHighMem(page))
	425	/* kmaps need flushing if we found an unpinned
	426	highpage */
	427	flush = 1;
	428	else {
	429	void *pt = lowmem_page_address(page);
	430	unsigned long pfn = page_to_pfn(page);
	431	struct multicall_space mcs = __xen_mc_entry(0);
74260714	432	spinlock_t *ptl;
f4f97b3e JF	433
	434	flush = 0;
	435
74260714 JF	436	ptl = NULL;
	437	if (level == PT_PTE)
	438	ptl = lock_pte(page);
	439
f4f97b3e JF	440	MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
f4f97b3e JF	441	pfn_pte(pfn, PAGE_KERNEL_RO),
74260714 JF	442	level == PT_PGD ? UVMF_TLB_FLUSH : 0);
	443
	444	if (level == PT_PTE)
	445	xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
	446
	447	if (ptl) {
	448	/* Queue a deferred unlock for when this batch
	449	is completed. */
	450	xen_mc_callback(do_unlock, ptl);
	451	}
f4f97b3e JF	452	}
	453
	454	return flush;
	455	}
3b827c1b	456
f4f97b3e JF	457	/* This is called just after a mm has been created, but it has not
	458	been used yet. We need to make sure that its pagetable is all
	459	read-only, and can be pinned. */
3b827c1b JF	460	void xen_pgd_pin(pgd_t *pgd)
3b827c1b JF	461	{
74260714	462	unsigned level;
3b827c1b	463
f4f97b3e	464	xen_mc_batch();
3b827c1b	465
f87e4cac JF	466	if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
	467	/* re-enable interrupts for kmap_flush_unused */
	468	xen_mc_issue(0);
f4f97b3e	469	kmap_flush_unused();
f87e4cac JF	470	xen_mc_batch();
f87e4cac JF	471	}
f4f97b3e	472
f4f97b3e	473	#ifdef CONFIG_X86_PAE
74260714	474	level = MMUEXT_PIN_L3_TABLE;
3b827c1b	475	#else
74260714	476	level = MMUEXT_PIN_L2_TABLE;
3b827c1b	477	#endif
74260714 JF	478
74260714 JF	479	xen_do_pin(level, PFN_DOWN(__pa(pgd)));
f4f97b3e JF	480
f4f97b3e JF	481	xen_mc_issue(0);
3b827c1b JF	482	}
3b827c1b JF	483
f4f97b3e JF	484	/* The init_mm pagetable is really pinned as soon as its created, but
	485	that's before we have page structures to store the bits. So do all
	486	the book-keeping now. */
74260714	487	static __init int mark_pinned(struct page *page, enum pt_level level)
3b827c1b	488	{
f4f97b3e JF	489	SetPagePinned(page);
	490	return 0;
	491	}
3b827c1b	492
f4f97b3e JF	493	void __init xen_mark_init_mm_pinned(void)
	494	{
	495	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
	496	}
3b827c1b	497
74260714	498	static int unpin_page(struct page *page, enum pt_level level)
f4f97b3e JF	499	{
f4f97b3e JF	500	unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
3b827c1b	501
f4f97b3e JF	502	if (pgfl && !PageHighMem(page)) {
	503	void *pt = lowmem_page_address(page);
	504	unsigned long pfn = page_to_pfn(page);
74260714 JF	505	spinlock_t *ptl = NULL;
	506	struct multicall_space mcs;
	507
	508	if (level == PT_PTE) {
	509	ptl = lock_pte(page);
	510
	511	xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
	512	}
	513
	514	mcs = __xen_mc_entry(0);
f4f97b3e JF	515
	516	MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
	517	pfn_pte(pfn, PAGE_KERNEL),
74260714 JF	518	level == PT_PGD ? UVMF_TLB_FLUSH : 0);
	519
	520	if (ptl) {
	521	/* unlock when batch completed */
	522	xen_mc_callback(do_unlock, ptl);
	523	}
f4f97b3e JF	524	}
	525
	526	return 0; /* never need to flush on unpin */
3b827c1b JF	527	}
3b827c1b JF	528
f4f97b3e JF	529	/* Release a pagetables pages back as normal RW */
	530	static void xen_pgd_unpin(pgd_t *pgd)
	531	{
f4f97b3e JF	532	xen_mc_batch();
f4f97b3e JF	533
74260714	534	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
f4f97b3e JF	535
	536	pgd_walk(pgd, unpin_page, TASK_SIZE);
	537
	538	xen_mc_issue(0);
	539	}
3b827c1b JF	540
	541	void xen_activate_mm(struct mm_struct prev, struct mm_struct next)
	542	{
f4f97b3e	543	spin_lock(&next->page_table_lock);
3b827c1b	544	xen_pgd_pin(next->pgd);
f4f97b3e	545	spin_unlock(&next->page_table_lock);
3b827c1b JF	546	}
	547
	548	void xen_dup_mmap(struct mm_struct oldmm, struct mm_struct mm)
	549	{
f4f97b3e	550	spin_lock(&mm->page_table_lock);
3b827c1b	551	xen_pgd_pin(mm->pgd);
f4f97b3e	552	spin_unlock(&mm->page_table_lock);
3b827c1b JF	553	}
3b827c1b JF	554
3b827c1b	555
f87e4cac JF	556	#ifdef CONFIG_SMP
	557	/* Another cpu may still have their %cr3 pointing at the pagetable, so
	558	we need to repoint it somewhere else before we can unpin it. */
	559	static void drop_other_mm_ref(void *info)
	560	{
	561	struct mm_struct *mm = info;
3b827c1b	562
f87e4cac JF	563	if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
f87e4cac JF	564	leave_mm(smp_processor_id());
9f79991d JF	565
	566	/* If this cpu still has a stale cr3 reference, then make sure
	567	it has been flushed. */
	568	if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
	569	load_cr3(swapper_pg_dir);
	570	arch_flush_lazy_cpu_mode();
	571	}
f87e4cac	572	}
3b827c1b	573
f87e4cac JF	574	static void drop_mm_ref(struct mm_struct *mm)
f87e4cac JF	575	{
9f79991d JF	576	cpumask_t mask;
	577	unsigned cpu;
	578
f87e4cac JF	579	if (current->active_mm == mm) {
	580	if (current->mm == mm)
	581	load_cr3(swapper_pg_dir);
	582	else
	583	leave_mm(smp_processor_id());
9f79991d JF	584	arch_flush_lazy_cpu_mode();
	585	}
	586
	587	/* Get the "official" set of cpus referring to our pagetable. */
	588	mask = mm->cpu_vm_mask;
	589
	590	/* It's possible that a vcpu may have a stale reference to our
	591	cr3, because its in lazy mode, and it hasn't yet flushed
	592	its set of pending hypercalls yet. In this case, we can
	593	look at its actual current cr3 value, and force it to flush
	594	if needed. */
	595	for_each_online_cpu(cpu) {
	596	if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
	597	cpu_set(cpu, mask);
3b827c1b JF	598	}
3b827c1b JF	599
9f79991d JF	600	if (!cpus_empty(mask))
9f79991d JF	601	xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
f87e4cac JF	602	}
	603	#else
	604	static void drop_mm_ref(struct mm_struct *mm)
	605	{
	606	if (current->active_mm == mm)
	607	load_cr3(swapper_pg_dir);
	608	}
	609	#endif
	610
	611	/*
	612	* While a process runs, Xen pins its pagetables, which means that the
	613	* hypervisor forces it to be read-only, and it controls all updates
	614	* to it. This means that all pagetable updates have to go via the
	615	* hypervisor, which is moderately expensive.
	616	*
	617	* Since we're pulling the pagetable down, we switch to use init_mm,
	618	* unpin old process pagetable and mark it all read-write, which
	619	* allows further operations on it to be simple memory accesses.
	620	*
	621	* The only subtle point is that another CPU may be still using the
	622	* pagetable because of lazy tlb flushing. This means we need need to
	623	* switch all CPUs off this pagetable before we can unpin it.
	624	*/
	625	void xen_exit_mmap(struct mm_struct *mm)
	626	{
	627	get_cpu(); /* make sure we don't move around */
	628	drop_mm_ref(mm);
	629	put_cpu();
3b827c1b	630
f120f13e	631	spin_lock(&mm->page_table_lock);
df912ea4 JF	632
	633	/* pgd may not be pinned in the error exit path of execve */
	634	if (PagePinned(virt_to_page(mm->pgd)))
	635	xen_pgd_unpin(mm->pgd);
74260714	636
f120f13e	637	spin_unlock(&mm->page_table_lock);
3b827c1b	638	}